diff --git a/Extensions/DirectXMathAVX.h b/Extensions/DirectXMathAVX.h
index ee891d8..d4ae467 100644
--- a/Extensions/DirectXMathAVX.h
+++ b/Extensions/DirectXMathAVX.h
@@ -1,289 +1,289 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error AVX not supported on ARM platform
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-#error AVX intrinsics requires Visual C++ 2010 Service Pack 1 or later.
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <immintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace AVX
-{
-
-inline bool XMVerifyAVXSupport()
-{
-    // Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for AVX, OSXSAVE, SSE4.1, and SSE3
-    return ( (CPUInfo[2] & 0x18080001) == 0x18080001 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
-{
-    return _mm_broadcast_ss( pValue );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(0, 0, 0, 0) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
-{
-    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-
-    unsigned int elem[4] = { E0, E1, E2, E3 };
-    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-    return _mm_permutevar_ps( V, vControl );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
-{
-    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-
-    static const XMVECTORU32 three = { 3, 3, 3, 3 };
-
-    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
-    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-
-    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
-    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
-
-    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
-    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
-
-    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
-    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
-
-    return _mm_or_ps( masked1, masked2 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Permute Templates
-//-------------------------------------------------------------------------------------
-
-namespace Internal
-{
-    // Slow path fallback for permutes that do not map to a single SSE opcode.
-    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
-        {
-            static const XMVECTORU32 selectMask =
-            {
-                WhichX ? 0xFFFFFFFF : 0,
-                WhichY ? 0xFFFFFFFF : 0,
-                WhichZ ? 0xFFFFFFFF : 0,
-                WhichW ? 0xFFFFFFFF : 0,
-            };
-
-            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
-            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
-
-            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-            return _mm_or_ps(masked1, masked2);
-        }
-    };
-
-    // Fast path for permutes that only read from the first vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
-    };
-
-    // Fast path for permutes that only read from the second vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v1); return _mm_permute_ps(v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the first vector, ZW from the second.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the second vector, ZW from the first.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
-    };
-};
-
-// General permute template
-template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
-    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    const bool WhichX = PermuteX > 3;
-    const bool WhichY = PermuteY > 3;
-    const bool WhichZ = PermuteZ > 3;
-    const bool WhichW = PermuteW > 3;
-
-    return AVX::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
-}
-
-// Special-case permute templates
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
-
-
-//-------------------------------------------------------------------------------------
-// Swizzle Templates
-//-------------------------------------------------------------------------------------
-
-// General swizzle template
-template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
-    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
-{
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-    return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
-}
-
-// Specialized swizzles
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
-
-
-//-------------------------------------------------------------------------------------
-// Other Templates
-//-------------------------------------------------------------------------------------
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX::XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-}; // namespace AVX
-
-}; // namespace DirectX;
+//-------------------------------------------------------------------------------------
+// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#ifdef _M_ARM
+#error AVX not supported on ARM platform
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#error AVX intrinsics requires Visual C++ 2010 Service Pack 1 or later.
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <immintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace AVX
+{
+
+inline bool XMVerifyAVXSupport()
+{
+    // Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for AVX, OSXSAVE, SSE4.1, and SSE3
+    return ( (CPUInfo[2] & 0x18080001) == 0x18080001 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
+{
+    return _mm_broadcast_ss( pValue );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(0, 0, 0, 0) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
+{
+    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+    return _mm_permutevar_ps( V, vControl );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
+{
+    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+
+    static const XMVECTORU32 three = { 3, 3, 3, 3 };
+
+    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+
+    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
+    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
+
+    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
+    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
+
+    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
+    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
+
+    return _mm_or_ps( masked1, masked2 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Permute Templates
+//-------------------------------------------------------------------------------------
+
+namespace Internal
+{
+    // Slow path fallback for permutes that do not map to a single SSE opcode.
+    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
+        {
+            static const XMVECTORU32 selectMask =
+            {
+                WhichX ? 0xFFFFFFFF : 0,
+                WhichY ? 0xFFFFFFFF : 0,
+                WhichZ ? 0xFFFFFFFF : 0,
+                WhichW ? 0xFFFFFFFF : 0,
+            };
+
+            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
+            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
+
+            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+            return _mm_or_ps(masked1, masked2);
+        }
+    };
+
+    // Fast path for permutes that only read from the first vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
+    };
+
+    // Fast path for permutes that only read from the second vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v1); return _mm_permute_ps(v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the first vector, ZW from the second.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the second vector, ZW from the first.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
+    };
+};
+
+// General permute template
+template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
+    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
+{
+    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+
+    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+    const bool WhichX = PermuteX > 3;
+    const bool WhichY = PermuteY > 3;
+    const bool WhichZ = PermuteZ > 3;
+    const bool WhichW = PermuteW > 3;
+
+    return AVX::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+}
+
+// Special-case permute templates
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
+
+
+//-------------------------------------------------------------------------------------
+// Swizzle Templates
+//-------------------------------------------------------------------------------------
+
+// General swizzle template
+template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
+    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
+{
+    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
+    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
+    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
+    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
+
+    return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
+}
+
+// Specialized swizzles
+template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
+
+
+//-------------------------------------------------------------------------------------
+// Other Templates
+//-------------------------------------------------------------------------------------
+
+template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
+{
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+    return AVX::XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
+}
+
+template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
+{
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+    return AVX::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
+}
+
+template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
+{
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+    return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
+}
+
+}; // namespace AVX
+
+}; // namespace DirectX;
diff --git a/Extensions/DirectXMathAVX2.h b/Extensions/DirectXMathAVX2.h
index f968b8b..c3cdae2 100644
--- a/Extensions/DirectXMathAVX2.h
+++ b/Extensions/DirectXMathAVX2.h
@@ -1,972 +1,972 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathAVX2.h -- AVX2 extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error AVX2 not supported on ARM platform
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1700)
-#error AVX2 intrinsics requires Visual C++ 2012 or later.
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <immintrin.h>
-
-#include <DirectXMath.h>
-#include <DirectXPackedVector.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace AVX2
-{
-
-inline bool XMVerifyAVX2Support()
-{
-    // Should return true for AMD "Excavator", Intel "Haswell" or later processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 7 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for F16C, FMA3, AVX, OSXSAVE, SSE4.1, and SSE3
-    if ( (CPUInfo[2] & 0x38081001) != 0x38081001 )
-        return false;
-
-    __cpuidex(CPUInfo, 7, 0);
-
-    return ( (CPUInfo[1] & 0x20 ) == 0x20 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
-{
-    return _mm_broadcast_ss( pValue );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
-{
-    return _mm_broadcastss_ps( V );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fmadd_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fnmadd_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
-{
-    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-
-    unsigned int elem[4] = { E0, E1, E2, E3 };
-    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-    return _mm_permutevar_ps( V, vControl );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
-{
-    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-
-    static const XMVECTORU32 three = { 3, 3, 3, 3 };
-
-    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
-    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-
-    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
-    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
-
-    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
-    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
-
-    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
-    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
-
-    return _mm_or_ps( masked1, masked2 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX2::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX2::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX2::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_mul_ps( vResult, M.r[1] );
-    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector3
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_mul_ps( vResult, M.r[2] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2);
-
-inline XMVECTOR XM_CALLCONV XMVector3Project
-(
-    FXMVECTOR V,
-    float ViewportX,
-    float ViewportY,
-    float ViewportWidth,
-    float ViewportHeight,
-    float ViewportMinZ,
-    float ViewportMaxZ,
-    CXMMATRIX Projection,
-    CXMMATRIX View,
-    CXMMATRIX World
-)
-{
-    const float HalfViewportWidth = ViewportWidth * 0.5f;
-    const float HalfViewportHeight = ViewportHeight * 0.5f;
-
-    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
-    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
-
-    XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View);
-    Transform = AVX2::XMMatrixMultiply(Transform, Projection);
-
-    XMVECTOR Result = AVX2::XMVector3TransformCoord(V, Transform);
-
-    Result = AVX2::XMVectorMultiplyAdd(Result, Scale, Offset);
-
-    return Result;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3Unproject
-(
-    FXMVECTOR V,
-    float ViewportX,
-    float ViewportY,
-    float ViewportWidth,
-    float ViewportHeight,
-    float ViewportMinZ,
-    float ViewportMaxZ,
-    CXMMATRIX Projection,
-    CXMMATRIX View,
-    CXMMATRIX World
-)
-{
-    static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
-
-    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
-    Scale = XMVectorReciprocal(Scale);
-
-    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
-    Offset = AVX2::XMVectorMultiplyAdd(Scale, Offset, D.v);
-
-    XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View);
-    Transform = AVX2::XMMatrixMultiply(Transform, Projection);
-    Transform = XMMatrixInverse(nullptr, Transform);
-
-    XMVECTOR Result = AVX2::XMVectorMultiplyAdd(V, Scale, Offset);
-
-    return AVX2::XMVector3TransformCoord(Result, Transform);
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector4
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W
-    vResult = _mm_mul_ps( vResult, M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult );
-    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Matrix
-//-------------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixMultiply
-(
-    CXMMATRIX M1,
-    CXMMATRIX M2
-)
-{
-    XMMATRIX mResult;
-    // Use vW to hold the original row
-    XMVECTOR vW = M1.r[0];
-    // Splat the component X,Y,Z then W
-    XMVECTOR vX = _mm_broadcastss_ps(vW);
-    XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    // Perform the operation on the first row
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[0] = vX;
-    // Repeat for the other 3 rows
-    vW = M1.r[1];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[1] = vX;
-    vW = M1.r[2];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[2] = vX;
-    vW = M1.r[3];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[3] = vX;
-    return mResult;
-}
-
-inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose
-(
-    FXMMATRIX M1,
-    CXMMATRIX M2
-)
-{
-    // Use vW to hold the original row
-    XMVECTOR vW = M1.r[0];
-    // Splat the component X,Y,Z then W
-    XMVECTOR vX = _mm_broadcastss_ps(vW);
-    XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    // Perform the operation on the first row
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r0 = vX;
-    // Repeat for the other 3 rows
-    vW = M1.r[1];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r1 = vX;
-    vW = M1.r[2];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r2 = vX;
-    vW = M1.r[3];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r3 = vX;
-
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2));
-
-    XMMATRIX mResult;
-    // x.x,y.x,z.x,w.x
-    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0));
-    // x.y,y.y,z.y,w.y
-    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1));
-    // x.z,y.z,z.z,w.z
-    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0));
-    // x.w,y.w,z.w,w.w
-    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1));
-    return mResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Permute Templates
-//-------------------------------------------------------------------------------------
-
-namespace Internal
-{
-    // Slow path fallback for permutes that do not map to a single SSE opcode.
-    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
-        {
-            static const XMVECTORU32 selectMask =
-            {
-                WhichX ? 0xFFFFFFFF : 0,
-                WhichY ? 0xFFFFFFFF : 0,
-                WhichZ ? 0xFFFFFFFF : 0,
-                WhichW ? 0xFFFFFFFF : 0,
-            };
-
-            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
-            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
-
-            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-            return _mm_or_ps(masked1, masked2);
-        }
-    };
-
-    // Fast path for permutes that only read from the first vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
-    };
-
-    // Fast path for permutes that only read from the second vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v1); return _mm_permute_ps(v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the first vector, ZW from the second.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the second vector, ZW from the first.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
-    };
-};
-
-// General permute template
-template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
-    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    const bool WhichX = PermuteX > 3;
-    const bool WhichY = PermuteY > 3;
-    const bool WhichZ = PermuteZ > 3;
-    const bool WhichW = PermuteW > 3;
-
-    return AVX2::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
-}
-
-// Special-case permute templates
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
-
-
-//-------------------------------------------------------------------------------------
-// Swizzle Templates
-//-------------------------------------------------------------------------------------
-
-// General swizzle template
-template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
-    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
-{
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-    return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
-}
-
-// Specialized swizzles
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
-
-
-//-------------------------------------------------------------------------------------
-// Other Templates
-//-------------------------------------------------------------------------------------
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX2::XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX2::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX2::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-//-------------------------------------------------------------------------------------
-// Data conversion
-//-------------------------------------------------------------------------------------
-
-inline float XMConvertHalfToFloat( PackedVector::HALF Value )
-{
-    __m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
-    __m128 V2 = _mm_cvtph_ps( V1 );
-    return _mm_cvtss_f32( V2 );
-}
-
-inline PackedVector::HALF XMConvertFloatToHalf( float Value )
-{
-    __m128 V1 = _mm_set_ss( Value );
-    __m128i V2 = _mm_cvtps_ph( V1, 0 );
-    return static_cast<PackedVector::HALF>( _mm_cvtsi128_si32(V2) );
-}
-
-inline float* XMConvertHalfToFloatStream
-(
-    _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream,
-    _In_ size_t InputStride,
-    _In_ size_t HalfCount
-)
-{
-    using namespace PackedVector;
-
-    assert(pOutputStream);
-    assert(pInputStream);
-    const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
-    uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = HalfCount >> 2;
-    if ( four > 0 )
-    {
-        if (InputStride == sizeof(HALF))
-        {
-            if (OutputStride == sizeof(float))
-            {
-                if ( ((uintptr_t)pFloat & 0xF) == 0)
-                {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
-                        pHalf += InputStride*4;
-
-                        __m128 FV = _mm_cvtph_ps( HV );
-
-                        _mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
-                        pFloat += OutputStride*4;
-                        i += 4;
-                    }
-                }
-                else
-                {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
-                        pHalf += InputStride*4;
-
-                        __m128 FV = _mm_cvtph_ps( HV );
-
-                        _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
-                        pFloat += OutputStride*4;
-                        i += 4;
-                    }
-                }
-            }
-            else
-            {
-                // Packed input, scattered output
-                for (size_t j = 0; j < four; ++j)
-                {
-                    __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
-                    pHalf += InputStride*4;
-
-                    __m128 FV = _mm_cvtph_ps( HV );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
-                    pFloat += OutputStride;
-                    i += 4;
-                }
-            }
-        }
-        else if (OutputStride == sizeof(float))
-        {
-            if ( ((uintptr_t)pFloat & 0xF) == 0)
-            {
-                // Scattered input, aligned & packed output
-                for (size_t j = 0; j < four; ++j)
-                {
-                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-
-                    __m128i HV = _mm_setzero_si128();
-                    HV = _mm_insert_epi16( HV, H1, 0 );
-                    HV = _mm_insert_epi16( HV, H2, 1 );
-                    HV = _mm_insert_epi16( HV, H3, 2 );
-                    HV = _mm_insert_epi16( HV, H4, 3 );
-                    __m128 FV = _mm_cvtph_ps( HV );
-
-                    _mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
-                    pFloat += OutputStride*4;
-                    i += 4;
-                }
-            }
-            else
-            {
-                // Scattered input, packed output
-                for (size_t j = 0; j < four; ++j)
-                {
-                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-
-                    __m128i HV = _mm_setzero_si128();
-                    HV = _mm_insert_epi16( HV, H1, 0 );
-                    HV = _mm_insert_epi16( HV, H2, 1 );
-                    HV = _mm_insert_epi16( HV, H3, 2 );
-                    HV = _mm_insert_epi16( HV, H4, 3 );
-                    __m128 FV = _mm_cvtph_ps( HV );
-
-                    _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
-                    pFloat += OutputStride*4;
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    for (; i < HalfCount; ++i)
-    {
-        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
-        pHalf += InputStride;
-        pFloat += OutputStride;
-    }
-
-    return pOutputStream;
-}
-
-
-inline PackedVector::HALF* XMConvertFloatToHalfStream
-(
-    _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream,
-    _In_ size_t InputStride,
-    _In_ size_t FloatCount
-)
-{
-    using namespace PackedVector;
-
-    assert(pOutputStream);
-    assert(pInputStream);
-    const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
-    uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = FloatCount >> 2;
-    if (four > 0)
-    {
-        if (InputStride == sizeof(float))
-        {
-            if (OutputStride == sizeof(HALF))
-            {
-                if ( ((uintptr_t)pFloat & 0xF) == 0)
-                {
-                    // Aligned and packed input, packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
-                        pHalf += OutputStride*4;
-                        i += 4;
-                    }
-                }
-                else
-                {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
-                        pHalf += OutputStride*4;
-                        i += 4;
-                    }
-                }
-            }
-            else
-            {
-                if ( ((uintptr_t)pFloat & 0xF) == 0)
-                {
-                    // Aligned & packed input, scattered output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
-                        pHalf += OutputStride;
-                        i += 4;
-                    }
-                }
-                else
-                {
-                    // Packed input, scattered output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
-                        pHalf += OutputStride;
-                        i += 4;
-                    }
-                }
-            }
-        }
-        else if (OutputStride == sizeof(HALF))
-        {
-            // Scattered input, packed output
-            for (size_t j = 0; j < four; ++j)
-            {
-                __m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
-                __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
-                FV = _mm_blend_ps( FV, FT, 0xC );
-
-                __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
-                pHalf += OutputStride*4;
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < FloatCount; ++i)
-    {
-        *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
-        pFloat += InputStride;
-        pHalf += OutputStride;
-    }
-
-    return pOutputStream;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Half2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource )
-{
-    assert(pSource);
-    __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
-    return _mm_cvtph_ps( _mm_castps_si128( V ) );
-}
-
-inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V )
-{
-    assert(pDestination);
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Half4
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource )
-{
-    assert(pSource);
-    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
-    return _mm_cvtph_ps( V );
-}
-
-inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V )
-{
-    assert(pDestination);
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
-}
-
-}; // namespace AVX2
-
-}; // namespace DirectX;
+//-------------------------------------------------------------------------------------
+// DirectXMathAVX2.h -- AVX2 extensions for SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#ifdef _M_ARM
+#error AVX2 not supported on ARM platform
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1700)
+#error AVX2 intrinsics requires Visual C++ 2012 or later.
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <immintrin.h>
+
+#include <DirectXMath.h>
+#include <DirectXPackedVector.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace AVX2
+{
+
+inline bool XMVerifyAVX2Support()
+{
+    // Should return true for AMD "Excavator", Intel "Haswell" or later processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 7 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for F16C, FMA3, AVX, OSXSAVE, SSE4.1, and SSE3
+    if ( (CPUInfo[2] & 0x38081001) != 0x38081001 )
+        return false;
+
+    __cpuidex(CPUInfo, 7, 0);
+
+    return ( (CPUInfo[1] & 0x20 ) == 0x20 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
+{
+    return _mm_broadcast_ss( pValue );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
+{
+    return _mm_broadcastss_ps( V );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fmadd_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fnmadd_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
+{
+    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+    return _mm_permutevar_ps( V, vControl );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
+{
+    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+
+    static const XMVECTORU32 three = { 3, 3, 3, 3 };
+
+    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+
+    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
+    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
+
+    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
+    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
+
+    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
+    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
+
+    return _mm_or_ps( masked1, masked2 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX2::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX2::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX2::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_mul_ps( vResult, M.r[1] );
+    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_mul_ps( vResult, M.r[2] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_broadcastss_ps(V); // X
+    vResult =
_mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = AVX2::XMVector3TransformCoord(V, Transform); + + Result = AVX2::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = AVX2::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = AVX2::XMVectorMultiplyAdd(V, Scale, Offset); + + return AVX2::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = 
_mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + + +//------------------------------------------------------------------------------------- +// Permute 
Templates
+//-------------------------------------------------------------------------------------
+
+namespace Internal
+{
+    // Slow path fallback for permutes that do not map to a single SSE opcode.
+    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
+        {
+            static const XMVECTORU32 selectMask =
+            {
+                WhichX ? 0xFFFFFFFF : 0,
+                WhichY ? 0xFFFFFFFF : 0,
+                WhichZ ? 0xFFFFFFFF : 0,
+                WhichW ? 0xFFFFFFFF : 0,
+            };
+
+            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
+            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
+
+            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+            return _mm_or_ps(masked1, masked2);
+        }
+    };
+
+    // Fast path for permutes that only read from the first vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
+    };
+
+    // Fast path for permutes that only read from the second vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_permute_ps(v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the first vector, ZW from the second.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the second vector, ZW from the first.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
+    };
+};
+
+// General permute template
+template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
+    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
+{
+    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+
+    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+    const bool WhichX = PermuteX > 3;
+    const bool WhichY = PermuteY > 3;
+    const bool WhichZ = PermuteZ > 3;
+    const bool WhichW = PermuteW > 3;
+
+    return AVX2::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+}
+
+// Special-case permute templates
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } + + +//------------------------------------------------------------------------------------- +// Swizzle Templates +//------------------------------------------------------------------------------------- + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + + return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } + + +//------------------------------------------------------------------------------------- +// Other Templates +//------------------------------------------------------------------------------------- + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* 
XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pHalf = reinterpret_cast(pInputStream); + uint8_t* pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_store_ss( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + 
_Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pFloat = reinterpret_cast(pInputStream); + uint8_t* pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + else + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + 
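+//-------------------------------------------------------------------------------------
+// Usage note (editor's illustrative sketch; not part of the original header):
+// these overrides assume the caller has already confirmed hardware support, so a
+// typical pattern is to cache the CPUID check once and branch to the AVX2 path.
+// TransformPoint is a hypothetical application-side helper, not a library function.
+//
+//      XMVECTOR XM_CALLCONV TransformPoint(FXMVECTOR V, CXMMATRIX M)
+//      {
+//          static const bool s_hasAVX2 = DirectX::AVX2::XMVerifyAVX2Support();
+//          return s_hasAVX2 ? DirectX::AVX2::XMVector3TransformCoord(V, M)
+//                           : DirectX::XMVector3TransformCoord(V, M);
+//      }
+//
+// The same guard applies to the stream converters above, e.g. expanding a packed
+// half-precision buffer in place of the scalar XMConvertHalfToFloatStream:
+//
+//      const DirectX::PackedVector::HALF* in = ...;    // hypothetical input buffer
+//      float out[1024];                                // hypothetical output buffer
+//      DirectX::AVX2::XMConvertHalfToFloatStream(out, sizeof(float),
+//          in, sizeof(DirectX::PackedVector::HALF), 1024);
+//-------------------------------------------------------------------------------------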
+//-------------------------------------------------------------------------------------
+// Half2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource )
+{
+    assert(pSource);
+    __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+    return _mm_cvtph_ps( _mm_castps_si128( V ) );
+}
+
+inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V )
+{
+    assert(pDestination);
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Half4
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource )
+{
+    assert(pSource);
+    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_cvtph_ps( V );
+}
+
+inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V )
+{
+    assert(pDestination);
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
+}
+
+}; // namespace AVX2
+
+}; // namespace DirectX;
diff --git a/Extensions/DirectXMathBE.h b/Extensions/DirectXMathBE.h
index 3cc4e6c..3b8e4aa 100644
--- a/Extensions/DirectXMathBE.h
+++ b/Extensions/DirectXMathBE.h
@@ -1,103 +1,103 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathBE.h -- Big-endian swap extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#ifdef _MSC_VER -#pragma once -#endif - -#pragma warning(push) -#pragma warning(disable : 4987) -#include -#pragma warning(pop) - -#ifndef _M_ARM -#include -#endif - -#include - -namespace DirectX -{ -#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) -#define XM_CALLCONV __fastcall -typedef const DirectX::XMVECTOR& HXMVECTOR; -typedef const DirectX::XMMATRIX& FXMMATRIX; -#endif - -inline XMVECTOR XM_CALLCONV XMVectorEndian -( - FXMVECTOR V -) -{ -#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; - - int8x8x2_t tbl; - tbl.val[0] = vget_low_f32(V); - tbl.val[1] = vget_high_f32(V); - - const __n64 rL = vtbl2_u8( tbl, vget_low_f32(idx) ); - const __n64 rH = vtbl2_u8( tbl, vget_high_f32(idx) ); - return vcombine_f32( rL, rH ); -#else - XMVECTORU32 E; - E.v = V; - uint32_t value = E.u[0]; - E.u[0] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - value = E.u[1]; - E.u[1] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - value = E.u[2]; - E.u[2] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - value = E.u[3]; - E.u[3] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - return E.v; -#endif -} - - -#ifndef _M_ARM -namespace SSSE3 -{ - -inline bool XMVerifySSSE3Support() -{ - // Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors - - // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx - int CPUInfo[4] = {-1}; - __cpuid( CPUInfo, 0 ); - - if ( CPUInfo[0] < 1 ) - return false; - - __cpuid(CPUInfo, 1 ); - - // Check for SSSE3 instruction set. - return ( (CPUInfo[2] & 0x200) != 0 ); -} - -inline XMVECTOR XM_CALLCONV XMVectorEndian -( - FXMVECTOR V -) -{ - static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; - - __m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx ); - return _mm_castsi128_ps( Result ); -} - -}; // namespace SSSE3 -#endif // !_M_ARM - +//------------------------------------------------------------------------------------- +// DirectXMathBE.h -- Big-endian swap extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#pragma warning(push) +#pragma warning(disable : 4987) +#include +#pragma warning(pop) + +#ifndef _M_ARM +#include +#endif + +#include + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; + + int8x8x2_t tbl; + tbl.val[0] = vget_low_f32(V); + tbl.val[1] = vget_high_f32(V); + + const __n64 rL = vtbl2_u8( tbl, vget_low_f32(idx) ); + const __n64 rH = vtbl2_u8( tbl, vget_high_f32(idx) ); + return vcombine_f32( rL, rH ); +#else + XMVECTORU32 E; + E.v = V; + uint32_t value = E.u[0]; + E.u[0] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[1]; + E.u[1] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[2]; + E.u[2] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[3]; + E.u[3] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + return E.v; +#endif +} + + +#ifndef _M_ARM +namespace SSSE3 +{ + +inline bool XMVerifySSSE3Support() +{ + // Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; + __cpuid( CPUInfo, 0 ); + + if ( CPUInfo[0] < 1 ) + return false; + + __cpuid(CPUInfo, 1 ); + + // Check for SSSE3 instruction set. + return ( (CPUInfo[2] & 0x200) != 0 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ + static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; + + __m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx ); + return _mm_castsi128_ps( Result ); +} + +}; // namespace SSSE3 +#endif // !_M_ARM + }; // namespace DirectX; \ No newline at end of file diff --git a/Extensions/DirectXMathF16C.h b/Extensions/DirectXMathF16C.h index d486ed2..902f661 100644 --- a/Extensions/DirectXMathF16C.h +++ b/Extensions/DirectXMathF16C.h @@ -1,410 +1,410 @@ -//------------------------------------------------------------------------------------- -// DirectXMathF16C.h -- F16C/CVT16 extensions for SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#ifdef _MSC_VER -#pragma once -#endif - -#ifdef _M_ARM -#error F16C not supported on ARM platform -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1700) -#error F16C/CVT16 intrinsics requires Visual C++ 2012 or later. 
-#endif - -#pragma warning(push) -#pragma warning(disable : 4987) -#include -#pragma warning(pop) - -#include - -#include -#include - -namespace DirectX -{ -#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) -#define XM_CALLCONV __fastcall -typedef const DirectX::XMVECTOR& HXMVECTOR; -typedef const DirectX::XMMATRIX& FXMMATRIX; -#endif - -namespace F16C -{ - -inline bool XMVerifyF16CSupport() -{ - // Should return true for AMD "Piledriver" and Intel "Ivy Bridge" processors - // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) - - // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx - int CPUInfo[4] = {-1}; - __cpuid( CPUInfo, 0 ); - - if ( CPUInfo[0] < 1 ) - return false; - - __cpuid(CPUInfo, 1 ); - - // We check for F16C, AVX, OSXSAVE, and SSE4.1 - return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 ); -} - - -//------------------------------------------------------------------------------------- -// Data conversion -//------------------------------------------------------------------------------------- - -inline float XMConvertHalfToFloat( PackedVector::HALF Value ) -{ - __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); - __m128 V2 = _mm_cvtph_ps( V1 ); - return _mm_cvtss_f32( V2 ); -} - -inline PackedVector::HALF XMConvertFloatToHalf( float Value ) -{ - __m128 V1 = _mm_set_ss( Value ); - __m128i V2 = _mm_cvtps_ph( V1, 0 ); - return static_cast( _mm_cvtsi128_si32(V2) ); -} - -inline float* XMConvertHalfToFloatStream -( - _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, - _In_ size_t InputStride, - _In_ size_t HalfCount -) -{ - using namespace PackedVector; - - assert(pOutputStream); - assert(pInputStream); - const uint8_t* pHalf = reinterpret_cast(pInputStream); - uint8_t* pFloat = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = HalfCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(HALF)) - { - if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_stream_ps( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_store_ss( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); - pFloat += OutputStride; - i += 4; - } - } - } - else if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Scattered input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - 
pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_stream_ps( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - } - - for (; i < HalfCount; ++i) - { - *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); - pHalf += InputStride; - pFloat += OutputStride; - } - - return pOutputStream; -} - - -inline PackedVector::HALF* XMConvertFloatToHalfStream -( - _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, - _In_ size_t InputStride, - _In_ size_t FloatCount -) -{ - using namespace PackedVector; - - assert(pOutputStream); - assert(pInputStream); - const uint8_t* pFloat = reinterpret_cast(pInputStream); - uint8_t* pHalf = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = FloatCount >> 2; - if (four > 0) - { - if (InputStride == sizeof(float)) - { - if (OutputStride == sizeof(HALF)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned and packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - } - else - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned & packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( 
reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - } - } - else if (OutputStride == sizeof(HALF)) - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); - __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); - FV = _mm_blend_ps( FV, FT, 0xC ); - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - } - - for (; i < FloatCount; ++i) - { - *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); - pFloat += InputStride; - pHalf += OutputStride; - } - - return pOutputStream; -} - - -//------------------------------------------------------------------------------------- -// Half2 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) -{ - assert(pSource); - __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); - return _mm_cvtph_ps( _mm_castps_si128( V ) ); -} - -inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) -{ - assert(pDestination); - __m128i V1 = _mm_cvtps_ph( V, 0 ); - _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); -} - - -//------------------------------------------------------------------------------------- -// Half4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) -{ - assert(pSource); - __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); - return _mm_cvtph_ps( V ); -} - -inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) -{ - assert(pDestination); - __m128i V1 = _mm_cvtps_ph( V, 0 ); - _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); -} - -}; // namespace F16C - +//------------------------------------------------------------------------------------- +// DirectXMathF16C.h -- F16C/CVT16 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error F16C not supported on ARM platform +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1700) +#error F16C/CVT16 intrinsics requires Visual C++ 2012 or later. +#endif + +#pragma warning(push) +#pragma warning(disable : 4987) +#include +#pragma warning(pop) + +#include + +#include +#include + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +namespace F16C +{ + +inline bool XMVerifyF16CSupport() +{ + // Should return true for AMD "Piledriver" and Intel "Ivy Bridge" processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; + __cpuid( CPUInfo, 0 ); + + if ( CPUInfo[0] < 1 ) + return false; + + __cpuid(CPUInfo, 1 ); + + // We check for F16C, AVX, OSXSAVE, and SSE4.1 + return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 ); +} + + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pHalf = reinterpret_cast(pInputStream); + uint8_t* pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_store_ss( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = 
_mm_extract_ps( FV, 2 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pFloat = reinterpret_cast(pInputStream); + uint8_t* pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + else + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( 
_mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + +//------------------------------------------------------------------------------------- +// Half2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) +{ + assert(pSource); + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +} + +inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +} + + +//------------------------------------------------------------------------------------- +// Half4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) +{ + assert(pSource); + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +} + +inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +} + +}; // namespace F16C + }; // namespace DirectX; \ No newline at end of file diff --git a/Extensions/DirectXMathFMA3.h b/Extensions/DirectXMathFMA3.h index 6997d9d..5874014 100644 --- a/Extensions/DirectXMathFMA3.h +++ b/Extensions/DirectXMathFMA3.h @@ -1,405 +1,405 @@ -//------------------------------------------------------------------------------------- -// DirectXMathFMA3.h -- FMA3 extensions for SIMD C++ Math library -// -// 
THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error FMA3 not supported on ARM platform
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1700)
-#error FMA3 intrinsics requires Visual C++ 2012 or later.
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <immintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace FMA3
-{
-
-inline bool XMVerifyFMA3Support()
-{
-    // Should return true for AMD "Piledriver" and Intel "Haswell" processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for FMA3, AVX, OSXSAVE
-    return ( (CPUInfo[2] & 0x18001000) == 0x18001000 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fmadd_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fnmadd_ps( V1, V2, V3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_mul_ps( vResult, M.r[1] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector3
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - return vResult; -} - -inline XMVECTOR XM_CALLCONV XMVector3TransformCoord -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); - vResult = _mm_div_ps( vResult, W ); - return vResult; -} - -inline XMVECTOR XM_CALLCONV XMVector3TransformNormal -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_mul_ps( vResult, M.r[2] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - return vResult; -} - -XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); - -inline XMVECTOR XM_CALLCONV XMVector3Project -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); - Transform = FMA3::XMMatrixMultiply(Transform, Projection); - - XMVECTOR Result = FMA3::XMVector3TransformCoord(V, Transform); - - Result = FMA3::XMVectorMultiplyAdd(Result, Scale, Offset); - - return Result; -} - -inline XMVECTOR XM_CALLCONV XMVector3Unproject -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = FMA3::XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); - Transform = FMA3::XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - XMVECTOR Result = FMA3::XMVectorMultiplyAdd(V, Scale, Offset); - - return FMA3::XMVector3TransformCoord(Result, Transform); -} - - -//------------------------------------------------------------------------------------- -// Vector4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV 
XMVector4Transform -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W - vResult = _mm_mul_ps( vResult, M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Matrix -//------------------------------------------------------------------------------------- - -inline XMMATRIX XM_CALLCONV XMMatrixMultiply -( - CXMMATRIX M1, - CXMMATRIX M2 -) -{ - XMMATRIX mResult; - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[0] = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[1] = vX; - vW = M1.r[2]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[2] = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[3] = vX; - return mResult; -} - -inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r0 = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r1 = vX; - vW = M1.r[2]; - vX = 
_mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r2 = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r3 = vX; - - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); - - XMMATRIX mResult; - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -} - -}; // namespace FMA3 - -}; // namespace DirectX; +//------------------------------------------------------------------------------------- +// DirectXMathFMA3.h -- FMA3 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error FMA3 not supported on ARM platform +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1700) +#error FMA3 intrinsics requires Visual C++ 2012 or later. 
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <immintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace FMA3
+{
+
+inline bool XMVerifyFMA3Support()
+{
+    // Should return true for AMD "Piledriver" and Intel "Haswell" processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for FMA3, AVX, OSXSAVE
+    return ( (CPUInfo[2] & 0x18001000) == 0x18001000 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fmadd_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fnmadd_ps( V1, V2, V3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_mul_ps( vResult, M.r[1] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2],
M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA3::XMVector3TransformCoord(V, Transform); + + Result = FMA3::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA3::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA3::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA3::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix 
+//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = 
_mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + +}; // namespace FMA3 + +}; // namespace DirectX; diff --git a/Extensions/DirectXMathFMA4.h b/Extensions/DirectXMathFMA4.h index 2e0cbc3..2a3e1d0 100644 --- a/Extensions/DirectXMathFMA4.h +++ b/Extensions/DirectXMathFMA4.h @@ -1,414 +1,414 @@ -//------------------------------------------------------------------------------------- -// DirectXMathFMA4.h -- FMA4 extensions for SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#ifdef _MSC_VER -#pragma once -#endif - -#ifdef _M_ARM -#error FMA4 not supported on ARM platform -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1600) -#error FMA4 intrinsics requires Visual C++ 2010 Service Pack 1 or later. 
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <ammintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace FMA4
-{
-
-inline bool XMVerifyFMA4Support()
-{
-    // Should return true for AMD Bulldozer processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for AVX, OSXSAVE (required to access FMA4)
-    if ( (CPUInfo[2] & 0x18000000) != 0x18000000 )
-        return false;
-
-    __cpuid( CPUInfo, 0x80000000 );
-
-    if ( CPUInfo[0] < 0x80000001 )
-        return false;
-
-    // We check for FMA4
-    return ( CPUInfo[2] & 0x10000 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_macc_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_nmacc_ps( V1, V2, V3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_mul_ps( vResult, M.r[1] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector3
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_macc_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
-(
-
FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); - XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); - vResult = _mm_div_ps( vResult, W ); - return vResult; -} - -inline XMVECTOR XM_CALLCONV XMVector3TransformNormal -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_mul_ps( vResult, M.r[2] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); - return vResult; -} - -XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); - -inline XMVECTOR XM_CALLCONV XMVector3Project -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); - Transform = FMA4::XMMatrixMultiply(Transform, Projection); - - XMVECTOR Result = FMA4::XMVector3TransformCoord(V, Transform); - - Result = FMA4::XMVectorMultiplyAdd(Result, Scale, Offset); - - return Result; -} - -inline XMVECTOR XM_CALLCONV XMVector3Unproject -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = FMA4::XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); - Transform = FMA4::XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - XMVECTOR Result = FMA4::XMVectorMultiplyAdd(V, Scale, Offset); - - return FMA4::XMVector3TransformCoord(Result, Transform); -} - - -//------------------------------------------------------------------------------------- -// Vector4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector4Transform -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W - vResult = _mm_mul_ps( vResult, M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_macc_ps( vTemp, M.r[2], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); - return vResult; -} - - 
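Editor's note on XMVector3Project and XMVector3Unproject above: they are inverses for points inside the frustum, which makes a cheap self-test. A hedged sketch; the viewport numbers and tolerance are placeholders, not taken from the diff.

// Sketch (editor's addition): verify that project followed by unproject
// recovers the original world-space point.
#include <DirectXMath.h>
#include "DirectXMathFMA4.h"   // assumed path of the file patched here

using namespace DirectX;

inline bool RoundTripsThroughViewport(FXMVECTOR world, CXMMATRIX proj, CXMMATRIX view)
{
    XMVECTOR screen = FMA4::XMVector3Project(world, 0.f, 0.f, 1280.f, 720.f, 0.f, 1.f,
                                             proj, view, XMMatrixIdentity());
    XMVECTOR back   = FMA4::XMVector3Unproject(screen, 0.f, 0.f, 1280.f, 720.f, 0.f, 1.f,
                                               proj, view, XMMatrixIdentity());
    // Allow a small tolerance: the perspective divide and XMMatrixInverse are not exact.
    return XMVector3NearEqual(world, back, XMVectorReplicate(1e-4f));
}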
-//------------------------------------------------------------------------------------- -// Matrix -//------------------------------------------------------------------------------------- - -inline XMMATRIX XM_CALLCONV XMMatrixMultiply -( - CXMMATRIX M1, - CXMMATRIX M2 -) -{ - XMMATRIX mResult; - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[0] = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[1] = vX; - vW = M1.r[2]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[2] = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[3] = vX; - return mResult; -} - -inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r0 = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r1 = vX; - vW = M1.r[2]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r2 = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = 
_mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r3 = vX; - - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); - - XMMATRIX mResult; - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -} - -}; // namespace FMA4 - -}; // namespace DirectX; +//------------------------------------------------------------------------------------- +// DirectXMathFMA4.h -- FMA4 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error FMA4 not supported on ARM platform +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1600) +#error FMA4 intrinsics requires Visual C++ 2010 Service Pack 1 or later. 
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <ammintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace FMA4
+{
+
+inline bool XMVerifyFMA4Support()
+{
+    // Should return true for AMD Bulldozer processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for AVX, OSXSAVE (required to access FMA4)
+    if ( (CPUInfo[2] & 0x18000000) != 0x18000000 )
+        return false;
+
+    __cpuid( CPUInfo, 0x80000000 );
+
+    if ( CPUInfo[0] < 0x80000001 )
+        return false;
+
+    // We check for FMA4
+    return ( CPUInfo[2] & 0x10000 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_macc_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_nmacc_ps( V1, V2, V3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_mul_ps( vResult, M.r[1] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_macc_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
+(
+
FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA4::XMVector3TransformCoord(V, Transform); + + Result = FMA4::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA4::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA4::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA4::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + 
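Editor's note ahead of the Matrix section below: XMMatrixMultiply splats each component of an M1 row and accumulates it against the rows of M2 with _mm_macc_ps. A scalar reference of what each step computes (editor's sketch, not part of the diff):

// Sketch: result.row[i] = sum over k of M1[i][k] * M2.row[k], which is exactly
// what the _mm_permute_ps + _mm_macc_ps sequence below evaluates four lanes at a time.
void MatrixMultiplyReference(const float M1[4][4], const float M2[4][4], float out[4][4])
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
        {
            float sum = 0.0f;
            for (int k = 0; k < 4; ++k)
                sum += M1[i][k] * M2[k][j];   // splat of M1[i][k] against row k of M2
            out[i][j] = sum;
        }
}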
+//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = 
_mm_mul_ps(vX,M2.r[0]);
+    vX = _mm_macc_ps(vY,M2.r[1],vX);
+    vX = _mm_macc_ps(vZ,M2.r[2],vX);
+    vX = _mm_macc_ps(vW,M2.r[3],vX);
+    __m128 r3 = vX;
+
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2));
+
+    XMMATRIX mResult;
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1));
+    return mResult;
+}
+
+}; // namespace FMA4
+
+}; // namespace DirectX;
diff --git a/Extensions/DirectXMathSSE3.h b/Extensions/DirectXMathSSE3.h
index c61dde8..9d3911b 100644
--- a/Extensions/DirectXMathSSE3.h
+++ b/Extensions/DirectXMathSSE3.h
@@ -1,120 +1,120 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathSSE3.h -- SSE3 extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error SSE3 not supported on ARM platform
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <pmmintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace SSE3
-{
-
-inline bool XMVerifySSE3Support()
-{
-    // Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We only check for SSE3 instruction set. SSSE3 instructions are not used.
-    return ( (CPUInfo[2] & 0x1) != 0 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2Dot
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
-    vTemp = _mm_hadd_ps(vTemp,vTemp);
-    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(0,0,0,0));
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
-{
-    return SSE3::XMVector2Dot(V, V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3Dot
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
-    vTemp = _mm_and_ps( vTemp, g_XMMask3 );
-    vTemp = _mm_hadd_ps(vTemp,vTemp);
-    return _mm_hadd_ps(vTemp,vTemp);
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
-{
-    return SSE3::XMVector3Dot(V, V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVector4Dot
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
-    vTemp = _mm_hadd_ps( vTemp, vTemp );
-    return _mm_hadd_ps( vTemp, vTemp );
-}
-
-inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
-{
-    return SSE3::XMVector4Dot(V, V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle_0022( FXMVECTOR V )
-{
-    return _mm_moveldup_ps(V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V )
-{
-    return _mm_movehdup_ps(V);
-}
-
-}; // namespace SSE3
-
+//-------------------------------------------------------------------------------------
+// DirectXMathSSE3.h -- SSE3 extensions for SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#ifdef _M_ARM
+#error SSE3 not supported on ARM platform
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <pmmintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace SSE3
+{
+
+inline bool XMVerifySSE3Support()
+{
+    // Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We only check for SSE3 instruction set. SSSE3 instructions are not used.
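+    // Editor's addition (comment only, not in the upstream file): CPUID function 1
+    // reports SSE3 in ECX bit 0, hence the 0x1 mask below; SSSE3 would be ECX bit 9,
+    // which these helpers deliberately do not require.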
+    return ( (CPUInfo[2] & 0x1) != 0 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_hadd_ps(vTemp,vTemp);
+    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(0,0,0,0));
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
+{
+    return SSE3::XMVector2Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_and_ps( vTemp, g_XMMask3 );
+    vTemp = _mm_hadd_ps(vTemp,vTemp);
+    return _mm_hadd_ps(vTemp,vTemp);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
+{
+    return SSE3::XMVector3Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector4Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_hadd_ps( vTemp, vTemp );
+    return _mm_hadd_ps( vTemp, vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
+{
+    return SSE3::XMVector4Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle_0022( FXMVECTOR V )
+{
+    return _mm_moveldup_ps(V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V )
+{
+    return _mm_movehdup_ps(V);
+}
+
+}; // namespace SSE3
+
 }; // namespace DirectX;
\ No newline at end of file
diff --git a/Extensions/DirectXMathSSE4.h b/Extensions/DirectXMathSSE4.h
index 8495626..6f10dc1 100644
--- a/Extensions/DirectXMathSSE4.h
+++ b/Extensions/DirectXMathSSE4.h
@@ -1,422 +1,422 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error SSE4 not supported on ARM platform
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <smmintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace SSE4
-{
-
-inline bool XMVerifySSE4Support()
-{
-    // Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
-    return ( (CPUInfo[2] & 0x80000) == 0x80000 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
-{
-    assert( y != nullptr );
-    *((int*)y) = _mm_extract_ps( V, 1 );
-}
-
-inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
-{
-    assert( z != nullptr );
-    *((int*)z) = _mm_extract_ps( V, 2 );
-}
-
-inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
-{
-    assert( w != nullptr );
-    *((int*)w) = _mm_extract_ps( V, 3 );
-}
-
-inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
-{
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-}
-
-inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
-{
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-}
-
-inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
-{
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-}
-
-inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
-{
-    assert( y != nullptr );
-    __m128i V1 = _mm_castps_si128( V );
-    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-}
-
-inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
-{
-    assert( z != nullptr );
-    __m128i V1 = _mm_castps_si128( V );
-    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-}
-
-inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
-{
-    assert( w != nullptr );
-    __m128i V1 = _mm_castps_si128( V );
-    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
-{
-    XMVECTOR vResult = _mm_set_ss(y);
-    vResult = _mm_insert_ps( V, vResult, 0x10 );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
-{
-    XMVECTOR vResult = _mm_set_ss(z);
-    vResult = _mm_insert_ps( V, vResult, 0x20 );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
-{
-    XMVECTOR vResult = _mm_set_ss(w);
-    vResult = _mm_insert_ps( V, vResult, 0x30 );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
-{
-    __m128i vResult = _mm_castps_si128( V );
-    vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
-    return _mm_castsi128_ps( vResult );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
-{
-    __m128i vResult = _mm_castps_si128( V );
-    vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
-    return _mm_castsi128_ps( vResult );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
-{
-    __m128i vResult = _mm_castps_si128( V );
-    vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
-    return _mm_castsi128_ps( vResult );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
-{
-    return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
-{
-    return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
-{
-    return _mm_floor_ps( V );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
-{
-    return _mm_ceil_ps( V );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 ) -{ - return _mm_dp_ps( V1, V2, 0x3f ); -} - -inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V ) -{ - return SSE4::XMVector2Dot(V, V); -} - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_rsqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -} - -inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -} - -inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Vector3 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 ) -{ - return _mm_dp_ps( V1, V2, 0x7f ); -} - -inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) -{ - return SSE4::XMVector3Dot(V, V); -} - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_rsqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -} - -inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -} - -inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create 
zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Vector4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 ) -{ - return _mm_dp_ps( V1, V2, 0xff ); -} - -inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) -{ - return SSE4::XMVector4Dot(V, V); -} - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_rsqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -} - -inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -} - -inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Plane -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P ) -{ - XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, P); -} - -inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, 
set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(P,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vLengthSq); - return vResult; -} - -}; // namespace SSE4 - +//------------------------------------------------------------------------------------- +// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error SSE4 not supported on ARM platform +#endif + +#pragma warning(push) +#pragma warning(disable : 4987) +#include <intrin.h> +#pragma warning(pop) + +#include <smmintrin.h> + +#include <DirectXMath.h> + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +namespace SSE4 +{ + +inline bool XMVerifySSE4Support() +{ + // Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; + __cpuid( CPUInfo, 0 ); + + if ( CPUInfo[0] < 1 ) + return false; + + __cpuid(CPUInfo, 1 ); + + // We only check for the SSE4.1 instruction set; SSE4.2 instructions are not used. + return ( (CPUInfo[2] & 0x80000) == 0x80000 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V) +{ + assert( y != nullptr ); + *((int*)y) = _mm_extract_ps( V, 1 ); +} + +inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V) +{ + assert( z != nullptr ); + *((int*)z) = _mm_extract_ps( V, 2 ); +} + +inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V) +{ + assert( w != nullptr ); + *((int*)w) = _mm_extract_ps( V, 3 ); +} + +inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) +{ + __m128i V1 = _mm_castps_si128( V ); + return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) ); +} + +inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) +{ + __m128i V1 = _mm_castps_si128( V ); + return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) ); +} + +inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) +{ + __m128i V1 = _mm_castps_si128( V ); + return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) ); +} + +inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V) +{ + assert( y != nullptr ); + __m128i V1 = _mm_castps_si128( V ); + *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) ); +} + +inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V) +{ + assert( z != nullptr ); + __m128i V1 = _mm_castps_si128( V ); + *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) ); +} + +inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V) +{ + assert( w != nullptr ); + __m128i V1 = _mm_castps_si128( V ); + *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) ); +} + +inline
XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) +{ + XMVECTOR vResult = _mm_set_ss(y); + vResult = _mm_insert_ps( V, vResult, 0x10 ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) +{ + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps( V, vResult, 0x20 ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) +{ + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps( V, vResult, 0x30 ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) +{ + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 ); + return _mm_castsi128_ps( vResult ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) +{ + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 ); + return _mm_castsi128_ps( vResult ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) +{ + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 ); + return _mm_castsi128_ps( vResult ); +} + +inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V ) +{ + return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); +} + +inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V ) +{ + return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); +} + +inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V ) +{ + return _mm_floor_ps( V ); +} + +inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V ) +{ + return _mm_ceil_ps( V ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0x3f ); +} + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector2Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity,
set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0x7f ); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector3Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0xff ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector4Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR 
XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Plane +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P ) +{ + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +} + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +} + +}; // namespace SSE4 + }; // namespace DirectX; \ No newline at end of file diff --git a/Inc/DirectXCollision.h b/Inc/DirectXCollision.h index a6e341c..8b51516 100644 --- a/Inc/DirectXCollision.h +++ b/Inc/DirectXCollision.h @@ -1,341 +1,341 @@ -//------------------------------------------------------------------------------------- -// DirectXCollision.h -- C++ Collision Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#include "DirectXMath.h" - -namespace DirectX -{ - -enum ContainmentType -{ - DISJOINT = 0, - INTERSECTS = 1, - CONTAINS = 2, -}; - -enum PlaneIntersectionType -{ - FRONT = 0, - INTERSECTING = 1, - BACK = 2, -}; - -struct BoundingBox; -struct BoundingOrientedBox; -struct BoundingFrustum; - -#pragma warning(push) -#pragma warning(disable:4324 4820) -// C4324: alignment padding warnings -// C4820: Off by default noise - -//------------------------------------------------------------------------------------- -// Bounding sphere -//------------------------------------------------------------------------------------- -struct BoundingSphere -{ - XMFLOAT3 Center; // Center of the sphere. - float Radius; // Radius of the sphere. - - // Creators - BoundingSphere() : Center(0,0,0), Radius( 1.f ) {} - XM_CONSTEXPR BoundingSphere( _In_ const XMFLOAT3& center, _In_ float radius ) - : Center(center), Radius(radius) {} - BoundingSphere( _In_ const BoundingSphere& sp ) - : Center(sp.Center), Radius(sp.Radius) {} - - // Methods - BoundingSphere& operator=( _In_ const BoundingSphere& sp ) { Center = sp.Center; Radius = sp.Radius; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - // Transform the sphere - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sh ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-sphere test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-sphere test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-sphere test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test sphere against six planes (see BoundingFrustum::GetPlanes) - - // Static methods - static void CreateMerged( _Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2 ); - - static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingBox& box ); - static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box ); - - static void CreateFromPoints( _Out_ BoundingSphere& Out, _In_ size_t Count, - _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); - - static void CreateFromFrustum( _Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr ); -}; - 
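// Illustrative usage sketch (not part of the header): exercising the BoundingSphere
// API declared above. The variable names and values here are arbitrary; the sketch
// assumes only DirectXMath and the declarations in this file.
inline void ExampleSphereQueries()
{
    using namespace DirectX;

    BoundingSphere sphere( XMFLOAT3( 0.f, 0.f, 0.f ), 2.f );

    // Point containment: a point closer than Radius to Center reports CONTAINS.
    ContainmentType ct = sphere.Contains( XMVectorSet( 1.f, 0.f, 0.f, 0.f ) );

    // Ray-sphere test: Direction must be unit length; Dist receives the hit distance.
    float dist = 0.f;
    XMVECTOR rayOrigin = XMVectorSet( 0.f, 0.f, -10.f, 0.f );
    XMVECTOR rayDir = XMVector3Normalize( XMVectorSet( 0.f, 0.f, 1.f, 0.f ) );
    bool hit = sphere.Intersects( rayOrigin, rayDir, dist ); // expect a hit at dist == 8

    (void)ct; (void)hit; // results would be consumed by the caller in real code
}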
-//------------------------------------------------------------------------------------- -// Axis-aligned bounding box -//------------------------------------------------------------------------------------- -struct BoundingBox -{ - static const size_t CORNER_COUNT = 8; - - XMFLOAT3 Center; // Center of the box. - XMFLOAT3 Extents; // Distance from the center to each side. - - // Creators - BoundingBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ) {} - XM_CONSTEXPR BoundingBox( _In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents ) - : Center(center), Extents(extents) {} - BoundingBox( _In_ const BoundingBox& box ) : Center(box.Center), Extents(box.Extents) {} - - // Methods - BoundingBox& operator=( _In_ const BoundingBox& box) { Center = box.Center; Extents = box.Extents; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - - void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; - // Gets the 8 corners of the box - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sh ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-Box test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-box test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-Box test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test box against six planes (see BoundingFrustum::GetPlanes) - - // Static methods - static void CreateMerged( _Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2 ); - - static void CreateFromSphere( _Out_ BoundingBox& Out, _In_ const BoundingSphere& sh ); - - static void XM_CALLCONV CreateFromPoints( _Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2 ); - static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ size_t Count, - _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); -}; - -//------------------------------------------------------------------------------------- -// Oriented bounding box -//------------------------------------------------------------------------------------- -struct BoundingOrientedBox -{ - static const size_t CORNER_COUNT = 8; - - XMFLOAT3 Center; // Center of the box. - XMFLOAT3 Extents; // Distance from the center to each side. - XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). 
- - // Creators - BoundingOrientedBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ), Orientation(0,0,0, 1.f ) {} - XM_CONSTEXPR BoundingOrientedBox( _In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation ) - : Center(_Center), Extents(_Extents), Orientation(_Orientation) {} - BoundingOrientedBox( _In_ const BoundingOrientedBox& box ) - : Center(box.Center), Extents(box.Extents), Orientation(box.Orientation) {} - - // Methods - BoundingOrientedBox& operator=( _In_ const BoundingOrientedBox& box ) { Center = box.Center; Extents = box.Extents; Orientation = box.Orientation; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - - void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; - // Gets the 8 corners of the box - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sh ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-OrientedBox test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-OrientedBox test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-OrientedBox test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) - - // Static methods - static void CreateFromBoundingBox( _Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box ); - - static void CreateFromPoints( _Out_ BoundingOrientedBox& Out, _In_ size_t Count, - _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); -}; - -//------------------------------------------------------------------------------------- -// Bounding frustum -//------------------------------------------------------------------------------------- -struct BoundingFrustum -{ - static const size_t CORNER_COUNT = 8; - - XMFLOAT3 Origin; // Origin of the frustum (and projection). - XMFLOAT4 Orientation; // Quaternion representing rotation. - - float RightSlope; // Positive X slope (X/Z). - float LeftSlope; // Negative X slope. - float TopSlope; // Positive Y slope (Y/Z). - float BottomSlope; // Negative Y slope. - float Near, Far; // Z of the near plane and far plane. 
- - // Creators - BoundingFrustum() : Origin(0,0,0), Orientation(0,0,0, 1.f), RightSlope( 1.f ), LeftSlope( -1.f ), - TopSlope( 1.f ), BottomSlope( -1.f ), Near(0), Far( 1.f ) {} - XM_CONSTEXPR BoundingFrustum( _In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, - _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, - _In_ float _Near, _In_ float _Far ) - : Origin(_Origin), Orientation(_Orientation), - RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), - Near(_Near), Far(_Far) {} - BoundingFrustum( _In_ const BoundingFrustum& fr ) - : Origin(fr.Origin), Orientation(fr.Orientation), RightSlope(fr.RightSlope), LeftSlope(fr.LeftSlope), - TopSlope(fr.TopSlope), BottomSlope(fr.BottomSlope), Near(fr.Near), Far(fr.Far) {} - BoundingFrustum( _In_ CXMMATRIX Projection ) { CreateFromMatrix( *this, Projection ); } - - // Methods - BoundingFrustum& operator=( _In_ const BoundingFrustum& fr ) { Origin=fr.Origin; Orientation=fr.Orientation; - RightSlope=fr.RightSlope; LeftSlope=fr.LeftSlope; - TopSlope=fr.TopSlope; BottomSlope=fr.BottomSlope; - Near=fr.Near; Far=fr.Far; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - - void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; - // Gets the 8 corners of the frustum - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sp ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - // Frustum-Frustum test - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-Frustum test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-Frustum test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-Frustum test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test frustum against six planes (see BoundingFrustum::GetPlanes) - - void GetPlanes( _Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, - _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane ) const; - // Create 6 Planes representation of Frustum - - // Static methods - static void XM_CALLCONV CreateFromMatrix( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection ); -}; - -//----------------------------------------------------------------------------- -// Triangle intersection testing routines. 
-//----------------------------------------------------------------------------- -namespace TriangleTests -{ - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist ); - // Ray-Triangle - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2 ); - // Triangle-Triangle - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane ); - // Plane-Triangle - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, - _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, - _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ); - // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) -}; - -#pragma warning(pop) - -/**************************************************************************** - * - * Implementation - * - ****************************************************************************/ - -#pragma warning(push) -#pragma warning(disable : 4068 4365 4616 6001) -// C4068/4616: ignore unknown pragmas -// C4365: Off by default noise -// C6001: False positives - -#pragma prefast(push) -#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") - -#include "DirectXCollision.inl" - -#pragma prefast(pop) -#pragma warning(pop) - -}; // namespace DirectX - +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + +enum ContainmentType +{ + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2, +}; + +enum PlaneIntersectionType +{ + FRONT = 0, + INTERSECTING = 1, + BACK = 2, +}; + +struct BoundingBox; +struct BoundingOrientedBox; +struct BoundingFrustum; + +#pragma warning(push) +#pragma warning(disable:4324 4820) +// C4324: alignment padding warnings +// C4820: Off by default noise + +//------------------------------------------------------------------------------------- +// Bounding sphere +//------------------------------------------------------------------------------------- +struct BoundingSphere +{ + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. 
+ + // Creators + BoundingSphere() : Center(0,0,0), Radius( 1.f ) {} + XM_CONSTEXPR BoundingSphere( _In_ const XMFLOAT3& center, _In_ float radius ) + : Center(center), Radius(radius) {} + BoundingSphere( _In_ const BoundingSphere& sp ) + : Center(sp.Center), Radius(sp.Radius) {} + + // Methods + BoundingSphere& operator=( _In_ const BoundingSphere& sp ) { Center = sp.Center; Radius = sp.Radius; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + // Transform the sphere + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-sphere test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-sphere test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-sphere test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2 ); + + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingBox& box ); + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box ); + + static void CreateFromPoints( _Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); + + static void CreateFromFrustum( _Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr ); +}; + +//------------------------------------------------------------------------------------- +// Axis-aligned bounding box +//------------------------------------------------------------------------------------- +struct BoundingBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. 
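+ // Note: Extents holds half-widths, so the box spans Center - Extents to
+ // Center + Extents along each axis (the full side length is twice the extent).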
+ + // Creators + BoundingBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ) {} + XM_CONSTEXPR BoundingBox( _In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents ) + : Center(center), Extents(extents) {} + BoundingBox( _In_ const BoundingBox& box ) : Center(box.Center), Extents(box.Extents) {} + + // Methods + BoundingBox& operator=( _In_ const BoundingBox& box) { Center = box.Center; Extents = box.Extents; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Box test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-box test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Box test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2 ); + + static void CreateFromSphere( _Out_ BoundingBox& Out, _In_ const BoundingSphere& sh ); + + static void XM_CALLCONV CreateFromPoints( _Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2 ); + static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Oriented bounding box +//------------------------------------------------------------------------------------- +struct BoundingOrientedBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). 
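+ // Note: the collision functions expect Orientation to stay normalized;
+ // debug builds validate unit length before using the quaternion.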
+ + // Creators + BoundingOrientedBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ), Orientation(0,0,0, 1.f ) {} + XM_CONSTEXPR BoundingOrientedBox( _In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation ) + : Center(_Center), Extents(_Extents), Orientation(_Orientation) {} + BoundingOrientedBox( _In_ const BoundingOrientedBox& box ) + : Center(box.Center), Extents(box.Extents), Orientation(box.Orientation) {} + + // Methods + BoundingOrientedBox& operator=( _In_ const BoundingOrientedBox& box ) { Center = box.Center; Extents = box.Extents; Orientation = box.Orientation; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-OrientedBox test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-OrientedBox test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-OrientedBox test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox( _Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box ); + + static void CreateFromPoints( _Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Bounding frustum +//------------------------------------------------------------------------------------- +struct BoundingFrustum +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X slope (X/Z). + float LeftSlope; // Negative X slope. + float TopSlope; // Positive Y slope (Y/Z). + float BottomSlope; // Negative Y slope. + float Near, Far; // Z of the near plane and far plane. 
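+ // Note: the frustum is stored in apex/slope form: in its local space the apex
+ // sits at Origin looking down +Z, and a point at depth z lies inside when
+ // LeftSlope*z <= x <= RightSlope*z, BottomSlope*z <= y <= TopSlope*z, and
+ // Near <= z <= Far; Orientation then rotates this local volume into world space.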
+ + // Creators + BoundingFrustum() : Origin(0,0,0), Orientation(0,0,0, 1.f), RightSlope( 1.f ), LeftSlope( -1.f ), + TopSlope( 1.f ), BottomSlope( -1.f ), Near(0), Far( 1.f ) {} + XM_CONSTEXPR BoundingFrustum( _In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, + _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, + _In_ float _Near, _In_ float _Far ) + : Origin(_Origin), Orientation(_Orientation), + RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), + Near(_Near), Far(_Far) {} + BoundingFrustum( _In_ const BoundingFrustum& fr ) + : Origin(fr.Origin), Orientation(fr.Orientation), RightSlope(fr.RightSlope), LeftSlope(fr.LeftSlope), + TopSlope(fr.TopSlope), BottomSlope(fr.BottomSlope), Near(fr.Near), Far(fr.Far) {} + BoundingFrustum( _In_ CXMMATRIX Projection ) { CreateFromMatrix( *this, Projection ); } + + // Methods + BoundingFrustum& operator=( _In_ const BoundingFrustum& fr ) { Origin=fr.Origin; Orientation=fr.Orientation; + RightSlope=fr.RightSlope; LeftSlope=fr.LeftSlope; + TopSlope=fr.TopSlope; BottomSlope=fr.BottomSlope; + Near=fr.Near; Far=fr.Far; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the frustum + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sp ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + // Frustum-Frustum test + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Frustum test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-Frustum test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Frustum test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes( _Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane ) const; + // Create 6 Planes representation of Frustum + + // Static methods + static void XM_CALLCONV CreateFromMatrix( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection ); +}; + +//----------------------------------------------------------------------------- +// Triangle intersection testing routines. 
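// Illustrative sketch (not part of the header): a typical pattern is to extract the
// six frustum planes once with BoundingFrustum::GetPlanes and then batch-test
// primitives via the ContainedBy overloads, such as the triangle routine declared below.
inline DirectX::ContainmentType ClassifyTriangle( const DirectX::BoundingFrustum& fr,
    DirectX::FXMVECTOR V0, DirectX::FXMVECTOR V1, DirectX::FXMVECTOR V2 )
{
    using namespace DirectX;
    XMVECTOR planes[6];
    fr.GetPlanes( &planes[0], &planes[1], &planes[2], &planes[3], &planes[4], &planes[5] );
    // Returns DISJOINT, INTERSECTS, or CONTAINS relative to the frustum volume.
    return TriangleTests::ContainedBy( V0, V1, V2,
        planes[0], planes[1], planes[2], planes[3], planes[4], planes[5] );
}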
+//----------------------------------------------------------------------------- +namespace TriangleTests +{ + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist ); + // Ray-Triangle + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2 ); + // Triangle-Triangle + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane ); + // Plane-Triangle + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ); + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) +}; + +#pragma warning(pop) + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable : 4068 4365 4616 6001) +// C4068/4616: ignore unknown pragmas +// C4365: Off by default noise +// C6001: False positives + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#include "DirectXCollision.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace DirectX + diff --git a/Inc/DirectXCollision.inl b/Inc/DirectXCollision.inl index 470e28b..752bba2 100644 --- a/Inc/DirectXCollision.inl +++ b/Inc/DirectXCollision.inl @@ -1,4811 +1,4811 @@ -//------------------------------------------------------------------------------------- -// DirectXCollision.inl -- C++ Collision Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = -{ - { -1.0f, -1.0f, 1.0f, 0.0f }, - { 1.0f, -1.0f, 1.0f, 0.0f }, - { 1.0f, 1.0f, 1.0f, 0.0f }, - { -1.0f, 1.0f, 1.0f, 0.0f }, - { -1.0f, -1.0f, -1.0f, 0.0f }, - { 1.0f, -1.0f, -1.0f, 0.0f }, - { 1.0f, 1.0f, -1.0f, 0.0f }, - { -1.0f, 1.0f, -1.0f, 0.0f }, -}; - -XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { 1e-20f, 1e-20f, 1e-20f, 1e-20f }; -XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { -1e-20f, -1e-20f, -1e-20f, -1e-20f }; -XMGLOBALCONST XMVECTORF32 g_FltMin = { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX }; -XMGLOBALCONST XMVECTORF32 g_FltMax = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - -namespace Internal -{ - -//----------------------------------------------------------------------------- -// Return true if any of the elements of a 3 vector are equal to 0xffffffff. -// Slightly more efficient than using XMVector3EqualInt. -//----------------------------------------------------------------------------- -inline bool XMVector3AnyTrue( _In_ FXMVECTOR V ) -{ - // Duplicate the fourth element from the first element. 
- XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V); - - return XMComparisonAnyTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) ); -} - - -//----------------------------------------------------------------------------- -// Return true if all of the elements of a 3 vector are equal to 0xffffffff. -// Slightly more efficient than using XMVector3EqualInt. -//----------------------------------------------------------------------------- -inline bool XMVector3AllTrue( _In_ FXMVECTOR V ) -{ - // Duplicate the fourth element from the first element. - XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>( V ); - - return XMComparisonAllTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) ); -} - -#if defined(_PREFAST_) || !defined(NDEBUG) - -XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; -XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; -XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; - -//----------------------------------------------------------------------------- -// Return true if the vector is a unit vector (length == 1). -//----------------------------------------------------------------------------- -inline bool XMVector3IsUnit( _In_ FXMVECTOR V ) -{ - XMVECTOR Difference = XMVector3Length( V ) - XMVectorSplatOne(); - return XMVector4Less( XMVectorAbs( Difference ), g_UnitVectorEpsilon ); -} - -//----------------------------------------------------------------------------- -// Return true if the quaternion is a unit quaternion. -//----------------------------------------------------------------------------- -inline bool XMQuaternionIsUnit( _In_ FXMVECTOR Q ) -{ - XMVECTOR Difference = XMVector4Length( Q ) - XMVectorSplatOne(); - return XMVector4Less( XMVectorAbs( Difference ), g_UnitQuaternionEpsilon ); -} - -//----------------------------------------------------------------------------- -// Return true if the plane is a unit plane. -//----------------------------------------------------------------------------- -inline bool XMPlaneIsUnit( _In_ FXMVECTOR Plane ) -{ - XMVECTOR Difference = XMVector3Length( Plane ) - XMVectorSplatOne(); - return XMVector4Less( XMVectorAbs( Difference ), g_UnitPlaneEpsilon ); -} - -#endif // _PREFAST_ || !NDEBUG - -//----------------------------------------------------------------------------- -inline XMVECTOR XMPlaneTransform( _In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) -{ - XMVECTOR vNormal = XMVector3Rotate( Plane, Rotation ); - XMVECTOR vD = XMVectorSplatW( Plane ) - XMVector3Dot( vNormal, Translation ); - - return XMVectorInsert<0, 0, 0, 0, 1>( vNormal, vD ); -} - -//----------------------------------------------------------------------------- -// Return the point on the line segment (S1, S2) nearest the point P.
-//----------------------------------------------------------------------------- -inline XMVECTOR PointOnLineSegmentNearestPoint( _In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P ) -{ - XMVECTOR Dir = S2 - S1; - XMVECTOR Projection = ( XMVector3Dot( P, Dir ) - XMVector3Dot( S1, Dir ) ); - XMVECTOR LengthSq = XMVector3Dot( Dir, Dir ); - - XMVECTOR t = Projection * XMVectorReciprocal( LengthSq ); - XMVECTOR Point = S1 + t * Dir; - - // t < 0 - XMVECTOR SelectS1 = XMVectorLess( Projection, XMVectorZero() ); - Point = XMVectorSelect( Point, S1, SelectS1 ); - - // t > 1 - XMVECTOR SelectS2 = XMVectorGreater( Projection, LengthSq ); - Point = XMVectorSelect( Point, S2, SelectS2 ); - - return Point; -} - -//----------------------------------------------------------------------------- -// Test if the point (P) on the plane of the triangle is inside the triangle -// (V0, V1, V2). -//----------------------------------------------------------------------------- -inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle( _In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2 ) -{ - // Compute the triangle normal. - XMVECTOR N = XMVector3Cross( V2 - V0, V1 - V0 ); - - // Compute the cross products of the vector from the base of each edge to - // the point with each edge vector. - XMVECTOR C0 = XMVector3Cross( P - V0, V1 - V0 ); - XMVECTOR C1 = XMVector3Cross( P - V1, V2 - V1 ); - XMVECTOR C2 = XMVector3Cross( P - V2, V0 - V2 ); - - // If the cross product points in the same direction as the normal then the - // point is inside the edge (it is zero if it is on the edge). - XMVECTOR Zero = XMVectorZero(); - XMVECTOR Inside0 = XMVectorGreaterOrEqual( XMVector3Dot( C0, N ), Zero ); - XMVECTOR Inside1 = XMVectorGreaterOrEqual( XMVector3Dot( C1, N ), Zero ); - XMVECTOR Inside2 = XMVectorGreaterOrEqual( XMVector3Dot( C2, N ), Zero ); - - // If the point is inside all of the edges, it is inside.
- return XMVectorAndInt( XMVectorAndInt( Inside0, Inside1 ), Inside2 ); -} - -//----------------------------------------------------------------------------- -inline bool SolveCubic( _In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v ) -{ - float p, q, h, rc, d, theta, costh3, sinth3; - - p = f - e * e / 3.0f; - q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f; - h = q * q / 4.0f + p * p * p / 27.0f; - - if( h > 0.0 ) - { - *t = *u = *v = 0.f; - return false; // only one real root - } - - if( ( h == 0.0 ) && ( q == 0.0 ) ) // all the same root - { - *t = - e / 3; - *u = - e / 3; - *v = - e / 3; - - return true; - } - - d = sqrtf( q * q / 4.0f - h ); - if( d < 0 ) - rc = -powf( -d, 1.0f / 3.0f ); - else - rc = powf( d, 1.0f / 3.0f ); - - theta = XMScalarACos( -q / ( 2.0f * d ) ); - costh3 = XMScalarCos( theta / 3.0f ); - sinth3 = sqrtf( 3.0f ) * XMScalarSin( theta / 3.0f ); - *t = 2.0f * rc * costh3 - e / 3.0f; - *u = -rc * ( costh3 + sinth3 ) - e / 3.0f; - *v = -rc * ( costh3 - sinth3 ) - e / 3.0f; - - return true; -} - -//----------------------------------------------------------------------------- -inline XMVECTOR CalculateEigenVector( _In_ float m11, _In_ float m12, _In_ float m13, - _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e ) -{ - float fTmp[3]; - fTmp[0] = ( float )( m12 * m23 - m13 * ( m22 - e ) ); - fTmp[1] = ( float )( m13 * m12 - m23 * ( m11 - e ) ); - fTmp[2] = ( float )( ( m11 - e ) * ( m22 - e ) - m12 * m12 ); - - XMVECTOR vTmp = XMLoadFloat3( (XMFLOAT3*)fTmp ); - - if( XMVector3Equal( vTmp, XMVectorZero() ) ) // planar or linear - { - float f1, f2, f3; - - // we only have one equation - find a valid one - if( ( m11 - e != 0.0 ) || ( m12 != 0.0 ) || ( m13 != 0.0 ) ) - { - f1 = m11 - e; f2 = m12; f3 = m13; - } - else if( ( m12 != 0.0 ) || ( m22 - e != 0.0 ) || ( m23 != 0.0 ) ) - { - f1 = m12; f2 = m22 - e; f3 = m23; - } - else if( ( m13 != 0.0 ) || ( m23 != 0.0 ) || ( m33 - e != 0.0 ) ) - { - f1 = m13; f2 = m23; f3 = m33 - e; - } - else - { - // error, we'll just make something up - we have NO context - f1 = 1.0; f2 = 0.0; f3 = 0.0; - } - - if( f1 == 0.0 ) - vTmp = XMVectorSetX( vTmp, 0.0f ); - else - vTmp = XMVectorSetX( vTmp, 1.0f ); - - if( f2 == 0.0 ) - vTmp = XMVectorSetY( vTmp, 0.0f ); - else - vTmp = XMVectorSetY( vTmp, 1.0f ); - - if( f3 == 0.0 ) - { - vTmp = XMVectorSetZ( vTmp, 0.0f ); - // recalculate y to make equation work - if( m12 != 0.0 ) - vTmp = XMVectorSetY( vTmp, ( float )( -f1 / f2 ) ); - } - else - { - vTmp = XMVectorSetZ( vTmp, ( float )( ( f2 - f1 ) / f3 ) ); - } - } - - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) > 1e-5f ) - { - return XMVector3Normalize( vTmp ); - } - else - { - // Multiply by a value large enough to make the vector non-zero. 
- vTmp *= 1e5f; - return XMVector3Normalize( vTmp ); - } -} - -//----------------------------------------------------------------------------- -inline bool CalculateEigenVectors( _In_ float m11, _In_ float m12, _In_ float m13, - _In_ float m22, _In_ float m23, _In_ float m33, - _In_ float e1, _In_ float e2, _In_ float e3, - _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) -{ - *pV1 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e1 ); - *pV2 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e2 ); - *pV3 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e3 ); - - bool v1z = false; - bool v2z = false; - bool v3z = false; - - XMVECTOR Zero = XMVectorZero(); - - if ( XMVector3Equal( *pV1, Zero ) ) - v1z = true; - - if ( XMVector3Equal( *pV2, Zero ) ) - v2z = true; - - if ( XMVector3Equal( *pV3, Zero )) - v3z = true; - - bool e12 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV2 ) ) ) > 0.1f ); // check for non-orthogonal vectors - bool e13 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV3 ) ) ) > 0.1f ); - bool e23 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV2, *pV3 ) ) ) > 0.1f ); - - if( ( v1z && v2z && v3z ) || ( e12 && e13 && e23 ) || - ( e12 && v3z ) || ( e13 && v2z ) || ( e23 && v1z ) ) // all eigenvectors are 0- any basis set - { - *pV1 = g_XMIdentityR0.v; - *pV2 = g_XMIdentityR1.v; - *pV3 = g_XMIdentityR2.v; - return true; - } - - if( v1z && v2z ) - { - XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV3 ); - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) - { - vTmp = XMVector3Cross( g_XMIdentityR0, *pV3 ); - } - *pV1 = XMVector3Normalize( vTmp ); - *pV2 = XMVector3Cross( *pV3, *pV1 ); - return true; - } - - if( v3z && v1z ) - { - XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV2 ); - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) - { - vTmp = XMVector3Cross( g_XMIdentityR0, *pV2 ); - } - *pV3 = XMVector3Normalize( vTmp ); - *pV1 = XMVector3Cross( *pV2, *pV3 ); - return true; - } - - if( v2z && v3z ) - { - XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV1 ); - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) - { - vTmp = XMVector3Cross( g_XMIdentityR0, *pV1 ); - } - *pV2 = XMVector3Normalize( vTmp ); - *pV3 = XMVector3Cross( *pV1, *pV2 ); - return true; - } - - if( ( v1z ) || e12 ) - { - *pV1 = XMVector3Cross( *pV2, *pV3 ); - return true; - } - - if( ( v2z ) || e23 ) - { - *pV2 = XMVector3Cross( *pV3, *pV1 ); - return true; - } - - if( ( v3z ) || e13 ) - { - *pV3 = XMVector3Cross( *pV1, *pV2 ); - return true; - } - - return true; -} - -//----------------------------------------------------------------------------- -inline bool CalculateEigenVectorsFromCovarianceMatrix( _In_ float Cxx, _In_ float Cyy, _In_ float Czz, - _In_ float Cxy, _In_ float Cxz, _In_ float Cyz, - _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) -{ - // Calculate the eigenvalues by solving a cubic equation. 
-    float e = -( Cxx + Cyy + Czz );
-    float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz;
-    float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz;
-
-    float ev1, ev2, ev3;
-    if( !DirectX::Internal::SolveCubic( e, f, g, &ev1, &ev2, &ev3 ) )
-    {
-        // set them to arbitrary orthonormal basis set
-        *pV1 = g_XMIdentityR0.v;
-        *pV2 = g_XMIdentityR1.v;
-        *pV3 = g_XMIdentityR2.v;
-        return false;
-    }
-
-    return DirectX::Internal::CalculateEigenVectors( Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3 );
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectTrianglePlane( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane,
-                                                    XMVECTOR& Outside, XMVECTOR& Inside )
-{
-    // Plane0
-    XMVECTOR Dist0 = XMVector4Dot( V0, Plane );
-    XMVECTOR Dist1 = XMVector4Dot( V1, Plane );
-    XMVECTOR Dist2 = XMVector4Dot( V2, Plane );
-
-    XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
-    MinDist = XMVectorMin( MinDist, Dist2 );
-
-    XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
-    MaxDist = XMVectorMax( MaxDist, Dist2 );
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Outside the plane?
-    Outside = XMVectorGreater( MinDist, Zero );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( MaxDist, Zero );
-}
-
-//-----------------------------------------------------------------------------
-inline void FastIntersectSpherePlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane,
-                                      _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    XMVECTOR Dist = XMVector4Dot( Center, Plane );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Dist, Radius );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Dist, -Radius );
-}
-
-//-----------------------------------------------------------------------------
-inline void FastIntersectAxisAlignedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane,
-                                              _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    // Compute the distance to the center of the box.
-    XMVECTOR Dist = XMVector4Dot( Center, Plane );
-
-    // Project the axes of the box onto the normal of the plane.  Half the
-    // length of the projection (sometimes called the "radius") is equal to
-    // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-    // where h(i) are extents of the box, n is the plane normal, and b(i) are the
-    // axes of the box.  In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)].
-    XMVECTOR Radius = XMVector3Dot( Extents, XMVectorAbs( Plane ) );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Dist, Radius );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Dist, -Radius );
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectOrientedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, _In_ GXMVECTOR Axis1,
-                                                       _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    // Compute the distance to the center of the box.
-    XMVECTOR Dist = XMVector4Dot( Center, Plane );
-
-    // Project the axes of the box onto the normal of the plane.  Half the
-    // length of the projection (sometimes called the "radius") is equal to
-    // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-    // where h(i) are extents of the box, n is the plane normal, and b(i) are the
-    // axes of the box.
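-    // The three dot products below are packed into the x, y and z lanes of a
-    // single vector so that one XMVector3Dot against the absolute extents
-    // evaluates the whole h(u)*|n.b(u)| + h(v)*|n.b(v)| + h(w)*|n.b(w)| sum.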
-    XMVECTOR Radius = XMVector3Dot( Plane, Axis0 );
-    Radius = XMVectorInsert<0, 0, 1, 0, 0>( Radius, XMVector3Dot( Plane, Axis1 ) );
-    Radius = XMVectorInsert<0, 0, 0, 1, 0>( Radius, XMVector3Dot( Plane, Axis2 ) );
-    Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Dist, Radius );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Dist, -Radius );
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectFrustumPlane( _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, _In_ GXMVECTOR Point3,
-                                                   _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7,
-                                                   _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    // Find the min/max projection of the frustum onto the plane normal.
-    XMVECTOR Min, Max, Dist;
-
-    Min = Max = XMVector3Dot( Plane, Point0 );
-
-    Dist = XMVector3Dot( Plane, Point1 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point2 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point3 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point4 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point5 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point6 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point7 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    XMVECTOR PlaneDist = -XMVectorSplatW( Plane );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Min, PlaneDist );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Max, PlaneDist );
-}
-
-}; // namespace Internal
-
-
-/****************************************************************************
- *
- * BoundingSphere
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform a sphere by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, FXMMATRIX M ) const
-{
-    // Load the center of the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-
-    // Transform the center of the sphere.
-    XMVECTOR C = XMVector3Transform( vCenter, M );
-
-    XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] );
-    XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] );
-    XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] );
-
-    XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) );
-
-    // Store the center of the sphere.
-    XMStoreFloat3( &Out.Center, C );
-
-    // Scale the radius of the sphere.
-    float Scale = sqrtf( XMVectorGetX(d) );
-    Out.Radius = Radius * Scale;
-}
-
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
-{
-    // Load the center of the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-
-    // Transform the center of the sphere.
-    vCenter = XMVector3Rotate( vCenter * XMVectorReplicate( Scale ), Rotation ) + Translation;
-
-    // Store the center of the sphere.
-    XMStoreFloat3( &Out.Center, vCenter );
-
-    // Scale the radius of the sphere.
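-    // (This overload assumes a uniform scale; for non-uniform scaling use the
-    // FXMMATRIX overload above, which conservatively picks the largest axis
-    // scale of the matrix.)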
- Out.Radius = Radius * Scale; -} - - -//----------------------------------------------------------------------------- -// Point in sphere test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR Point ) const -{ - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - - XMVECTOR DistanceSquared = XMVector3LengthSq( Point - vCenter ); - XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); - - return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ) ? CONTAINS : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Triangle in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - if ( !Intersects(V0,V1,V2) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); - - XMVECTOR DistanceSquared = XMVector3LengthSq( V0 - vCenter ); - XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); - - DistanceSquared = XMVector3LengthSq( V1 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); - - DistanceSquared = XMVector3LengthSq( V2 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere in sphere test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingSphere& sh ) const -{ - XMVECTOR Center1 = XMLoadFloat3( &Center ); - float r1 = Radius; - - XMVECTOR Center2 = XMLoadFloat3( &sh.Center ); - float r2 = sh.Radius; - - XMVECTOR V = XMVectorSubtract( Center2, Center1 ); - - XMVECTOR Dist = XMVector3Length( V ); - - float d = XMVectorGetX( Dist ); - - return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Axis-aligned box in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingBox& box ) const -{ - if ( !box.Intersects(*this) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSq = vRadius * vRadius; - - XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); - XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); - - XMVECTOR InsideAll = XMVectorTrueInt(); - - XMVECTOR offset = boxCenter - vCenter; - - for( size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVectorMultiplyAdd( boxExtents, g_BoxOffset[i], offset ); - XMVECTOR d = XMVector3LengthSq( C ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); - } - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Oriented box in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingOrientedBox& box ) const -{ - if ( !box.Intersects(*this) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSq = vRadius * vRadius; - - XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); - XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); - XMVECTOR boxOrientation = XMLoadFloat4( &box.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( boxOrientation ) ); - - XMVECTOR InsideAll = XMVectorTrueInt(); - - for( size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( boxExtents * g_BoxOffset[i], boxOrientation ) + boxCenter; - XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); - } - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; - -} - - -//----------------------------------------------------------------------------- -// Frustum in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingFrustum& fr ) const -{ - if ( !fr.Intersects(*this) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSq = vRadius * vRadius; - - XMVECTOR vOrigin = XMLoadFloat3( &fr.Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &fr.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Build the corners of the frustum. - XMVECTOR vRightTop = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); - XMVECTOR vRightBottom = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); - XMVECTOR vLeftTop = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f ); - XMVECTOR vLeftBottom = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); - XMVECTOR vNear = XMVectorReplicatePtr( &fr.Near ); - XMVECTOR vFar = XMVectorReplicatePtr( &fr.Far ); - - XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; - Corners[0] = vRightTop * vNear; - Corners[1] = vRightBottom * vNear; - Corners[2] = vLeftTop * vNear; - Corners[3] = vLeftBottom * vNear; - Corners[4] = vRightTop * vFar; - Corners[5] = vRightBottom * vFar; - Corners[6] = vLeftTop * vFar; - Corners[7] = vLeftBottom * vFar; - - XMVECTOR InsideAll = XMVectorTrueInt(); - for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( Corners[i], vOrientation ) + vOrigin; - XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); - } - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere vs. sphere test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingSphere::Intersects( const BoundingSphere& sh ) const -{ - // Load A. - XMVECTOR vCenterA = XMLoadFloat3( &Center ); - XMVECTOR vRadiusA = XMVectorReplicatePtr( &Radius ); - - // Load B. 
-    XMVECTOR vCenterB = XMLoadFloat3( &sh.Center );
-    XMVECTOR vRadiusB = XMVectorReplicatePtr( &sh.Radius );
-
-    // Distance squared between centers.
-    XMVECTOR Delta = vCenterB - vCenterA;
-    XMVECTOR DistanceSquared = XMVector3LengthSq( Delta );
-
-    // Sum of the radii squared.
-    XMVECTOR RadiusSquared = XMVectorAdd( vRadiusA, vRadiusB );
-    RadiusSquared = XMVectorMultiply( RadiusSquared, RadiusSquared );
-
-    return XMVector3LessOrEqual( DistanceSquared, RadiusSquared );
-}
-
-
-//-----------------------------------------------------------------------------
-// Box vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingSphere::Intersects( const BoundingBox& box ) const
-{
-    return box.Intersects( *this );
-}
-
-_Use_decl_annotations_
-inline bool BoundingSphere::Intersects( const BoundingOrientedBox& box ) const
-{
-    return box.Intersects( *this );
-}
-
-
-//-----------------------------------------------------------------------------
-// Frustum vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingSphere::Intersects( const BoundingFrustum& fr ) const
-{
-    return fr.Intersects( *this );
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
-
-    // Compute the plane of the triangle (has to be normalized).
-    XMVECTOR N = XMVector3Normalize( XMVector3Cross( V1 - V0, V2 - V0 ) );
-
-    // Assert that the triangle is not degenerate.
-    assert( !XMVector3Equal( N, XMVectorZero() ) );
-
-    // Find the nearest feature on the triangle to the sphere.
-    XMVECTOR Dist = XMVector3Dot( vCenter - V0, N );
-
-    // If the center of the sphere is farther from the plane of the triangle than
-    // the radius of the sphere, then there cannot be an intersection.
-    XMVECTOR NoIntersection = XMVectorLess( Dist, -vRadius );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Dist, vRadius ) );
-
-    // Project the center of the sphere onto the plane of the triangle.
-    XMVECTOR Point = vCenter - ( N * Dist );
-
-    // Is it inside all the edges? If so we intersect because the distance
-    // to the plane is less than the radius.
-    XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle( Point, V0, V1, V2 );
-
-    // Find the nearest point on each edge.
-    XMVECTOR RadiusSq = vRadius * vRadius;
-
-    // Edge 0,1
-    Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V0, V1, vCenter );
-
-    // If the distance from the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
-
-    // Edge 1,2
-    Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V1, V2, vCenter );
-
-    // If the distance from the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
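-    // (PointOnLineSegmentNearestPoint clamps the projection of the center to
-    // the segment endpoints, so these edge tests also cover the case where
-    // the sphere only touches a triangle vertex.)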
-    Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
-
-    // Edge 2,0
-    Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V2, V0, vCenter );
-
-    // If the distance from the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
-
-    return XMVector4EqualInt( XMVectorAndCInt( Intersection, NoIntersection ), XMVectorTrueInt() );
-}
-
-
-//-----------------------------------------------------------------------------
-// Sphere-plane intersection
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Plane ) const
-{
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane, Outside, Inside );
-
-    // If the sphere is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the sphere is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The sphere is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with a sphere.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
-
-    // l is the vector from the ray origin to the center of the sphere.
-    XMVECTOR l = vCenter - Origin;
-
-    // s is the projection of l onto the ray direction.
-    XMVECTOR s = XMVector3Dot( l, Direction );
-
-    XMVECTOR l2 = XMVector3Dot( l, l );
-
-    XMVECTOR r2 = vRadius * vRadius;
-
-    // m2 is the squared distance from the center of the sphere to the projection.
-    XMVECTOR m2 = l2 - s * s;
-
-    XMVECTOR NoIntersection;
-
-    // If the ray origin is outside the sphere and the center of the sphere is
-    // behind the ray origin there is no intersection.
-    NoIntersection = XMVectorAndInt( XMVectorLess( s, XMVectorZero() ), XMVectorGreater( l2, r2 ) );
-
-    // If the squared distance from the center of the sphere to the projection
-    // is greater than the radius squared the ray will miss the sphere.
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( m2, r2 ) );
-
-    // The ray hits the sphere, compute the nearest intersection point.
-    XMVECTOR q = XMVectorSqrt( r2 - m2 );
-    XMVECTOR t1 = s - q;
-    XMVECTOR t2 = s + q;
-
-    XMVECTOR OriginInside = XMVectorLessOrEqual( l2, r2 );
-    XMVECTOR t = XMVectorSelect( t1, t2, OriginInside );
-
-    if( XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) )
-    {
-        // Store the x-component to Dist.
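-        // (t1 = s - q is the near hit; when the ray starts inside the sphere
-        // t1 is non-positive, so the far hit t2 is selected instead.)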
- XMStoreFloat( &Dist, t ); - return true; - } - - Dist = 0.f; - return false; -} - - -//----------------------------------------------------------------------------- -// Test a sphere vs 6 planes (typically forming a frustum). -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, - GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const -{ - // Load the sphere. - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - - // Set w of the center to one so we can dot4 with a plane. - vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); - - XMVECTOR Outside, Inside; - - // Test against each plane. - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane1, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane2, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane3, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane4, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane5, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the sphere is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the sphere is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The sphere is not inside all planes or outside a plane, it may intersect. 
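-    // (Note this plane-only test is conservative: a sphere that straddles two
-    // planes near an edge of the volume can be reported as INTERSECTS even
-    // though it is actually outside.)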
-    return INTERSECTS;
-}
-
-
-//-----------------------------------------------------------------------------
-// Creates a bounding sphere that contains two other bounding spheres
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateMerged( BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2 )
-{
-    XMVECTOR Center1 = XMLoadFloat3( &S1.Center );
-    float r1 = S1.Radius;
-
-    XMVECTOR Center2 = XMLoadFloat3( &S2.Center );
-    float r2 = S2.Radius;
-
-    XMVECTOR V = XMVectorSubtract( Center2, Center1 );
-
-    XMVECTOR Dist = XMVector3Length( V );
-
-    float d = XMVectorGetX(Dist);
-
-    if ( r1 + r2 >= d )
-    {
-        if ( r1 - r2 >= d )
-        {
-            Out = S1;
-            return;
-        }
-        else if ( r2 - r1 >= d )
-        {
-            Out = S2;
-            return;
-        }
-    }
-
-    XMVECTOR N = XMVectorDivide( V, Dist );
-
-    float t1 = XMMin( -r1, d-r2 );
-    float t2 = XMMax( r1, d+r2 );
-    float t_5 = (t2 - t1) * 0.5f;
-
-    XMVECTOR NCenter = XMVectorAdd( Center1, XMVectorMultiply( N, XMVectorReplicate( t_5 + t1 ) ) );
-
-    XMStoreFloat3( &Out.Center, NCenter );
-    Out.Radius = t_5;
-}
-
-
-//-----------------------------------------------------------------------------
-// Create a sphere circumscribing the bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingBox& box )
-{
-    Out.Center = box.Center;
-    XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
-    Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
-}
-
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingOrientedBox& box )
-{
-    // Bounding box orientation is irrelevant because a sphere is rotationally invariant
-    Out.Center = box.Center;
-    XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
-    Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
-}
-
-
-//-----------------------------------------------------------------------------
-// Find the approximate smallest enclosing bounding sphere for a set of
-// points.  Exact computation of the smallest enclosing bounding sphere is
-// possible but is slower and requires a more complex algorithm.
-// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
-// Graphics Gems.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromPoints( BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
-{
-    assert( Count > 0 );
-    assert( pPoints );
-
-    // Find the points with minimum and maximum x, y, and z
-    XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
-
-    MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3( pPoints );
-
-    for( size_t i = 1; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        float px = XMVectorGetX( Point );
-        float py = XMVectorGetY( Point );
-        float pz = XMVectorGetZ( Point );
-
-        if( px < XMVectorGetX( MinX ) )
-            MinX = Point;
-
-        if( px > XMVectorGetX( MaxX ) )
-            MaxX = Point;
-
-        if( py < XMVectorGetY( MinY ) )
-            MinY = Point;
-
-        if( py > XMVectorGetY( MaxY ) )
-            MaxY = Point;
-
-        if( pz < XMVectorGetZ( MinZ ) )
-            MinZ = Point;
-
-        if( pz > XMVectorGetZ( MaxZ ) )
-            MaxZ = Point;
-    }
-
-    // Use the min/max pair that are farthest apart to form the initial sphere.
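-    // Ritter's second pass then grows the sphere as needed: for a point at
-    // distance dist > radius from the center, the new sphere spans from the
-    // far side of the old sphere to that point, i.e.
-    //
-    //    radius' = ( radius + dist ) / 2
-    //    center' = center + ( 1 - radius' / dist ) * ( point - center )
-    //
-    // which is exactly the update applied in the second loop below.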
-    XMVECTOR DeltaX = MaxX - MinX;
-    XMVECTOR DistX = XMVector3Length( DeltaX );
-
-    XMVECTOR DeltaY = MaxY - MinY;
-    XMVECTOR DistY = XMVector3Length( DeltaY );
-
-    XMVECTOR DeltaZ = MaxZ - MinZ;
-    XMVECTOR DistZ = XMVector3Length( DeltaZ );
-
-    XMVECTOR vCenter;
-    XMVECTOR vRadius;
-
-    if( XMVector3Greater( DistX, DistY ) )
-    {
-        if( XMVector3Greater( DistX, DistZ ) )
-        {
-            // Use min/max x.
-            vCenter = XMVectorLerp(MaxX,MinX,0.5f);
-            vRadius = DistX * 0.5f;
-        }
-        else
-        {
-            // Use min/max z.
-            vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
-            vRadius = DistZ * 0.5f;
-        }
-    }
-    else // Y >= X
-    {
-        if( XMVector3Greater( DistY, DistZ ) )
-        {
-            // Use min/max y.
-            vCenter = XMVectorLerp(MaxY,MinY,0.5f);
-            vRadius = DistY * 0.5f;
-        }
-        else
-        {
-            // Use min/max z.
-            vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
-            vRadius = DistZ * 0.5f;
-        }
-    }
-
-    // Add any points not inside the sphere.
-    for( size_t i = 0; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        XMVECTOR Delta = Point - vCenter;
-
-        XMVECTOR Dist = XMVector3Length( Delta );
-
-        if( XMVector3Greater( Dist, vRadius ) )
-        {
-            // Adjust sphere to include the new point.
-            vRadius = ( vRadius + Dist ) * 0.5f;
-            vCenter += ( XMVectorReplicate( 1.0f ) - XMVectorDivide(vRadius,Dist) ) * Delta;
-        }
-    }
-
-    XMStoreFloat3( &Out.Center, vCenter );
-    XMStoreFloat( &Out.Radius, vRadius );
-}
-
-
-//-----------------------------------------------------------------------------
-// Create sphere containing frustum
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromFrustum( BoundingSphere& Out, const BoundingFrustum& fr )
-{
-    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
-    fr.GetCorners( Corners );
-    CreateFromPoints( Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3) );
-}
-
-
-/****************************************************************************
- *
- * BoundingBox
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform an axis aligned box by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, FXMMATRIX M ) const
-{
-    // Load center and extents.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Compute and transform the corners and find new min/max bounds.
-    XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter );
-    Corner = XMVector3Transform( Corner, M );
-
-    XMVECTOR Min, Max;
-    Min = Max = Corner;
-
-    for( size_t i = 1; i < CORNER_COUNT; ++i )
-    {
-        Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter );
-        Corner = XMVector3Transform( Corner, M );
-
-        Min = XMVectorMin( Min, Corner );
-        Max = XMVectorMax( Max, Corner );
-    }
-
-    // Store center and extents.
-    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
-    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
-}
-
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
-{
-    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
-
-    // Load center and extents.
- XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR VectorScale = XMVectorReplicate( Scale ); - - // Compute and transform the corners and find new min/max bounds. - XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); - Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; - - XMVECTOR Min, Max; - Min = Max = Corner; - - for( size_t i = 1; i < CORNER_COUNT; ++i ) - { - Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); - Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; - - Min = XMVectorMin( Min, Corner ); - Max = XMVectorMax( Max, Corner ); - } - - // Store center and extents. - XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); - XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); -} - - -//----------------------------------------------------------------------------- -// Get the corner points of the box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingBox::GetCorners( XMFLOAT3* Corners ) const -{ - assert( Corners != nullptr ); - - // Load the box - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - for( size_t i = 0; i < CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); - XMStoreFloat3( &Corners[i], C ); - } -} - - -//----------------------------------------------------------------------------- -// Point in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR Point ) const -{ - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - return XMVector3InBounds( Point - vCenter, vExtents ) ? CONTAINS : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Triangle in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - if ( !Intersects(V0,V1,V2) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR d = XMVectorAbs( V0 - vCenter ); - XMVECTOR Inside = XMVectorLessOrEqual( d, vExtents ); - - d = XMVectorAbs( V1 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - - d = XMVectorAbs( V2 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - - XMVECTOR BoxMin = BoxCenter - BoxExtents; - XMVECTOR BoxMax = BoxCenter + BoxExtents; - - // Find the distance to the nearest point on the box. 
- // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. - XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); - XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); - - XMVECTOR MinDelta = SphereCenter - BoxMin; - XMVECTOR MaxDelta = SphereCenter - BoxMax; - - // Choose value for each dimension based on the comparison. - d = XMVectorSelect( d, MinDelta, LessThanMin ); - d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); - - // Use a dot-product to square them and sum them together. - XMVECTOR d2 = XMVector3Dot( d, d ); - - if ( XMVector3Greater( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ) - return DISJOINT; - - XMVECTOR InsideAll = XMVectorLessOrEqual( BoxMin + SphereRadius, SphereCenter ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( SphereCenter, BoxMax - SphereRadius ) ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorGreater( BoxMax - BoxMin, SphereRadius ) ); - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Axis-aligned box in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingBox& box ) const -{ - XMVECTOR CenterA = XMLoadFloat3( &Center ); - XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); - - XMVECTOR CenterB = XMLoadFloat3( &box.Center ); - XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); - - XMVECTOR MinA = CenterA - ExtentsA; - XMVECTOR MaxA = CenterA + ExtentsA; - - XMVECTOR MinB = CenterB - ExtentsB; - XMVECTOR MaxB = CenterB + ExtentsB; - - // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false - XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); - - if ( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) - return DISJOINT; - - // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B - XMVECTOR Inside = XMVectorAndInt( XMVectorLessOrEqual( MinA, MinB ), XMVectorLessOrEqual( MaxB, MaxA ) ); - - return DirectX::Internal::XMVector3AllTrue( Inside ) ? 
CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Oriented box in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingOrientedBox& box ) const -{ - if ( !box.Intersects( *this ) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - // Subtract off the AABB center to remove a subtract below - XMVECTOR oCenter = XMLoadFloat3( &box.Center ) - vCenter; - - XMVECTOR oExtents = XMLoadFloat3( &box.Extents ); - XMVECTOR oOrientation = XMLoadFloat4( &box.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( oOrientation ) ); - - XMVECTOR Inside = XMVectorTrueInt(); - - for( size_t i=0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( oExtents * g_BoxOffset[i], oOrientation ) + oCenter; - XMVECTOR d = XMVectorAbs(C); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - } - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Frustum in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingFrustum& fr ) const -{ - if ( !fr.Intersects( *this ) ) - return DISJOINT; - - XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; - fr.GetCorners( Corners ); - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR Inside = XMVectorTrueInt(); - - for( size_t i=0; i < BoundingFrustum::CORNER_COUNT; ++i ) - { - XMVECTOR Point = XMLoadFloat3( &Corners[i] ); - XMVECTOR d = XMVectorAbs( Point - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - } - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere vs axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - - XMVECTOR BoxMin = BoxCenter - BoxExtents; - XMVECTOR BoxMax = BoxCenter + BoxExtents; - - // Find the distance to the nearest point on the box. - // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. - XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); - XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); - - XMVECTOR MinDelta = SphereCenter - BoxMin; - XMVECTOR MaxDelta = SphereCenter - BoxMax; - - // Choose value for each dimension based on the comparison. - d = XMVectorSelect( d, MinDelta, LessThanMin ); - d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); - - // Use a dot-product to square them and sum them together. 
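-    // (d holds the per-axis amount by which the sphere center overshoots the
-    // box, so d . d is the squared distance from the center to the closest
-    // point on the box; this is the classic Arvo sphere/box test.)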
- XMVECTOR d2 = XMVector3Dot( d, d ); - - return XMVector3LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ); -} - - -//----------------------------------------------------------------------------- -// Axis-aligned box vs. axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingBox& box ) const -{ - XMVECTOR CenterA = XMLoadFloat3( &Center ); - XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); - - XMVECTOR CenterB = XMLoadFloat3( &box.Center ); - XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); - - XMVECTOR MinA = CenterA - ExtentsA; - XMVECTOR MaxA = CenterA + ExtentsA; - - XMVECTOR MinB = CenterB - ExtentsB; - XMVECTOR MaxB = CenterB + ExtentsB; - - // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false - XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); - - return !DirectX::Internal::XMVector3AnyTrue( Disjoint ); -} - - -//----------------------------------------------------------------------------- -// Oriented box vs. axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingOrientedBox& box ) const -{ - return box.Intersects( *this ); -} - - -//----------------------------------------------------------------------------- -// Frustum vs. axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingFrustum& fr ) const -{ - return fr.Intersects( *this ); -} - - -//----------------------------------------------------------------------------- -// Triangle vs. axis aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - XMVECTOR Zero = XMVectorZero(); - - // Load the box. - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR BoxMin = vCenter - vExtents; - XMVECTOR BoxMax = vCenter + vExtents; - - // Test the axes of the box (in effect test the AAB against the minimal AAB - // around the triangle). - XMVECTOR TriMin = XMVectorMin( XMVectorMin( V0, V1 ), V2 ); - XMVECTOR TriMax = XMVectorMax( XMVectorMax( V0, V1 ), V2 ); - - // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint - XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( TriMin, BoxMax ), XMVectorGreater( BoxMin, TriMax ) ); - if( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) - return false; - - // Test the plane of the triangle. - XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); - XMVECTOR Dist = XMVector3Dot( Normal, V0 ); - - // Assert that the triangle is not degenerate. 
-    assert( !XMVector3Equal( Normal, Zero ) );
-
-    // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
-    // else v_min(i)=b_max(i), v_max(i)=b_min(i)
-    XMVECTOR NormalSelect = XMVectorGreater( Normal, Zero );
-    XMVECTOR V_Min = XMVectorSelect( BoxMax, BoxMin, NormalSelect );
-    XMVECTOR V_Max = XMVectorSelect( BoxMin, BoxMax, NormalSelect );
-
-    // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
-    XMVECTOR MinDist = XMVector3Dot( V_Min, Normal );
-    XMVECTOR MaxDist = XMVector3Dot( V_Max, Normal );
-
-    XMVECTOR NoIntersection = XMVectorGreater( MinDist, Dist );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( MaxDist, Dist ) );
-
-    // Move the box center to zero to simplify the following tests.
-    XMVECTOR TV0 = V0 - vCenter;
-    XMVECTOR TV1 = V1 - vCenter;
-    XMVECTOR TV2 = V2 - vCenter;
-
-    // Test the edge/edge axes (3*3).
-    XMVECTOR e0 = TV1 - TV0;
-    XMVECTOR e1 = TV2 - TV1;
-    XMVECTOR e2 = TV0 - TV2;
-
-    // Make w zero.
-    e0 = XMVectorInsert<0, 0, 0, 0, 1>( e0, Zero );
-    e1 = XMVectorInsert<0, 0, 0, 0, 1>( e1, Zero );
-    e2 = XMVectorInsert<0, 0, 0, 0, 1>( e2, Zero );
-
-    XMVECTOR Axis;
-    XMVECTOR p0, p1, p2;
-    XMVECTOR Min, Max;
-    XMVECTOR Radius;
-
-    // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e0, -e0 );
-    p0 = XMVector3Dot( TV0, Axis );
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot( TV2, Axis );
-    Min = XMVectorMin( p0, p2 );
-    Max = XMVectorMax( p0, p2 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e1, -e1 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e2, -e2 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e0, -e0 );
-    p0 = XMVector3Dot( TV0, Axis );
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot( TV2, Axis );
-    Min = XMVectorMin( p0, p2 );
-    Max = XMVectorMax( p0, p2 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e1, -e1 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
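-    // (Each of the nine edge cross-product axes follows the same pattern:
-    // project the triangle onto the axis, skipping the duplicate projection
-    // since two of p0, p1, p2 always coincide, and compare the interval
-    // [Min, Max] against the box projection Radius = Extents . |Axis|.)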
-    // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e2, -e2 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e0, -e0 );
-    p0 = XMVector3Dot( TV0, Axis );
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot( TV2, Axis );
-    Min = XMVectorMin( p0, p2 );
-    Max = XMVectorMax( p0, p2 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e1, -e1 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e2, -e2 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() );
-}
-
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Plane ) const
-{
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane, Outside, Inside );
-
-    // If the box is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the box is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The box is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with an axis aligned
-// box using the slabs method.
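-// For each coordinate axis the box bounds a slab; per component the ray
-// enters at t1 = ( c - e - o ) / d and leaves at t2 = ( c + e - o ) / d.
-// The ray hits the box iff the three [min,max] intervals overlap, i.e.
-// max( t_min ) <= min( t_max ) with min( t_max ) >= 0.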
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Adjust ray origin to be relative to center of the box.
-    XMVECTOR TOrigin = vCenter - Origin;
-
-    // Compute the dot product against each axis of the box.
-    // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
-    XMVECTOR AxisDotOrigin = TOrigin;
-    XMVECTOR AxisDotDirection = Direction;
-
-    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
-    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
-
-    // Test against all three axes simultaneously.
-    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
-    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
-    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
-
-    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
-    // use the results from any directions parallel to the slab.
-    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
-    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
-
-    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
-    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
-    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
-    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
-    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
-    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
-
-    // if ( t_min > t_max ) return false;
-    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
-
-    // if ( t_max < 0.0f ) return false;
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
-
-    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
-    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
-
-    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
-    {
-        // Store the x-component to Dist
-        XMStoreFloat( &Dist, t_min );
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test an axis aligned box vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                                                             GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
-{
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
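-    // (The six planes are assumed to have outward-facing normals, so a dot4
-    // distance greater than the box "radius" places the box fully in the
-    // outside half-space of that plane.)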
- DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane1, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane2, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane3, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane4, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane5, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the box is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the box is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The box is not inside all planes or outside a plane, it may intersect. - return INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Create axis-aligned box that contains two other bounding boxes -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingBox::CreateMerged( BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2 ) -{ - XMVECTOR b1Center = XMLoadFloat3( &b1.Center ); - XMVECTOR b1Extents = XMLoadFloat3( &b1.Extents ); - - XMVECTOR b2Center = XMLoadFloat3( &b2.Center ); - XMVECTOR b2Extents = XMLoadFloat3( &b2.Extents ); - - XMVECTOR Min = XMVectorSubtract( b1Center, b1Extents ); - Min = XMVectorMin( Min, XMVectorSubtract( b2Center, b2Extents ) ); - - XMVECTOR Max = XMVectorAdd( b1Center, b1Extents ); - Max = XMVectorMax( Max, XMVectorAdd( b2Center, b2Extents ) ); - - assert( XMVector3LessOrEqual( Min, Max ) ); - - XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); - XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); -} - - -//----------------------------------------------------------------------------- -// Create axis-aligned box that contains a bounding sphere -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingBox::CreateFromSphere( BoundingBox& Out, const BoundingSphere& sh ) -{ - XMVECTOR spCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR shRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR Min = XMVectorSubtract( spCenter, shRadius ); - XMVECTOR Max = XMVectorAdd( spCenter, shRadius ); - - assert( XMVector3LessOrEqual( Min, Max ) ); - - XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); - XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); -} - - -//----------------------------------------------------------------------------- -// Create axis-aligned box from min/max points -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void XM_CALLCONV 
BoundingBox::CreateFromPoints( BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2 )
-{
-    XMVECTOR Min = XMVectorMin( pt1, pt2 );
-    XMVECTOR Max = XMVectorMax( pt1, pt2 );
-
-    // Store center and extents.
-    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
-    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
-}
-
-
-//-----------------------------------------------------------------------------
-// Find the minimum axis aligned bounding box containing a set of points.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingBox::CreateFromPoints( BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
-{
-    assert( Count > 0 );
-    assert( pPoints );
-
-    // Find the minimum and maximum x, y, and z
-    XMVECTOR vMin, vMax;
-
-    vMin = vMax = XMLoadFloat3( pPoints );
-
-    for( size_t i = 1; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        vMin = XMVectorMin( vMin, Point );
-        vMax = XMVectorMax( vMax, Point );
-    }
-
-    // Store center and extents.
-    XMStoreFloat3( &Out.Center, ( vMin + vMax ) * 0.5f );
-    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
-}
-
-
-/****************************************************************************
- *
- * BoundingOrientedBox
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform an oriented box by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, FXMMATRIX M ) const
-{
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Composite the box rotation and the transform rotation.
-    XMMATRIX nM;
-    nM.r[0] = XMVector3Normalize( M.r[0] );
-    nM.r[1] = XMVector3Normalize( M.r[1] );
-    nM.r[2] = XMVector3Normalize( M.r[2] );
-    nM.r[3] = g_XMIdentityR3;
-    XMVECTOR Rotation = XMQuaternionRotationMatrix( nM );
-    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );
-
-    // Transform the center.
-    vCenter = XMVector3Transform( vCenter, M );
-
-    // Scale the box extents.
-    XMVECTOR dX = XMVector3Length( M.r[0] );
-    XMVECTOR dY = XMVector3Length( M.r[1] );
-    XMVECTOR dZ = XMVector3Length( M.r[2] );
-
-    XMVECTOR VectorScale = XMVectorSelect( dY, dX, g_XMSelect1000 );
-    VectorScale = XMVectorSelect( dZ, VectorScale, g_XMSelect1100 );
-    vExtents = vExtents * VectorScale;
-
-    // Store the box.
-    XMStoreFloat3( &Out.Center, vCenter );
-    XMStoreFloat3( &Out.Extents, vExtents );
-    XMStoreFloat4( &Out.Orientation, vOrientation );
-}
-
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
-{
-    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Composite the box rotation and the transform rotation.
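-    // (XMQuaternionMultiply( Q1, Q2 ) composes "rotate by Q1, then by Q2", so
-    // the box's local orientation is applied before the transform's rotation.)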
- vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); - - // Transform the center. - XMVECTOR VectorScale = XMVectorReplicate( Scale ); - vCenter = XMVector3Rotate( vCenter * VectorScale, Rotation ) + Translation; - - // Scale the box extents. - vExtents = vExtents * VectorScale; - - // Store the box. - XMStoreFloat3( &Out.Center, vCenter ); - XMStoreFloat3( &Out.Extents, vExtents ); - XMStoreFloat4( &Out.Orientation, vOrientation ); -} - - -//----------------------------------------------------------------------------- -// Get the corner points of the box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingOrientedBox::GetCorners( XMFLOAT3* Corners ) const -{ - assert( Corners != 0 ); - - // Load the box - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - for( size_t i = 0; i < CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( vExtents * g_BoxOffset[i], vOrientation ) + vCenter; - XMStoreFloat3( &Corners[i], C ); - } -} - - -//----------------------------------------------------------------------------- -// Point in oriented box test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR Point ) const -{ - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Transform the point to be local to the box. - XMVECTOR TPoint = XMVector3InverseRotate( Point - vCenter, vOrientation ); - - return XMVector3InBounds( TPoint, vExtents ) ? CONTAINS : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Triangle in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - // Load the box center & orientation. - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Transform the triangle vertices into the space of the box. - XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation ); - XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation ); - XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation ); - - BoundingBox box; - box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); - box.Extents = Extents; - - // Use the triangle vs axis aligned box intersection routine. 
- return box.Contains( TV0, TV1, TV2 ); -} - - -//----------------------------------------------------------------------------- -// Sphere in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); - - // Transform the center of the sphere to be local to the box. - // BoxMin = -BoxExtents - // BoxMax = +BoxExtents - SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); - - // Find the distance to the nearest point on the box. - // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. - XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents ); - XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents ); - - XMVECTOR MinDelta = SphereCenter + BoxExtents; - XMVECTOR MaxDelta = SphereCenter - BoxExtents; - - // Choose value for each dimension based on the comparison. - d = XMVectorSelect( d, MinDelta, LessThanMin ); - d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); - - // Use a dot-product to square them and sum them together. - XMVECTOR d2 = XMVector3Dot( d, d ); - XMVECTOR SphereRadiusSq = XMVectorMultiply( SphereRadius, SphereRadius ); - - if ( XMVector4Greater( d2, SphereRadiusSq ) ) - return DISJOINT; - - // See if we are completely inside the box - XMVECTOR SMin = SphereCenter - SphereRadius; - XMVECTOR SMax = SphereCenter + SphereRadius; - - return ( XMVector3InBounds( SMin, BoxExtents ) && XMVector3InBounds( SMax, BoxExtents ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Axis aligned box vs. oriented box. Constructs an oriented box and uses -// the oriented box vs. oriented box test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingBox& box ) const -{ - // Make the axis aligned box oriented and do an OBB vs OBB test. 
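The select-based distance computation in the sphere test above is the vectorized form of the standard per-axis clamp. As a standalone scalar sketch of the same squared-distance test (the function name and layout are illustrative, not part of this header):

    #include <algorithm>
    #include <cmath>

    // Squared distance from point p to an axis-aligned box centered at the
    // origin with half-extents ext. The SIMD code above accumulates the same
    // per-axis overshoot using two compares and two selects per bound.
    inline float DistSqToCenteredBox( const float p[3], const float ext[3] )
    {
        float d2 = 0.f;
        for ( int i = 0; i < 3; ++i )
        {
            float over = std::max( std::fabs( p[i] ) - ext[i], 0.f );
            d2 += over * over;
        }
        return d2; // the sphere and box intersect iff d2 <= radius * radius
    }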
- BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); - return Contains( obox ); -} - - -//----------------------------------------------------------------------------- -// Oriented bounding box in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingOrientedBox& box ) const -{ - if ( !Intersects(box) ) - return DISJOINT; - - // Load the boxes - XMVECTOR aCenter = XMLoadFloat3( &Center ); - XMVECTOR aExtents = XMLoadFloat3( &Extents ); - XMVECTOR aOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( aOrientation ) ); - - XMVECTOR bCenter = XMLoadFloat3( &box.Center ); - XMVECTOR bExtents = XMLoadFloat3( &box.Extents ); - XMVECTOR bOrientation = XMLoadFloat4( &box.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( bOrientation ) ); - - XMVECTOR offset = bCenter - aCenter; - - for( size_t i = 0; i < CORNER_COUNT; ++i ) - { - // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter - // Ca = invrotate( Cb - aCenter, aOrientation ) - - XMVECTOR C = XMVector3Rotate( bExtents * g_BoxOffset[i], bOrientation ) + offset; - C = XMVector3InverseRotate( C , aOrientation ); - - if ( !XMVector3InBounds( C, aExtents ) ) - return INTERSECTS; - } - - return CONTAINS; -} - - -//----------------------------------------------------------------------------- -// Frustum in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingFrustum& fr ) const -{ - if ( !fr.Intersects(*this) ) - return DISJOINT; - - XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; - fr.GetCorners( Corners ); - - // Load the box - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3InverseRotate( XMLoadFloat3( &Corners[i] ) - vCenter, vOrientation ); - - if ( !XMVector3InBounds( C, vExtents ) ) - return INTERSECTS; - } - - return CONTAINS; -} - - -//----------------------------------------------------------------------------- -// Sphere vs. oriented box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingOrientedBox::Intersects( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); - - // Transform the center of the sphere to be local to the box. - // BoxMin = -BoxExtents - // BoxMax = +BoxExtents - SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); - - // Find the distance to the nearest point on the box. - // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. 
-    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents );
-    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents );
-
-    XMVECTOR MinDelta = SphereCenter + BoxExtents;
-    XMVECTOR MaxDelta = SphereCenter - BoxExtents;
-
-    // Choose value for each dimension based on the comparison.
-    d = XMVectorSelect( d, MinDelta, LessThanMin );
-    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
-
-    // Use a dot-product to square them and sum them together.
-    XMVECTOR d2 = XMVector3Dot( d, d );
-
-    return XMVector4LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ? true : false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Axis aligned box vs. oriented box. Constructs an oriented box and uses
-// the oriented box vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingOrientedBox::Intersects( const BoundingBox& box ) const
-{
-    // Make the axis aligned box oriented and do an OBB vs OBB test.
-    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
-    return Intersects( obox );
-}
-
-
-//-----------------------------------------------------------------------------
-// Fast oriented box / oriented box intersection test using the separating axis
-// theorem.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingOrientedBox::Intersects( const BoundingOrientedBox& box ) const
-{
-    // Build the 3x3 rotation matrix that defines the orientation of B relative to A.
-    XMVECTOR A_quat = XMLoadFloat4( &Orientation );
-    XMVECTOR B_quat = XMLoadFloat4( &box.Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( A_quat ) );
-    assert( DirectX::Internal::XMQuaternionIsUnit( B_quat ) );
-
-    XMVECTOR Q = XMQuaternionMultiply( A_quat, XMQuaternionConjugate( B_quat ) );
-    XMMATRIX R = XMMatrixRotationQuaternion( Q );
-
-    // Compute the translation of B relative to A.
-    XMVECTOR A_cent = XMLoadFloat3( &Center );
-    XMVECTOR B_cent = XMLoadFloat3( &box.Center );
-    XMVECTOR t = XMVector3InverseRotate( B_cent - A_cent, A_quat );
-
-    //
-    // h(A) = extents of A.
-    // h(B) = extents of B.
-    //
-    // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1)
-    // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22)
-    //
-    // For each possible separating axis l:
-    //   d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l )
-    //   d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l )
-    //   if abs( t dot l ) > d(A) + d(B) then disjoint
-    //
-
-    // Load extents of A and B.
-    XMVECTOR h_A = XMLoadFloat3( &Extents );
-    XMVECTOR h_B = XMLoadFloat3( &box.Extents );
-
-    // Rows. Note R[0,1,2]X.w = 0.
-    XMVECTOR R0X = R.r[0];
-    XMVECTOR R1X = R.r[1];
-    XMVECTOR R2X = R.r[2];
-
-    R = XMMatrixTranspose( R );
-
-    // Columns. Note RX[0,1,2].w = 0.
-    XMVECTOR RX0 = R.r[0];
-    XMVECTOR RX1 = R.r[1];
-    XMVECTOR RX2 = R.r[2];
-
-    // Absolute value of rows.
-    XMVECTOR AR0X = XMVectorAbs( R0X );
-    XMVECTOR AR1X = XMVectorAbs( R1X );
-    XMVECTOR AR2X = XMVectorAbs( R2X );
-
-    // Absolute value of columns.
-    XMVECTOR ARX0 = XMVectorAbs( RX0 );
-    XMVECTOR ARX1 = XMVectorAbs( RX1 );
-    XMVECTOR ARX2 = XMVectorAbs( RX2 );
-
-    // Test each of the 15 possible separating axes.
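Each of the fifteen vectorized blocks that follow instantiates the same scalar comparison for one specific candidate axis l. A standalone sketch of that comparison (names illustrative, not part of the library):

    #include <cmath>

    // One separating-axis test: the boxes are disjoint on axis l when the
    // projected distance between centers exceeds the sum of the projected
    // half-widths d_A and d_B derived in the comments above.
    inline bool SeparatedOnAxis( const float t[3], const float l[3],
                                 float d_A, float d_B )
    {
        float dist = std::fabs( t[0] * l[0] + t[1] * l[1] + t[2] * l[2] );
        return dist > d_A + d_B;
    }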
-    XMVECTOR d, d_A, d_B;
-
-    // l = a(u) = (1, 0, 0)
-    // t dot l = t.x
-    // d(A) = h(A).x
-    // d(B) = h(B) dot abs(r00, r01, r02)
-    d = XMVectorSplatX( t );
-    d_A = XMVectorSplatX( h_A );
-    d_B = XMVector3Dot( h_B, AR0X );
-    XMVECTOR NoIntersection = XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) );
-
-    // l = a(v) = (0, 1, 0)
-    // t dot l = t.y
-    // d(A) = h(A).y
-    // d(B) = h(B) dot abs(r10, r11, r12)
-    d = XMVectorSplatY( t );
-    d_A = XMVectorSplatY( h_A );
-    d_B = XMVector3Dot( h_B, AR1X );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) = (0, 0, 1)
-    // t dot l = t.z
-    // d(A) = h(A).z
-    // d(B) = h(B) dot abs(r20, r21, r22)
-    d = XMVectorSplatZ( t );
-    d_A = XMVectorSplatZ( h_A );
-    d_B = XMVector3Dot( h_B, AR2X );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = b(u) = (r00, r10, r20)
-    // d(A) = h(A) dot abs(r00, r10, r20)
-    // d(B) = h(B).x
-    d = XMVector3Dot( t, RX0 );
-    d_A = XMVector3Dot( h_A, ARX0 );
-    d_B = XMVectorSplatX( h_B );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = b(v) = (r01, r11, r21)
-    // d(A) = h(A) dot abs(r01, r11, r21)
-    // d(B) = h(B).y
-    d = XMVector3Dot( t, RX1 );
-    d_A = XMVector3Dot( h_A, ARX1 );
-    d_B = XMVectorSplatY( h_B );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = b(w) = (r02, r12, r22)
-    // d(A) = h(A) dot abs(r02, r12, r22)
-    // d(B) = h(B).z
-    d = XMVector3Dot( t, RX2 );
-    d_A = XMVector3Dot( h_A, ARX2 );
-    d_B = XMVectorSplatZ( h_B );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(u) x b(u) = (0, -r20, r10)
-    // d(A) = h(A) dot abs(0, r20, r10)
-    // d(B) = h(B) dot abs(0, r02, r01)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX0, -RX0 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX0 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR0X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(u) x b(v) = (0, -r21, r11)
-    // d(A) = h(A) dot abs(0, r21, r11)
-    // d(B) = h(B) dot abs(r02, 0, r00)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX1, -RX1 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX1 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR0X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(u) x b(w) = (0, -r22, r12)
-    // d(A) = h(A) dot abs(0, r22, r12)
-    // d(B) = h(B) dot abs(r01, r00, 0)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX2, -RX2 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX2 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR0X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(v) x b(u) = (r20, 0, -r00)
-    // d(A) = h(A) dot abs(r20, 0, r00)
-    // d(B) = h(B) dot abs(0, r12, r11)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX0, -RX0 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX0 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR1X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(v) x b(v) = (r21, 0, -r01)
-    // d(A) = h(A) dot abs(r21, 0, r01)
-    // d(B) = h(B) dot abs(r12, 0, r10)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX1, -RX1 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX1 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR1X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(v) x b(w) = (r22, 0, -r02)
-    // d(A) = h(A) dot abs(r22, 0, r02)
-    // d(B) = h(B) dot abs(r11, r10, 0)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX2, -RX2 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX2 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR1X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) x b(u) = (-r10, r00, 0)
-    // d(A) = h(A) dot abs(r10, r00, 0)
-    // d(B) = h(B) dot abs(0, r22, r21)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX0, -RX0 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX0 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR2X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) x b(v) = (-r11, r01, 0)
-    // d(A) = h(A) dot abs(r11, r01, 0)
-    // d(B) = h(B) dot abs(r22, 0, r20)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX1, -RX1 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX1 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR2X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) x b(w) = (-r12, r02, 0)
-    // d(A) = h(A) dot abs(r12, r02, 0)
-    // d(B) = h(B) dot abs(r21, r20, 0)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX2, -RX2 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX2 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR2X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // No separating axis found; the boxes must intersect.
-    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ? true : false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Frustum vs. oriented box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingOrientedBox::Intersects( const BoundingFrustum& fr ) const
-{
-    return fr.Intersects( *this );
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Load the box center & orientation.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    // Transform the triangle vertices into the space of the box.
-    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation );
-    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation );
-    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation );
-
-    BoundingBox box;
-    box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f );
-    box.Extents = Extents;
-
-    // Use the triangle vs axis aligned box intersection routine.
-    return box.Intersects( TV0, TV1, TV2 );
-}
-
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Plane ) const
-{
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside );
-
-    // If the box is entirely in front of the plane.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the box is entirely behind the plane.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The box straddles the plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with an oriented box
-// using the slabs method.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    static const XMVECTORU32 SelectY =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
-    };
-    static const XMVECTORU32 SelectZ =
-    {
-        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
-    };
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Get the box's normalized side directions.
-    XMMATRIX R = XMMatrixRotationQuaternion( vOrientation );
-
-    // Adjust ray origin to be relative to center of the box.
-    XMVECTOR TOrigin = vCenter - Origin;
-
-    // Compute the dot product against each axis of the box.
-    XMVECTOR AxisDotOrigin = XMVector3Dot( R.r[0], TOrigin );
-    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[1], TOrigin ), SelectY );
-    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[2], TOrigin ), SelectZ );
-
-    XMVECTOR AxisDotDirection = XMVector3Dot( R.r[0], Direction );
-    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[1], Direction ), SelectY );
-    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[2], Direction ), SelectZ );
-
-    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
-    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
-
-    // Test against all three axes simultaneously.
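The vector code that follows evaluates the classic slab intervals for all three axes at once, substituting +/-FLT_MAX for slabs the ray is parallel to. A single slab in scalar form (standalone sketch; the epsilon plays the role of g_RayEpsilon, and the caller seeds tmin/tmax with -FLT_MAX/+FLT_MAX):

    #include <algorithm>
    #include <cmath>

    // One slab of the ray/box test: co is (box center - ray origin) projected
    // onto the slab axis, cd is the ray direction projected onto that axis,
    // and ext is the box half-width. Narrows [tmin, tmax] or rejects.
    inline bool ClipSlab( float co, float cd, float ext, float& tmin, float& tmax )
    {
        if ( std::fabs( cd ) <= 1e-20f )     // ray parallel to the slab:
            return std::fabs( co ) <= ext;   // origin must lie inside it
        float inv = 1.f / cd;
        float t1 = ( co - ext ) * inv;       // entry/exit distances
        float t2 = ( co + ext ) * inv;
        tmin = std::max( tmin, std::min( t1, t2 ) );
        tmax = std::min( tmax, std::max( t1, t2 ) );
        return tmin <= tmax;
    }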
-    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
-    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
-    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
-
-    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
-    // use the results from any directions parallel to the slab.
-    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
-    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
-
-    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
-    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
-    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
-    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
-    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
-    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
-
-    // if ( t_min > t_max ) return false;
-    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
-
-    // if ( t_max < 0.0f ) return false;
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
-
-    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
-    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
-
-    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
-    {
-        // Store the x-component (the nearest hit distance) to Dist.
-        XMStoreFloat( &Dist, t_min );
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test an oriented box vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                                                                     GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
-{
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
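-    // (Each FastIntersectOrientedBoxPlane call below computes Outside/Inside
-    // masks for one plane; OR-ing the Outside masks and AND-ing the Inside
-    // masks across all six planes yields the aggregate DISJOINT / CONTAINS /
-    // INTERSECTS decision at the end.)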
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside );
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    // If the box is outside any plane it is outside.
-    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
-        return DISJOINT;
-
-    // If the box is inside all planes it is inside.
-    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
-        return CONTAINS;
-
-    // The box is not inside all planes nor outside any plane, so it may intersect.
-    return INTERSECTS;
-}
-
-
-//-----------------------------------------------------------------------------
-// Create oriented bounding box from axis-aligned bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingOrientedBox::CreateFromBoundingBox( BoundingOrientedBox& Out, const BoundingBox& box )
-{
-    Out.Center = box.Center;
-    Out.Extents = box.Extents;
-    Out.Orientation = XMFLOAT4( 0.f, 0.f, 0.f, 1.f );
-}
-
-
-//-----------------------------------------------------------------------------
-// Find the approximate minimum oriented bounding box containing a set of
-// points. Exact computation of the minimum oriented bounding box is possible
-// but is slower and requires a more complex algorithm.
-// The algorithm works by computing the inertia tensor of the points and then
-// using the eigenvectors of the inertia tensor as the axes of the box.
-// Computing the inertia tensor of the convex hull of the points will usually
-// result in a better bounding box, but the computation is more complex.
-// Exact computation of the minimum oriented bounding box is possible, but the
-// best known algorithm is O(N^3) and is significantly more complex to implement.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingOrientedBox::CreateFromPoints( BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
-{
-    assert( Count > 0 );
-    assert( pPoints != 0 );
-
-    XMVECTOR CenterOfMass = XMVectorZero();
-
-    // Compute the center of mass and inertia tensor of the points.
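-    // (The two accumulators below hold the six unique entries of the
-    // symmetric 3x3 covariance matrix about the center of mass:
-    //   XX_YY_ZZ = sum of ( p.x*p.x, p.y*p.y, p.z*p.z )   -- diagonal
-    //   XY_XZ_YZ = sum of ( p.x*p.y, p.x*p.z, p.y*p.z )   -- off-diagonal
-    // with p = point - CenterOfMass; the eigenvectors of that matrix become
-    // the candidate box axes.)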
-    for( size_t i = 0; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        CenterOfMass += Point;
-    }
-
-    CenterOfMass *= XMVectorReciprocal( XMVectorReplicate( float( Count ) ) );
-
-    // Compute the inertia tensor of the points around the center of mass.
-    // Using the center of mass is not strictly necessary, but will hopefully
-    // improve the stability of finding the eigenvectors.
-    XMVECTOR XX_YY_ZZ = XMVectorZero();
-    XMVECTOR XY_XZ_YZ = XMVectorZero();
-
-    for( size_t i = 0; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ) - CenterOfMass;
-
-        XX_YY_ZZ += Point * Point;
-
-        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>( Point );
-        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>( Point );
-
-        XY_XZ_YZ += XXY * YZZ;
-    }
-
-    XMVECTOR v1, v2, v3;
-
-    // Compute the eigenvectors of the inertia tensor.
-    DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix( XMVectorGetX( XX_YY_ZZ ), XMVectorGetY( XX_YY_ZZ ),
-                                                                  XMVectorGetZ( XX_YY_ZZ ),
-                                                                  XMVectorGetX( XY_XZ_YZ ), XMVectorGetY( XY_XZ_YZ ),
-                                                                  XMVectorGetZ( XY_XZ_YZ ),
-                                                                  &v1, &v2, &v3 );
-
-    // Put them in a matrix.
-    XMMATRIX R;
-
-    R.r[0] = XMVectorSetW( v1, 0.f );
-    R.r[1] = XMVectorSetW( v2, 0.f );
-    R.r[2] = XMVectorSetW( v3, 0.f );
-    R.r[3] = g_XMIdentityR3.v;
-
-    // Multiply by -1 to convert the matrix into a right handed coordinate
-    // system (Det ~= 1) in case the eigenvectors form a left handed
-    // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
-    // works on right handed matrices.
-    XMVECTOR Det = XMMatrixDeterminant( R );
-
-    if( XMVector4Less( Det, XMVectorZero() ) )
-    {
-        R.r[0] *= g_XMNegativeOne.v;
-        R.r[1] *= g_XMNegativeOne.v;
-        R.r[2] *= g_XMNegativeOne.v;
-    }
-
-    // Get the rotation quaternion from the matrix.
-    XMVECTOR vOrientation = XMQuaternionRotationMatrix( R );
-
-    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
-    vOrientation = XMQuaternionNormalize( vOrientation );
-
-    // Rebuild the rotation matrix from the quaternion.
-    R = XMMatrixRotationQuaternion( vOrientation );
-
-    // Build the rotation into the rotated space.
-    XMMATRIX InverseR = XMMatrixTranspose( R );
-
-    // Find the minimum OBB using the eigenvectors as the axes.
-    XMVECTOR vMin, vMax;
-
-    vMin = vMax = XMVector3TransformNormal( XMLoadFloat3( pPoints ), InverseR );
-
-    for( size_t i = 1; i < Count; ++i )
-    {
-        XMVECTOR Point = XMVector3TransformNormal( XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ),
-                                                   InverseR );
-
-        vMin = XMVectorMin( vMin, Point );
-        vMax = XMVectorMax( vMax, Point );
-    }
-
-    // Rotate the center into world space.
-    XMVECTOR vCenter = ( vMin + vMax ) * 0.5f;
-    vCenter = XMVector3TransformNormal( vCenter, R );
-
-    // Store center, extents, and orientation.
-    XMStoreFloat3( &Out.Center, vCenter );
-    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
-    XMStoreFloat4( &Out.Orientation, vOrientation );
-}
-
-
-/****************************************************************************
- *
- * BoundingFrustum
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform a frustum by an angle preserving transform.
-//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, FXMMATRIX M ) const -{ - // Load the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Composite the frustum rotation and the transform rotation - XMMATRIX nM; - nM.r[0] = XMVector3Normalize( M.r[0] ); - nM.r[1] = XMVector3Normalize( M.r[1] ); - nM.r[2] = XMVector3Normalize( M.r[2] ); - nM.r[3] = g_XMIdentityR3; - XMVECTOR Rotation = XMQuaternionRotationMatrix( nM ); - vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); - - // Transform the center. - vOrigin = XMVector3Transform( vOrigin, M ); - - // Store the frustum. - XMStoreFloat3( &Out.Origin, vOrigin ); - XMStoreFloat4( &Out.Orientation, vOrientation ); - - // Scale the near and far distances (the slopes remain the same). - XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); - XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] ); - XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] ); - - XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) ); - float Scale = sqrtf( XMVectorGetX(d) ); - - Out.Near = Near * Scale; - Out.Far = Far * Scale; - - // Copy the slopes. - Out.RightSlope = RightSlope; - Out.LeftSlope = LeftSlope; - Out.TopSlope = TopSlope; - Out.BottomSlope = BottomSlope; -} - -_Use_decl_annotations_ -inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const -{ - assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); - - // Load the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Composite the frustum rotation and the transform rotation. - vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); - - // Transform the origin. - vOrigin = XMVector3Rotate( vOrigin * XMVectorReplicate( Scale ), Rotation ) + Translation; - - // Store the frustum. - XMStoreFloat3( &Out.Origin, vOrigin ); - XMStoreFloat4( &Out.Orientation, vOrientation ); - - // Scale the near and far distances (the slopes remain the same). - Out.Near = Near * Scale; - Out.Far = Far * Scale; - - // Copy the slopes. - Out.RightSlope = RightSlope; - Out.LeftSlope = LeftSlope; - Out.TopSlope = TopSlope; - Out.BottomSlope = BottomSlope; -} - - -//----------------------------------------------------------------------------- -// Get the corner points of the frustum -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingFrustum::GetCorners( XMFLOAT3* Corners ) const -{ - assert( Corners != 0 ); - - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Build the corners of the frustum. 
-    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    // Returns the 8 corner positions of the bounding frustum.
-    //     Near    Far
-    //    0----1  4----5
-    //    |    |  |    |
-    //    |    |  |    |
-    //    3----2  7----6
-
-    XMVECTOR vCorners[CORNER_COUNT];
-    vCorners[0] = vLeftTop * vNear;
-    vCorners[1] = vRightTop * vNear;
-    vCorners[2] = vRightBottom * vNear;
-    vCorners[3] = vLeftBottom * vNear;
-    vCorners[4] = vLeftTop * vFar;
-    vCorners[5] = vRightTop * vFar;
-    vCorners[6] = vRightBottom * vFar;
-    vCorners[7] = vLeftBottom * vFar;
-
-    for( size_t i = 0; i < CORNER_COUNT; ++i )
-    {
-        XMVECTOR C = XMVector3Rotate( vCorners[i], vOrientation ) + vOrigin;
-        XMStoreFloat3( &Corners[i], C );
-    }
-}
-
-
-//-----------------------------------------------------------------------------
-// Point in frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR Point ) const
-{
-    // Build frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Transform point into local space of frustum.
-    XMVECTOR TPoint = XMVector3InverseRotate( Point - vOrigin, vOrientation );
-
-    // Set w to one.
-    TPoint = XMVectorInsert<0, 0, 0, 0, 1>( TPoint, XMVectorSplatOne() );
-
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Outside = Zero;
-
-    // Test point against each plane of the frustum.
-    for( size_t i = 0; i < 6; ++i )
-    {
-        XMVECTOR Dot = XMVector4Dot( TPoint, Planes[i] );
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dot, Zero ) );
-    }
-
-    return XMVector4NotEqualInt( Outside, XMVectorTrueInt() ) ? CONTAINS : DISJOINT;
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Load origin and orientation of the frustum.
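The plane encoding used by the point test above collapses, in frustum-local space, to simple slope comparisons, since all four side planes pass through the origin: for example, the right plane (1, 0, -RightSlope, 0) rejects a point exactly when x > z * RightSlope. A scalar restatement (standalone sketch; names are illustrative):

    // Point-vs-frustum in frustum-local space. ls and bs are negative for a
    // frustum that opens leftward/downward, matching the LeftSlope and
    // BottomSlope fields used above.
    inline bool InsideLocalFrustum( float x, float y, float z,
                                    float nearZ, float farZ,
                                    float rs, float ls, float ts, float bs )
    {
        return z >= nearZ && z <= farZ &&    // between near and far planes
               x <= z * rs && x >= z * ls && // within the right/left planes
               y <= z * ts && y >= z * bs;   // within the top/bottom planes
    }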
- XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return TriangleTests::ContainedBy( V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingSphere& sh ) const -{ - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return sh.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingBox& box ) const -{ - // Load origin and orientation of the frustum. 
- XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingOrientedBox& box ) const -{ - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingFrustum& fr ) const -{ - // Load origin and orientation of the frustum. 
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
-    NearPlane = XMPlaneNormalize( NearPlane );
-
-    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
-    FarPlane = XMPlaneNormalize( FarPlane );
-
-    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
-    RightPlane = XMPlaneNormalize( RightPlane );
-
-    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
-    LeftPlane = XMPlaneNormalize( LeftPlane );
-
-    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
-    TopPlane = XMPlaneNormalize( TopPlane );
-
-    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
-    BottomPlane = XMPlaneNormalize( BottomPlane );
-
-    return fr.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
-}
-
-
-//-----------------------------------------------------------------------------
-// Exact sphere vs frustum test. The algorithm first checks the sphere against
-// the planes of the frustum; if the plane checks are indeterminate, it finds
-// the nearest feature (plane, line, or point) on the frustum to the center of
-// the sphere and compares that distance to the radius of the sphere.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingFrustum::Intersects( const BoundingSphere& sh ) const
-{
-    XMVECTOR Zero = XMVectorZero();
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Normalize the planes so we can compare to the sphere radius.
-    Planes[2] = XMVector3Normalize( Planes[2] );
-    Planes[3] = XMVector3Normalize( Planes[3] );
-    Planes[4] = XMVector3Normalize( Planes[4] );
-    Planes[5] = XMVector3Normalize( Planes[5] );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &sh.Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &sh.Radius );
-
-    // Transform the center of the sphere into the local space of frustum.
-    vCenter = XMVector3InverseRotate( vCenter - vOrigin, vOrientation );
-
-    // Set w of the center to one so we can dot4 with the plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    // Check against each plane of the frustum.
- XMVECTOR Outside = XMVectorFalseInt(); - XMVECTOR InsideAll = XMVectorTrueInt(); - XMVECTOR CenterInsideAll = XMVectorTrueInt(); - - XMVECTOR Dist[6]; - - for( size_t i = 0; i < 6; ++i ) - { - Dist[i] = XMVector4Dot( vCenter, Planes[i] ); - - // Outside the plane? - Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist[i], vRadius ) ); - - // Fully inside the plane? - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist[i], -vRadius ) ); - - // Check if the center is inside the plane. - CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist[i], Zero ) ); - } - - // If the sphere is outside any of the planes it is outside. - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // If the sphere is inside all planes it is fully inside. - if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) - return true; - - // If the center of the sphere is inside all planes and the sphere intersects - // one or more planes then it must intersect. - if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) - return true; - - // The sphere may be outside the frustum or intersecting the frustum. - // Find the nearest feature (face, edge, or corner) on the frustum - // to the sphere. - - // The faces adjacent to each face are: - static const size_t adjacent_faces[6][4] = - { - { 2, 3, 4, 5 }, // 0 - { 2, 3, 4, 5 }, // 1 - { 0, 1, 4, 5 }, // 2 - { 0, 1, 4, 5 }, // 3 - { 0, 1, 2, 3 }, // 4 - { 0, 1, 2, 3 } - }; // 5 - - XMVECTOR Intersects = XMVectorFalseInt(); - - // Check to see if the nearest feature is one of the planes. - for( size_t i = 0; i < 6; ++i ) - { - // Find the nearest point on the plane to the center of the sphere. - XMVECTOR Point = vCenter - (Planes[i] * Dist[i]); - - // Set w of the point to one. - Point = XMVectorInsert<0, 0, 0, 0, 1>( Point, XMVectorSplatOne() ); - - // If the point is inside the face (inside the adjacent planes) then - // this plane is the nearest feature. - XMVECTOR InsideFace = XMVectorTrueInt(); - - for ( size_t j = 0; j < 4; j++ ) - { - size_t plane_index = adjacent_faces[i][j]; - - InsideFace = XMVectorAndInt( InsideFace, - XMVectorLessOrEqual( XMVector4Dot( Point, Planes[plane_index] ), Zero ) ); - } - - // Since we have already checked distance from the plane we know that the - // sphere must intersect if this plane is the nearest feature. - Intersects = XMVectorOrInt( Intersects, - XMVectorAndInt( XMVectorGreater( Dist[i], Zero ), InsideFace ) ); - } - - if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) - return true; - - // Build the corners of the frustum. 
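The edge loop further below leans on DirectX::Internal::PointOnLineSegmentNearestPoint. The standard clamp-the-projection computation it corresponds to can be sketched as follows (an assumption-level sketch, not necessarily the library's exact implementation):

    #include <DirectXMath.h>
    using namespace DirectX;

    // Closest point on segment [s1, s2] to point p: project p onto the
    // segment direction and clamp the parameter to [0, 1].
    inline XMVECTOR XM_CALLCONV ClosestPointOnSegment( FXMVECTOR s1, FXMVECTOR s2, FXMVECTOR p )
    {
        XMVECTOR dir = XMVectorSubtract( s2, s1 );
        XMVECTOR t = XMVectorDivide( XMVector3Dot( XMVectorSubtract( p, s1 ), dir ),
                                     XMVector3Dot( dir, dir ) );
        t = XMVectorClamp( t, XMVectorZero(), XMVectorSplatOne() );
        return XMVectorMultiplyAdd( t, dir, s1 );
    }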
-    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = vRightTop * vNear;
-    Corners[1] = vRightBottom * vNear;
-    Corners[2] = vLeftTop * vNear;
-    Corners[3] = vLeftBottom * vNear;
-    Corners[4] = vRightTop * vFar;
-    Corners[5] = vRightBottom * vFar;
-    Corners[6] = vLeftTop * vFar;
-    Corners[7] = vLeftBottom * vFar;
-
-    // The Edges are:
-    static const size_t edges[12][2] =
-    {
-        { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 },    // Near plane
-        { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 },    // Far plane
-        { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
-    };    // Near to far
-
-    XMVECTOR RadiusSq = vRadius * vRadius;
-
-    // Check to see if the nearest feature is one of the edges (or corners).
-    for( size_t i = 0; i < 12; ++i )
-    {
-        size_t ei0 = edges[i][0];
-        size_t ei1 = edges[i][1];
-
-        // Find the nearest point on the edge to the center of the sphere.
-        // The corners of the frustum are included as the endpoints of the edges.
-        XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint( Corners[ei0], Corners[ei1], vCenter );
-
-        XMVECTOR Delta = vCenter - Point;
-
-        XMVECTOR DistSq = XMVector3Dot( Delta, Delta );
-
-        // If the distance from the center of the sphere to the point is less
-        // than the radius of the sphere then they must intersect.
-        Intersects = XMVectorOrInt( Intersects, XMVectorLessOrEqual( DistSq, RadiusSq ) );
-    }
-
-    if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) )
-        return true;
-
-    // The sphere must be outside the frustum.
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Exact axis aligned box vs frustum test. Constructs an oriented box and uses
-// the oriented box vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingFrustum::Intersects( const BoundingBox& box ) const
-{
-    // Make the axis aligned box oriented and do an OBB vs frustum test.
-    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
-    return Intersects( obox );
-}
-
-
-//-----------------------------------------------------------------------------
-// Exact oriented box vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingFrustum::Intersects( const BoundingOrientedBox& box ) const
-{
-    static const XMVECTORU32 SelectY =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
-    };
-    static const XMVECTORU32 SelectZ =
-    {
-        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
-    };
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR FrustumOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( FrustumOrientation ) );
-
-    // Load the box.
-    XMVECTOR Center = XMLoadFloat3( &box.Center );
-    XMVECTOR Extents = XMLoadFloat3( &box.Extents );
-    XMVECTOR BoxOrientation = XMLoadFloat4( &box.Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
-
-    // Transform the oriented box into the space of the frustum in order to
-    // minimize the number of transforms we have to do.
-    Center = XMVector3InverseRotate( Center - vOrigin, FrustumOrientation );
-    BoxOrientation = XMQuaternionMultiply( BoxOrientation, XMQuaternionConjugate( FrustumOrientation ) );
-
-    // Set w of the center to one so we can dot4 with the plane.
-    Center = XMVectorInsert<0, 0, 0, 0, 1>( Center, XMVectorSplatOne() );
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
-
-    // Check against each plane of the frustum.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-    XMVECTOR CenterInsideAll = XMVectorTrueInt();
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Compute the distance to the center of the box.
-        XMVECTOR Dist = XMVector4Dot( Center, Planes[i] );
-
-        // Project the axes of the box onto the normal of the plane. Half the
-        // length of the projection (sometimes called the "radius") is equal to
-        // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-        // where h(i) are the extents of the box, n is the plane normal, and
-        // b(i) are the axes of the box.
-        XMVECTOR Radius = XMVector3Dot( Planes[i], R.r[0] );
-        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[1] ), SelectY );
-        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[2] ), SelectZ );
-        Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, Radius ) );
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist, -Radius ) );
-
-        // Check if the center is inside the plane.
-        CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist, Zero ) );
-    }
-
-    // If the box is outside any of the planes it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If the box is inside all planes it is fully inside.
-    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // If the center of the box is inside all planes and the box intersects
-    // one or more planes then it must intersect.
-    if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // Build the corners of the frustum.
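The projected "radius" described in the plane loop above can be restated as a standalone scalar helper (sketch only; names are illustrative, and the same quantity reappears in the edge-axis tests below):

    #include <cmath>
    #include <DirectXMath.h>
    using namespace DirectX;

    // Half-length of an oriented box's projection onto direction n, where
    // ext holds the half-extents and axis[0..2] are the box axes; this is
    // h(u)*abs(n dot b(u)) + h(v)*abs(n dot b(v)) + h(w)*abs(n dot b(w)).
    inline float ProjectedRadius( const XMFLOAT3& ext, const XMVECTOR axis[3], FXMVECTOR n )
    {
        return ext.x * std::fabs( XMVectorGetX( XMVector3Dot( n, axis[0] ) ) )
             + ext.y * std::fabs( XMVectorGetX( XMVector3Dot( n, axis[1] ) ) )
             + ext.z * std::fabs( XMVectorGetX( XMVector3Dot( n, axis[2] ) ) );
    }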
- XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); - XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); - XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); - XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); - XMVECTOR vNear = XMVectorReplicatePtr( &Near ); - XMVECTOR vFar = XMVectorReplicatePtr( &Far ); - - XMVECTOR Corners[CORNER_COUNT]; - Corners[0] = vRightTop * vNear; - Corners[1] = vRightBottom * vNear; - Corners[2] = vLeftTop * vNear; - Corners[3] = vLeftBottom * vNear; - Corners[4] = vRightTop * vFar; - Corners[5] = vRightBottom * vFar; - Corners[6] = vLeftTop * vFar; - Corners[7] = vLeftBottom * vFar; - - // Test against box axes (3) - { - // Find the min/max values of the projection of the frustum onto each axis. - XMVECTOR FrustumMin, FrustumMax; - - FrustumMin = XMVector3Dot( Corners[0], R.r[0] ); - FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[1] ), SelectY ); - FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[2] ), SelectZ ); - FrustumMax = FrustumMin; - - for( size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i ) - { - XMVECTOR Temp = XMVector3Dot( Corners[i], R.r[0] ); - Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[1] ), SelectY ); - Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[2] ), SelectZ ); - - FrustumMin = XMVectorMin( FrustumMin, Temp ); - FrustumMax = XMVectorMax( FrustumMax, Temp ); - } - - // Project the center of the box onto the axes. - XMVECTOR BoxDist = XMVector3Dot( Center, R.r[0] ); - BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[1] ), SelectY ); - BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[2] ), SelectZ ); - - // The projection of the box onto the axis is just its Center and Extents. - // if (min > box_max || max < box_min) reject; - XMVECTOR Result = XMVectorOrInt( XMVectorGreater( FrustumMin, BoxDist + Extents ), - XMVectorLess( FrustumMax, BoxDist - Extents ) ); - - if( DirectX::Internal::XMVector3AnyTrue( Result ) ) - return false; - } - - // Test against edge/edge axes (3*6). - XMVECTOR FrustumEdgeAxis[6]; - - FrustumEdgeAxis[0] = vRightTop; - FrustumEdgeAxis[1] = vRightBottom; - FrustumEdgeAxis[2] = vLeftTop; - FrustumEdgeAxis[3] = vLeftBottom; - FrustumEdgeAxis[4] = vRightTop - vLeftTop; - FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; - - for( size_t i = 0; i < 3; ++i ) - { - for( size_t j = 0; j < 6; j++ ) - { - // Compute the axis we are going to test. - XMVECTOR Axis = XMVector3Cross( R.r[i], FrustumEdgeAxis[j] ); - - // Find the min/max values of the projection of the frustum onto the axis. - XMVECTOR FrustumMin, FrustumMax; - - FrustumMin = FrustumMax = XMVector3Dot( Axis, Corners[0] ); - - for( size_t k = 1; k < CORNER_COUNT; k++ ) - { - XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); - FrustumMin = XMVectorMin( FrustumMin, Temp ); - FrustumMax = XMVectorMax( FrustumMax, Temp ); - } - - // Project the center of the box onto the axis. - XMVECTOR Dist = XMVector3Dot( Center, Axis ); - - // Project the axes of the box onto the axis to find the "radius" of the box. 
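-            // The "radius" is the half-length of the box's projection onto the axis:
-            //   r = h(u) * abs(Axis dot b(u)) + h(v) * abs(Axis dot b(v)) + h(w) * abs(Axis dot b(w)).
-            // The three dot products are packed into the x, y and z lanes of one
-            // vector (via SelectY/SelectZ) so a single dot with Extents sums them.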
- XMVECTOR Radius = XMVector3Dot( Axis, R.r[0] ); - Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[1] ), SelectY ); - Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[2] ), SelectZ ); - Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); - - // if (center > max + radius || center < min - radius) reject; - Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, FrustumMax + Radius ) ); - Outside = XMVectorOrInt( Outside, XMVectorLess( Dist, FrustumMin - Radius ) ); - } - } - - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // If we did not find a separating plane then the box must intersect the frustum. - return true; -} - - -//----------------------------------------------------------------------------- -// Exact frustum vs frustum test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingFrustum::Intersects( const BoundingFrustum& fr ) const -{ - // Load origin and orientation of frustum B. - XMVECTOR OriginB = XMLoadFloat3( &Origin ); - XMVECTOR OrientationB = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( OrientationB ) ); - - // Build the planes of frustum B. - XMVECTOR AxisB[6]; - AxisB[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f ); - AxisB[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f ); - AxisB[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - AxisB[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - AxisB[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - AxisB[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - - XMVECTOR PlaneDistB[6]; - PlaneDistB[0] = -XMVectorReplicatePtr( &Near ); - PlaneDistB[1] = XMVectorReplicatePtr( &Far ); - PlaneDistB[2] = XMVectorZero(); - PlaneDistB[3] = XMVectorZero(); - PlaneDistB[4] = XMVectorZero(); - PlaneDistB[5] = XMVectorZero(); - - // Load origin and orientation of frustum A. - XMVECTOR OriginA = XMLoadFloat3( &fr.Origin ); - XMVECTOR OrientationA = XMLoadFloat4( &fr.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( OrientationA ) ); - - // Transform frustum A into the space of the frustum B in order to - // minimize the number of transforms we have to do. - OriginA = XMVector3InverseRotate( OriginA - OriginB, OrientationB ); - OrientationA = XMQuaternionMultiply( OrientationA, XMQuaternionConjugate( OrientationB ) ); - - // Build the corners of frustum A (in the local space of B). 
-    XMVECTOR RightTopA = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottomA = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTopA = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottomA = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f );
-    XMVECTOR NearA = XMVectorReplicatePtr( &fr.Near );
-    XMVECTOR FarA = XMVectorReplicatePtr( &fr.Far );
-
-    RightTopA = XMVector3Rotate( RightTopA, OrientationA );
-    RightBottomA = XMVector3Rotate( RightBottomA, OrientationA );
-    LeftTopA = XMVector3Rotate( LeftTopA, OrientationA );
-    LeftBottomA = XMVector3Rotate( LeftBottomA, OrientationA );
-
-    XMVECTOR CornersA[CORNER_COUNT];
-    CornersA[0] = OriginA + RightTopA * NearA;
-    CornersA[1] = OriginA + RightBottomA * NearA;
-    CornersA[2] = OriginA + LeftTopA * NearA;
-    CornersA[3] = OriginA + LeftBottomA * NearA;
-    CornersA[4] = OriginA + RightTopA * FarA;
-    CornersA[5] = OriginA + RightBottomA * FarA;
-    CornersA[6] = OriginA + LeftTopA * FarA;
-    CornersA[7] = OriginA + LeftBottomA * FarA;
-
-    // Check frustum A against each plane of frustum B.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Find the min/max projection of the frustum onto the plane normal.
-        XMVECTOR Min, Max;
-
-        Min = Max = XMVector3Dot( AxisB[i], CornersA[0] );
-
-        for( size_t j = 1; j < CORNER_COUNT; j++ )
-        {
-            XMVECTOR Temp = XMVector3Dot( AxisB[i], CornersA[j] );
-            Min = XMVectorMin( Min, Temp );
-            Max = XMVectorMax( Max, Temp );
-        }
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistB[i] ) );
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Max, PlaneDistB[i] ) );
-    }
-
-    // If frustum A is outside any of the planes of frustum B it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If frustum A is inside all planes of frustum B it is fully inside.
-    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // Build the corners of frustum B.
-    XMVECTOR RightTopB = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottomB = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTopB = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottomB = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR NearB = XMVectorReplicatePtr( &Near );
-    XMVECTOR FarB = XMVectorReplicatePtr( &Far );
-
-    XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT];
-    CornersB[0] = RightTopB * NearB;
-    CornersB[1] = RightBottomB * NearB;
-    CornersB[2] = LeftTopB * NearB;
-    CornersB[3] = LeftBottomB * NearB;
-    CornersB[4] = RightTopB * FarB;
-    CornersB[5] = RightBottomB * FarB;
-    CornersB[6] = LeftTopB * FarB;
-    CornersB[7] = LeftBottomB * FarB;
-
-    // Build the planes of frustum A (in the local space of B).
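-    // Each plane is stored as a normal and a distance (n dot x = d), where
-    // d = n dot p for any point p on the plane. The near and far planes reuse
-    // corners already computed; the four side planes all pass through the apex,
-    // so OriginA supplies their points.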
-    XMVECTOR AxisA[6];
-    XMVECTOR PlaneDistA[6];
-
-    AxisA[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f );
-    AxisA[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f );
-    AxisA[2] = XMVectorSet( 1.0f, 0.0f, -fr.RightSlope, 0.0f );
-    AxisA[3] = XMVectorSet( -1.0f, 0.0f, fr.LeftSlope, 0.0f );
-    AxisA[4] = XMVectorSet( 0.0f, 1.0f, -fr.TopSlope, 0.0f );
-    AxisA[5] = XMVectorSet( 0.0f, -1.0f, fr.BottomSlope, 0.0f );
-
-    AxisA[0] = XMVector3Rotate( AxisA[0], OrientationA );
-    AxisA[1] = -AxisA[0];
-    AxisA[2] = XMVector3Rotate( AxisA[2], OrientationA );
-    AxisA[3] = XMVector3Rotate( AxisA[3], OrientationA );
-    AxisA[4] = XMVector3Rotate( AxisA[4], OrientationA );
-    AxisA[5] = XMVector3Rotate( AxisA[5], OrientationA );
-
-    PlaneDistA[0] = XMVector3Dot( AxisA[0], CornersA[0] ); // Re-use corner on near plane.
-    PlaneDistA[1] = XMVector3Dot( AxisA[1], CornersA[4] ); // Re-use corner on far plane.
-    PlaneDistA[2] = XMVector3Dot( AxisA[2], OriginA );
-    PlaneDistA[3] = XMVector3Dot( AxisA[3], OriginA );
-    PlaneDistA[4] = XMVector3Dot( AxisA[4], OriginA );
-    PlaneDistA[5] = XMVector3Dot( AxisA[5], OriginA );
-
-    // Check each axis of frustum A for a separating plane (5).
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Find the minimum projection of the frustum onto the plane normal.
-        XMVECTOR Min;
-
-        Min = XMVector3Dot( AxisA[i], CornersB[0] );
-
-        for( size_t j = 1; j < CORNER_COUNT; j++ )
-        {
-            XMVECTOR Temp = XMVector3Dot( AxisA[i], CornersB[j] );
-            Min = XMVectorMin( Min, Temp );
-        }
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistA[i] ) );
-    }
-
-    // If frustum B is outside any of the planes of frustum A it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // Check edge/edge axes (6 * 6).
-    XMVECTOR FrustumEdgeAxisA[6];
-    FrustumEdgeAxisA[0] = RightTopA;
-    FrustumEdgeAxisA[1] = RightBottomA;
-    FrustumEdgeAxisA[2] = LeftTopA;
-    FrustumEdgeAxisA[3] = LeftBottomA;
-    FrustumEdgeAxisA[4] = RightTopA - LeftTopA;
-    FrustumEdgeAxisA[5] = LeftBottomA - LeftTopA;
-
-    XMVECTOR FrustumEdgeAxisB[6];
-    FrustumEdgeAxisB[0] = RightTopB;
-    FrustumEdgeAxisB[1] = RightBottomB;
-    FrustumEdgeAxisB[2] = LeftTopB;
-    FrustumEdgeAxisB[3] = LeftBottomB;
-    FrustumEdgeAxisB[4] = RightTopB - LeftTopB;
-    FrustumEdgeAxisB[5] = LeftBottomB - LeftTopB;
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        for( size_t j = 0; j < 6; j++ )
-        {
-            // Compute the axis we are going to test.
-            XMVECTOR Axis = XMVector3Cross( FrustumEdgeAxisA[i], FrustumEdgeAxisB[j] );
-
-            // Find the min/max values of the projection of both frustums onto the axis.
-            XMVECTOR MinA, MaxA;
-            XMVECTOR MinB, MaxB;
-
-            MinA = MaxA = XMVector3Dot( Axis, CornersA[0] );
-            MinB = MaxB = XMVector3Dot( Axis, CornersB[0] );
-
-            for( size_t k = 1; k < CORNER_COUNT; k++ )
-            {
-                XMVECTOR TempA = XMVector3Dot( Axis, CornersA[k] );
-                MinA = XMVectorMin( MinA, TempA );
-                MaxA = XMVectorMax( MaxA, TempA );
-
-                XMVECTOR TempB = XMVector3Dot( Axis, CornersB[k] );
-                MinB = XMVectorMin( MinB, TempB );
-                MaxB = XMVectorMax( MaxB, TempB );
-            }
-
-            // if (MinA > MaxB || MinB > MaxA) reject
-            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) );
-            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) );
-        }
-    }
-
-    // If there is a separating plane, then the frustums do not intersect.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If we did not find a separating plane then the frustums intersect.
-    return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Build the frustum planes (NOTE: D is negated from the usual).
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, -Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Transform the triangle into the local space of the frustum.
-    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vOrigin, vOrientation );
-    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vOrigin, vOrientation );
-    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vOrigin, vOrientation );
-
-    // Test each vertex of the triangle against the frustum planes.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        XMVECTOR Dist0 = XMVector3Dot( TV0, Planes[i] );
-        XMVECTOR Dist1 = XMVector3Dot( TV1, Planes[i] );
-        XMVECTOR Dist2 = XMVector3Dot( TV2, Planes[i] );
-
-        XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
-        MinDist = XMVectorMin( MinDist, Dist2 );
-        XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
-        MaxDist = XMVectorMax( MaxDist, Dist2 );
-
-        XMVECTOR PlaneDist = XMVectorSplatW( Planes[i] );
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( MinDist, PlaneDist ) );
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( MaxDist, PlaneDist ) );
-    }
-
-    // If the triangle is outside any of the planes it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If the triangle is inside all planes it is fully inside.
-    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = vRightTop * vNear;
-    Corners[1] = vRightBottom * vNear;
-    Corners[2] = vLeftTop * vNear;
-    Corners[3] = vLeftBottom * vNear;
-    Corners[4] = vRightTop * vFar;
-    Corners[5] = vRightBottom * vFar;
-    Corners[6] = vLeftTop * vFar;
-    Corners[7] = vLeftBottom * vFar;
-
-    // Test the plane of the triangle.
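-    // Project all eight frustum corners onto the triangle normal. If the
-    // triangle's plane distance (Normal dot V0) falls outside the [min, max]
-    // range of those projections, the triangle's plane is a separating plane.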
- XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); - XMVECTOR Dist = XMVector3Dot( Normal, V0 ); - - XMVECTOR MinDist, MaxDist; - MinDist = MaxDist = XMVector3Dot( Corners[0], Normal ); - for( size_t i = 1; i < CORNER_COUNT; ++i ) - { - XMVECTOR Temp = XMVector3Dot( Corners[i], Normal ); - MinDist = XMVectorMin( MinDist, Temp ); - MaxDist = XMVectorMax( MaxDist, Temp ); - } - - Outside = XMVectorOrInt( XMVectorGreater( MinDist, Dist ), XMVectorLess( MaxDist, Dist ) ); - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // Check the edge/edge axes (3*6). - XMVECTOR TriangleEdgeAxis[3]; - TriangleEdgeAxis[0] = V1 - V0; - TriangleEdgeAxis[1] = V2 - V1; - TriangleEdgeAxis[2] = V0 - V2; - - XMVECTOR FrustumEdgeAxis[6]; - FrustumEdgeAxis[0] = vRightTop; - FrustumEdgeAxis[1] = vRightBottom; - FrustumEdgeAxis[2] = vLeftTop; - FrustumEdgeAxis[3] = vLeftBottom; - FrustumEdgeAxis[4] = vRightTop - vLeftTop; - FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; - - for( size_t i = 0; i < 3; ++i ) - { - for( size_t j = 0; j < 6; j++ ) - { - // Compute the axis we are going to test. - XMVECTOR Axis = XMVector3Cross( TriangleEdgeAxis[i], FrustumEdgeAxis[j] ); - - // Find the min/max of the projection of the triangle onto the axis. - XMVECTOR MinA, MaxA; - - XMVECTOR Dist0 = XMVector3Dot( V0, Axis ); - XMVECTOR Dist1 = XMVector3Dot( V1, Axis ); - XMVECTOR Dist2 = XMVector3Dot( V2, Axis ); - - MinA = XMVectorMin( Dist0, Dist1 ); - MinA = XMVectorMin( MinA, Dist2 ); - MaxA = XMVectorMax( Dist0, Dist1 ); - MaxA = XMVectorMax( MaxA, Dist2 ); - - // Find the min/max of the projection of the frustum onto the axis. - XMVECTOR MinB, MaxB; - - MinB = MaxB = XMVector3Dot( Axis, Corners[0] ); - - for( size_t k = 1; k < CORNER_COUNT; k++ ) - { - XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); - MinB = XMVectorMin( MinB, Temp ); - MaxB = XMVectorMax( MaxB, Temp ); - } - - // if (MinA > MaxB || MinB > MaxA) reject; - Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); - Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); - } - } - - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // If we did not find a separating plane then the triangle must intersect the frustum. - return true; -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR Plane ) const -{ - assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); - - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Set w of the origin to one so we can dot4 with a plane. - vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); - - // Build the corners of the frustum (in world space). 
-    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    RightTop = XMVector3Rotate( RightTop, vOrientation );
-    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
-    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
-    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
-
-    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
-    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
-    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
-    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
-    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
-    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
-    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
-    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
-                                                  Corners4, Corners5, Corners6, Corners7,
-                                                  Plane, Outside, Inside );
-
-    // If the frustum is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the frustum is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The frustum is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Ray vs. frustum test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist ) const
-{
-    // If the ray starts inside the frustum, return a distance of 0 for the hit.
-    if ( Contains(rayOrigin) == CONTAINS )
-    {
-        Dist = 0.0f;
-        return true;
-    }
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR frOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR frOrientation = XMLoadFloat4( &Orientation );
-
-    // This algorithm is based on "Fast Ray-Convex Polyhedron Intersection" in James Arvo, ed., Graphics Gems II, pp. 247-250.
-    float tnear = -FLT_MAX;
-    float tfar = FLT_MAX;
-
-    for( size_t i=0; i < 6; ++i )
-    {
-        XMVECTOR Plane = DirectX::Internal::XMPlaneTransform( Planes[i], frOrientation, frOrigin );
-        Plane = XMPlaneNormalize( Plane );
-
-        XMVECTOR AxisDotOrigin = XMPlaneDotCoord( Plane, rayOrigin );
-        XMVECTOR AxisDotDirection = XMVector3Dot( Plane, Direction );
-
-        if ( XMVector3LessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ) )
-        {
-            // Ray is parallel to plane - check if the ray origin is inside the plane's half-space.
-            if ( XMVector3Greater( AxisDotOrigin, g_XMZero ) )
-            {
-                // Ray origin is outside the half-space.
-                Dist = 0.f;
-                return false;
-            }
-        }
-        else
-        {
-            // Ray not parallel - get distance to plane.
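-            // For a plane (n, d), the hit point satisfies n dot (O + t*D) + d = 0,
-            // so t = -(n dot O + d) / (n dot D) = -vn / vd. The sign of vd tells us
-            // whether the ray crosses into (front face) or out of (back face) the
-            // plane's half-space.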
-            float vd = XMVectorGetX( AxisDotDirection );
-            float vn = XMVectorGetX( AxisDotOrigin );
-            float t = -vn / vd;
-            if (vd < 0.0f)
-            {
-                // Front face - T is a near point.
-                if (t > tfar)
-                {
-                    Dist = 0.f;
-                    return false;
-                }
-                if (t > tnear)
-                {
-                    // Hit near face.
-                    tnear = t;
-                }
-            }
-            else
-            {
-                // Back face - T is a far point.
-                if (t < tnear)
-                {
-                    Dist = 0.f;
-                    return false;
-                }
-                if (t < tfar)
-                {
-                    // Hit far face.
-                    tfar = t;
-                }
-            }
-        }
-    }
-
-    // Survived all tests.
-    // Note: if the ray originates on the polyhedron, you may want to change 0.0f to
-    // some epsilon to avoid intersecting the originating face.
-    float distance = ( tnear >= 0.0f ) ? tnear : tfar;
-    if (distance >= 0.0f)
-    {
-        Dist = distance;
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test a frustum vs 6 planes (typically forming another frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                                                                 GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
-{
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Set w of the origin to one so we can dot4 with a plane.
-    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() );
-
-    // Build the corners of the frustum (in world space).
-    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    RightTop = XMVector3Rotate( RightTop, vOrientation );
-    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
-    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
-    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
-
-    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
-    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
-    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
-    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
-    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
-    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
-    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
-    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
- DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane1, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane2, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane3, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane4, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane5, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the frustum is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the frustum is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The frustum is not inside all planes or outside a plane, it may intersect. - return INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Build the 6 frustum planes from a frustum. -// -// The intended use for these routines is for fast culling to a view frustum. -// When the volume being tested against a view frustum is small relative to the -// view frustum it is usually either inside all six planes of the frustum -// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither -// of these cases is true then it may or may not be intersecting the frustum -// (INTERSECTS) -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingFrustum::GetPlanes( XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, - XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane ) const -{ - // Load origin and orientation of the frustum. 
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    if (NearPlane)
-    {
-        XMVECTOR vNearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-        vNearPlane = DirectX::Internal::XMPlaneTransform( vNearPlane, vOrientation, vOrigin );
-        *NearPlane = XMPlaneNormalize( vNearPlane );
-    }
-
-    if (FarPlane)
-    {
-        XMVECTOR vFarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-        vFarPlane = DirectX::Internal::XMPlaneTransform( vFarPlane, vOrientation, vOrigin );
-        *FarPlane = XMPlaneNormalize( vFarPlane );
-    }
-
-    if (RightPlane)
-    {
-        XMVECTOR vRightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-        vRightPlane = DirectX::Internal::XMPlaneTransform( vRightPlane, vOrientation, vOrigin );
-        *RightPlane = XMPlaneNormalize( vRightPlane );
-    }
-
-    if (LeftPlane)
-    {
-        XMVECTOR vLeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-        vLeftPlane = DirectX::Internal::XMPlaneTransform( vLeftPlane, vOrientation, vOrigin );
-        *LeftPlane = XMPlaneNormalize( vLeftPlane );
-    }
-
-    if (TopPlane)
-    {
-        XMVECTOR vTopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-        vTopPlane = DirectX::Internal::XMPlaneTransform( vTopPlane, vOrientation, vOrigin );
-        *TopPlane = XMPlaneNormalize( vTopPlane );
-    }
-
-    if (BottomPlane)
-    {
-        XMVECTOR vBottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-        vBottomPlane = DirectX::Internal::XMPlaneTransform( vBottomPlane, vOrientation, vOrigin );
-        *BottomPlane = XMPlaneNormalize( vBottomPlane );
-    }
-}
-
-
-//-----------------------------------------------------------------------------
-// Build a frustum from a perspective projection matrix. The matrix may only
-// contain a projection; any rotation, translation or scale will cause the
-// constructed frustum to be incorrect.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix( BoundingFrustum& Out, FXMMATRIX Projection )
-{
-    // Corners of the projection frustum in homogeneous space.
-    static XMVECTORF32 HomogenousPoints[6] =
-    {
-        { 1.0f, 0.0f, 1.0f, 1.0f },  // right (at far plane)
-        { -1.0f, 0.0f, 1.0f, 1.0f }, // left
-        { 0.0f, 1.0f, 1.0f, 1.0f },  // top
-        { 0.0f, -1.0f, 1.0f, 1.0f }, // bottom
-
-        { 0.0f, 0.0f, 0.0f, 1.0f },  // near
-        { 0.0f, 0.0f, 1.0f, 1.0f }   // far
-    };
-
-    XMVECTOR Determinant;
-    XMMATRIX matInverse = XMMatrixInverse( &Determinant, Projection );
-
-    // Compute the frustum corners in view space.
-    XMVECTOR Points[6];
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Transform point.
-        Points[i] = XMVector4Transform( HomogenousPoints[i], matInverse );
-    }
-
-    Out.Origin = XMFLOAT3( 0.0f, 0.0f, 0.0f );
-    Out.Orientation = XMFLOAT4( 0.0f, 0.0f, 0.0f, 1.0f );
-
-    // Compute the slopes.
-    Points[0] = Points[0] * XMVectorReciprocal( XMVectorSplatZ( Points[0] ) );
-    Points[1] = Points[1] * XMVectorReciprocal( XMVectorSplatZ( Points[1] ) );
-    Points[2] = Points[2] * XMVectorReciprocal( XMVectorSplatZ( Points[2] ) );
-    Points[3] = Points[3] * XMVectorReciprocal( XMVectorSplatZ( Points[3] ) );
-
-    Out.RightSlope = XMVectorGetX( Points[0] );
-    Out.LeftSlope = XMVectorGetX( Points[1] );
-    Out.TopSlope = XMVectorGetY( Points[2] );
-    Out.BottomSlope = XMVectorGetY( Points[3] );
-
-    // Compute near and far.
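-    // The near and far test points were taken at z = 0 and z = 1 in projection
-    // space; after the inverse transform they are dehomogenized by dividing by w,
-    // and their resulting z values are the near and far plane distances.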
-    Points[4] = Points[4] * XMVectorReciprocal( XMVectorSplatW( Points[4] ) );
-    Points[5] = Points[5] * XMVectorReciprocal( XMVectorSplatW( Points[5] ) );
-
-    Out.Near = XMVectorGetZ( Points[4] );
-    Out.Far = XMVectorGetZ( Points[5] );
-}
-
-
-/****************************************************************************
- *
- * TriangleTests
- *
- ****************************************************************************/
-
-namespace TriangleTests
-{
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with a triangle
-// (V0, V1, V2). Return true if there is an intersection and also set Dist
-// to the distance along the ray to the intersection.
-//
-// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage
-// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1,
-// pp 21-28, 1997.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV Intersects( FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1, HXMVECTOR V2, float& Dist )
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR e1 = V1 - V0;
-    XMVECTOR e2 = V2 - V0;
-
-    // p = Direction ^ e2;
-    XMVECTOR p = XMVector3Cross( Direction, e2 );
-
-    // det = e1 * p;
-    XMVECTOR det = XMVector3Dot( e1, p );
-
-    XMVECTOR u, v, t;
-
-    if( XMVector3GreaterOrEqual( det, g_RayEpsilon ) )
-    {
-        // Determinant is positive (front side of the triangle).
-        XMVECTOR s = Origin - V0;
-
-        // u = s * p;
-        u = XMVector3Dot( s, p );
-
-        XMVECTOR NoIntersection = XMVectorLess( u, Zero );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u, det ) );
-
-        // q = s ^ e1;
-        XMVECTOR q = XMVector3Cross( s, e1 );
-
-        // v = Direction * q;
-        v = XMVector3Dot( Direction, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( v, Zero ) );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u + v, det ) );
-
-        // t = e2 * q;
-        t = XMVector3Dot( e2, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( t, Zero ) );
-
-        if( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
-        {
-            Dist = 0.f;
-            return false;
-        }
-    }
-    else if( XMVector3LessOrEqual( det, g_RayNegEpsilon ) )
-    {
-        // Determinant is negative (back side of the triangle).
-        XMVECTOR s = Origin - V0;
-
-        // u = s * p;
-        u = XMVector3Dot( s, p );
-
-        XMVECTOR NoIntersection = XMVectorGreater( u, Zero );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u, det ) );
-
-        // q = s ^ e1;
-        XMVECTOR q = XMVector3Cross( s, e1 );
-
-        // v = Direction * q;
-        v = XMVector3Dot( Direction, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( v, Zero ) );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u + v, det ) );
-
-        // t = e2 * q;
-        t = XMVector3Dot( e2, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( t, Zero ) );
-
-        if ( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
-        {
-            Dist = 0.f;
-            return false;
-        }
-    }
-    else
-    {
-        // Parallel ray.
-        Dist = 0.f;
-        return false;
-    }
-
-    t = XMVectorDivide( t, det );
-
-    // (u / det) and (v / det) are the barycentric coordinates of the intersection.
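-    // Because Direction is asserted to be unit length, t (now scaled by 1/det)
-    // is directly the distance along the ray to the intersection point.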
-
-    // Store the x-component to Dist.
-    XMStoreFloat( &Dist, t );
-
-    return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test if two triangles intersect.
-//
-// The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
-// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
-// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
-// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
-// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
-//
-// The final test could be considered an edge-edge separating plane test with
-// the 9 possible cases narrowed down to the only two pairs of edges that can
-// actually result in a separation.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV Intersects( FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2 )
-{
-    static const XMVECTORU32 SelectY =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
-    };
-    static const XMVECTORU32 SelectZ =
-    {
-        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
-    };
-    static const XMVECTORU32 Select0111 =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1
-    };
-    static const XMVECTORU32 Select1011 =
-    {
-        XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1
-    };
-    static const XMVECTORU32 Select1101 =
-    {
-        XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1
-    };
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Compute the normal of triangle A.
-    XMVECTOR N1 = XMVector3Cross( A1 - A0, A2 - A0 );
-
-    // Assert that the triangle is not degenerate.
-    assert( !XMVector3Equal( N1, Zero ) );
-
-    // Test points of B against the plane of A.
-    XMVECTOR BDist = XMVector3Dot( N1, B0 - A0 );
-    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B1 - A0 ), SelectY );
-    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B2 - A0 ), SelectZ );
-
-    // Ensure robustness with co-planar triangles by zeroing small distances.
-    uint32_t BDistIsZeroCR;
-    XMVECTOR BDistIsZero = XMVectorGreaterR( &BDistIsZeroCR, g_RayEpsilon, XMVectorAbs( BDist ) );
-    BDist = XMVectorSelect( BDist, Zero, BDistIsZero );
-
-    uint32_t BDistIsLessCR;
-    XMVECTOR BDistIsLess = XMVectorGreaterR( &BDistIsLessCR, Zero, BDist );
-
-    uint32_t BDistIsGreaterCR;
-    XMVECTOR BDistIsGreater = XMVectorGreaterR( &BDistIsGreaterCR, BDist, Zero );
-
-    // If all the points are on the same side we don't intersect.
-    if( XMComparisonAllTrue( BDistIsLessCR ) || XMComparisonAllTrue( BDistIsGreaterCR ) )
-        return false;
-
-    // Compute the normal of triangle B.
-    XMVECTOR N2 = XMVector3Cross( B1 - B0, B2 - B0 );
-
-    // Assert that the triangle is not degenerate.
-    assert( !XMVector3Equal( N2, Zero ) );
-
-    // Test points of A against the plane of B.
-    XMVECTOR ADist = XMVector3Dot( N2, A0 - B0 );
-    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A1 - B0 ), SelectY );
-    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A2 - B0 ), SelectZ );
-
-    // Ensure robustness with co-planar triangles by zeroing small distances.
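-    // Distances within g_RayEpsilon of zero are snapped to zero so vertices that
-    // lie numerically on the other triangle's plane are classified consistently
-    // as co-planar instead of flip-flopping between sides.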
-    uint32_t ADistIsZeroCR;
-    XMVECTOR ADistIsZero = XMVectorGreaterR( &ADistIsZeroCR, g_RayEpsilon, XMVectorAbs( ADist ) );
-    ADist = XMVectorSelect( ADist, Zero, ADistIsZero );
-
-    uint32_t ADistIsLessCR;
-    XMVECTOR ADistIsLess = XMVectorGreaterR( &ADistIsLessCR, Zero, ADist );
-
-    uint32_t ADistIsGreaterCR;
-    XMVECTOR ADistIsGreater = XMVectorGreaterR( &ADistIsGreaterCR, ADist, Zero );
-
-    // If all the points are on the same side we don't intersect.
-    if( XMComparisonAllTrue( ADistIsLessCR ) || XMComparisonAllTrue( ADistIsGreaterCR ) )
-        return false;
-
-    // Special case for co-planar triangles.
-    if( XMComparisonAllTrue( ADistIsZeroCR ) || XMComparisonAllTrue( BDistIsZeroCR ) )
-    {
-        XMVECTOR Axis, Dist, MinDist;
-
-        // Compute an axis perpendicular to the edge (points out).
-        Axis = XMVector3Cross( N1, A1 - A0 );
-        Dist = XMVector3Dot( Axis, A0 );
-
-        // Test points of B against the axis.
-        MinDist = XMVector3Dot( B0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (A1, A2)
-        Axis = XMVector3Cross( N1, A2 - A1 );
-        Dist = XMVector3Dot( Axis, A1 );
-
-        MinDist = XMVector3Dot( B0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (A2, A0)
-        Axis = XMVector3Cross( N1, A0 - A2 );
-        Dist = XMVector3Dot( Axis, A2 );
-
-        MinDist = XMVector3Dot( B0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (B0, B1)
-        Axis = XMVector3Cross( N2, B1 - B0 );
-        Dist = XMVector3Dot( Axis, B0 );
-
-        MinDist = XMVector3Dot( A0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (B1, B2)
-        Axis = XMVector3Cross( N2, B2 - B1 );
-        Dist = XMVector3Dot( Axis, B1 );
-
-        MinDist = XMVector3Dot( A0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (B2, B0)
-        Axis = XMVector3Cross( N2, B0 - B2 );
-        Dist = XMVector3Dot( Axis, B2 );
-
-        MinDist = XMVector3Dot( A0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        return true;
-    }
-
-    //
-    // Find the single vertex of A and B (i.e. the vertex on the opposite side
-    // of the plane from the other two) and reorder the edges so we can compute
-    // the signed edge/edge distances.
-    //
-    // if ( (V0 >= 0 && V1 < 0 && V2 < 0) ||
-    //      (V0 > 0 && V1 <= 0 && V2 <= 0) ||
-    //      (V0 <= 0 && V1 > 0 && V2 > 0) ||
-    //      (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
-    //
-    // If our singular vertex is not on the positive side of the plane we reverse
-    // the triangle winding so that the overlap comparisons will compare the
-    // correct edges with the correct signs.
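-    // For example, if A0 is the singular vertex, then edges (A0, A1) and
-    // (A0, A2) are the two edges of A that cross the plane of B, and they are
-    // the only edges that can take part in the signed distance tests below.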
- // - XMVECTOR ADistIsLessEqual = XMVectorOrInt( ADistIsLess, ADistIsZero ); - XMVECTOR ADistIsGreaterEqual = XMVectorOrInt( ADistIsGreater, ADistIsZero ); - - XMVECTOR AA0, AA1, AA2; - bool bPositiveA; - - if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select0111 ) ) ) - { - // A0 is singular, crossing from positive to negative. - AA0 = A0; AA1 = A1; AA2 = A2; - bPositiveA = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select0111 ) ) ) - { - // A0 is singular, crossing from negative to positive. - AA0 = A0; AA1 = A2; AA2 = A1; - bPositiveA = false; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1011 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1011 ) ) ) - { - // A1 is singular, crossing from positive to negative. - AA0 = A1; AA1 = A2; AA2 = A0; - bPositiveA = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1011 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1011 ) ) ) - { - // A1 is singular, crossing from negative to positive. - AA0 = A1; AA1 = A0; AA2 = A2; - bPositiveA = false; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1101 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1101 ) ) ) - { - // A2 is singular, crossing from positive to negative. - AA0 = A2; AA1 = A0; AA2 = A1; - bPositiveA = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1101 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1101 ) ) ) - { - // A2 is singular, crossing from negative to positive. - AA0 = A2; AA1 = A1; AA2 = A0; - bPositiveA = false; - } - else - { - assert( false ); - return false; - } - - XMVECTOR BDistIsLessEqual = XMVectorOrInt( BDistIsLess, BDistIsZero ); - XMVECTOR BDistIsGreaterEqual = XMVectorOrInt( BDistIsGreater, BDistIsZero ); - - XMVECTOR BB0, BB1, BB2; - bool bPositiveB; - - if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select0111 ) ) ) - { - // B0 is singular, crossing from positive to negative. - BB0 = B0; BB1 = B1; BB2 = B2; - bPositiveB = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select0111 ) ) ) - { - // B0 is singular, crossing from negative to positive. - BB0 = B0; BB1 = B2; BB2 = B1; - bPositiveB = false; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1011 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1011 ) ) ) - { - // B1 is singular, crossing from positive to negative. 
-        BB0 = B1; BB1 = B2; BB2 = B0;
-        bPositiveB = true;
-    }
-    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1011 ) ) ||
-             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1011 ) ) )
-    {
-        // B1 is singular, crossing from negative to positive.
-        BB0 = B1; BB1 = B0; BB2 = B2;
-        bPositiveB = false;
-    }
-    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1101 ) ) ||
-             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1101 ) ) )
-    {
-        // B2 is singular, crossing from positive to negative.
-        BB0 = B2; BB1 = B0; BB2 = B1;
-        bPositiveB = true;
-    }
-    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1101 ) ) ||
-             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1101 ) ) )
-    {
-        // B2 is singular, crossing from negative to positive.
-        BB0 = B2; BB1 = B1; BB2 = B0;
-        bPositiveB = false;
-    }
-    else
-    {
-        assert( false );
-        return false;
-    }
-
-    XMVECTOR Delta0, Delta1;
-
-    // Reverse the direction of the test depending on whether the singular vertices are
-    // the same sign or different signs.
-    if( bPositiveA ^ bPositiveB )
-    {
-        Delta0 = ( BB0 - AA0 );
-        Delta1 = ( AA0 - BB0 );
-    }
-    else
-    {
-        Delta0 = ( AA0 - BB0 );
-        Delta1 = ( BB0 - AA0 );
-    }
-
-    // Check if the triangles overlap on the line of intersection between the
-    // planes of the two triangles by finding the signed line distances.
-    XMVECTOR Dist0 = XMVector3Dot( Delta0, XMVector3Cross( ( BB2 - BB0 ), ( AA2 - AA0 ) ) );
-    if( XMVector4Greater( Dist0, Zero ) )
-        return false;
-
-    XMVECTOR Dist1 = XMVector3Dot( Delta1, XMVector3Cross( ( BB1 - BB0 ), ( AA1 - AA0 ) ) );
-    if( XMVector4Greater( Dist1, Zero ) )
-        return false;
-
-    return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Plane-triangle test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane )
-{
-    XMVECTOR One = XMVectorSplatOne();
-
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Set w of the points to one so we can dot4 with a plane.
-    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
-    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
-    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane, Outside, Inside );
-
-    // If the triangle is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the triangle is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The triangle is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test a triangle vs 6 planes (typically forming a frustum).
-//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV ContainedBy( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, - GXMVECTOR Plane0, HXMVECTOR Plane1, HXMVECTOR Plane2, - CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 ) -{ - XMVECTOR One = XMVectorSplatOne(); - - // Set w of the points to one so we can dot4 with a plane. - XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); - XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); - XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); - - XMVECTOR Outside, Inside; - - // Test against each plane. - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane1, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane2, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane3, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane4, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane5, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the triangle is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the triangle is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The triangle is not inside all planes or outside a plane, it may intersect. - return INTERSECTS; -} - -}; // namespace TriangleTests - +//------------------------------------------------------------------------------------- +// DirectXCollision.inl -- C++ Collision Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] =
+{
+    { -1.0f, -1.0f, 1.0f, 0.0f },
+    { 1.0f, -1.0f, 1.0f, 0.0f },
+    { 1.0f, 1.0f, 1.0f, 0.0f },
+    { -1.0f, 1.0f, 1.0f, 0.0f },
+    { -1.0f, -1.0f, -1.0f, 0.0f },
+    { 1.0f, -1.0f, -1.0f, 0.0f },
+    { 1.0f, 1.0f, -1.0f, 0.0f },
+    { -1.0f, 1.0f, -1.0f, 0.0f },
+};
+
+XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { 1e-20f, 1e-20f, 1e-20f, 1e-20f };
+XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { -1e-20f, -1e-20f, -1e-20f, -1e-20f };
+XMGLOBALCONST XMVECTORF32 g_FltMin = { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX };
+XMGLOBALCONST XMVECTORF32 g_FltMax = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
+
+namespace Internal
+{
+
+//-----------------------------------------------------------------------------
+// Return true if any of the elements of a 3 vector are equal to 0xffffffff.
+// Slightly more efficient than using XMVector3EqualInt.
+//-----------------------------------------------------------------------------
+inline bool XMVector3AnyTrue( _In_ FXMVECTOR V )
+{
+    // Duplicate the fourth element from the first element.
+    XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+    return XMComparisonAnyTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Return true if all of the elements of a 3 vector are equal to 0xffffffff.
+// Slightly more efficient than using XMVector3EqualInt.
+//-----------------------------------------------------------------------------
+inline bool XMVector3AllTrue( _In_ FXMVECTOR V )
+{
+    // Duplicate the fourth element from the first element.
+    XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>( V );
+
+    return XMComparisonAllTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) );
+}
+
+#if defined(_PREFAST_) || !defined(NDEBUG)
+
+XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+
+//-----------------------------------------------------------------------------
+// Return true if the vector is a unit vector (length == 1).
+//-----------------------------------------------------------------------------
+inline bool XMVector3IsUnit( _In_ FXMVECTOR V )
+{
+    XMVECTOR Difference = XMVector3Length( V ) - XMVectorSplatOne();
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitVectorEpsilon );
+}
+
+//-----------------------------------------------------------------------------
+// Return true if the quaternion is a unit quaternion.
+//-----------------------------------------------------------------------------
+inline bool XMQuaternionIsUnit( _In_ FXMVECTOR Q )
+{
+    XMVECTOR Difference = XMVector4Length( Q ) - XMVectorSplatOne();
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitQuaternionEpsilon );
+}
+
+//-----------------------------------------------------------------------------
+// Return true if the plane is a unit plane.
+//-----------------------------------------------------------------------------
+inline bool XMPlaneIsUnit( _In_ FXMVECTOR Plane )
+{
+    XMVECTOR Difference = XMVector3Length( Plane ) - XMVectorSplatOne();
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitPlaneEpsilon );
+}
+
+#endif // _PREFAST_ || !NDEBUG
+
+//-----------------------------------------------------------------------------
+inline XMVECTOR XMPlaneTransform( _In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation )
+{
+    XMVECTOR vNormal = XMVector3Rotate( Plane, Rotation );
+    XMVECTOR vD = XMVectorSplatW( Plane ) - XMVector3Dot( vNormal, Translation );
+
+    return XMVectorInsert<0, 0, 0, 0, 1>( vNormal, vD );
+}
+
+//-----------------------------------------------------------------------------
+// Return the point on the line segment (S1, S2) nearest the point P.
+//-----------------------------------------------------------------------------
+inline XMVECTOR PointOnLineSegmentNearestPoint( _In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P )
+{
+    XMVECTOR Dir = S2 - S1;
+    XMVECTOR Projection = ( XMVector3Dot( P, Dir ) - XMVector3Dot( S1, Dir ) );
+    XMVECTOR LengthSq = XMVector3Dot( Dir, Dir );
+
+    XMVECTOR t = Projection * XMVectorReciprocal( LengthSq );
+    XMVECTOR Point = S1 + t * Dir;
+
+    // t < 0
+    XMVECTOR SelectS1 = XMVectorLess( Projection, XMVectorZero() );
+    Point = XMVectorSelect( Point, S1, SelectS1 );
+
+    // t > 1
+    XMVECTOR SelectS2 = XMVectorGreater( Projection, LengthSq );
+    Point = XMVectorSelect( Point, S2, SelectS2 );
+
+    return Point;
+}
+
+//-----------------------------------------------------------------------------
+// Test if the point (P) on the plane of the triangle is inside the triangle
+// (V0, V1, V2).
+//-----------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle( _In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2 )
+{
+    // Compute the triangle normal.
+    XMVECTOR N = XMVector3Cross( V2 - V0, V1 - V0 );
+
+    // Compute the cross products of the vector from the base of each edge to
+    // the point with each edge vector.
+    XMVECTOR C0 = XMVector3Cross( P - V0, V1 - V0 );
+    XMVECTOR C1 = XMVector3Cross( P - V1, V2 - V1 );
+    XMVECTOR C2 = XMVector3Cross( P - V2, V0 - V2 );
+
+    // If the cross product points in the same direction as the normal then the
+    // point is inside the edge (it is zero if the point is on the edge).
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Inside0 = XMVectorGreaterOrEqual( XMVector3Dot( C0, N ), Zero );
+    XMVECTOR Inside1 = XMVectorGreaterOrEqual( XMVector3Dot( C1, N ), Zero );
+    XMVECTOR Inside2 = XMVectorGreaterOrEqual( XMVector3Dot( C2, N ), Zero );
+
+    // If the point is inside all of the edges it is inside.
+ return XMVectorAndInt( XMVectorAndInt( Inside0, Inside1 ), Inside2 ); +} + +//----------------------------------------------------------------------------- +inline bool SolveCubic( _In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v ) +{ + float p, q, h, rc, d, theta, costh3, sinth3; + + p = f - e * e / 3.0f; + q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f; + h = q * q / 4.0f + p * p * p / 27.0f; + + if( h > 0.0 ) + { + *t = *u = *v = 0.f; + return false; // only one real root + } + + if( ( h == 0.0 ) && ( q == 0.0 ) ) // all the same root + { + *t = - e / 3; + *u = - e / 3; + *v = - e / 3; + + return true; + } + + d = sqrtf( q * q / 4.0f - h ); + if( d < 0 ) + rc = -powf( -d, 1.0f / 3.0f ); + else + rc = powf( d, 1.0f / 3.0f ); + + theta = XMScalarACos( -q / ( 2.0f * d ) ); + costh3 = XMScalarCos( theta / 3.0f ); + sinth3 = sqrtf( 3.0f ) * XMScalarSin( theta / 3.0f ); + *t = 2.0f * rc * costh3 - e / 3.0f; + *u = -rc * ( costh3 + sinth3 ) - e / 3.0f; + *v = -rc * ( costh3 - sinth3 ) - e / 3.0f; + + return true; +} + +//----------------------------------------------------------------------------- +inline XMVECTOR CalculateEigenVector( _In_ float m11, _In_ float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e ) +{ + float fTmp[3]; + fTmp[0] = ( float )( m12 * m23 - m13 * ( m22 - e ) ); + fTmp[1] = ( float )( m13 * m12 - m23 * ( m11 - e ) ); + fTmp[2] = ( float )( ( m11 - e ) * ( m22 - e ) - m12 * m12 ); + + XMVECTOR vTmp = XMLoadFloat3( (XMFLOAT3*)fTmp ); + + if( XMVector3Equal( vTmp, XMVectorZero() ) ) // planar or linear + { + float f1, f2, f3; + + // we only have one equation - find a valid one + if( ( m11 - e != 0.0 ) || ( m12 != 0.0 ) || ( m13 != 0.0 ) ) + { + f1 = m11 - e; f2 = m12; f3 = m13; + } + else if( ( m12 != 0.0 ) || ( m22 - e != 0.0 ) || ( m23 != 0.0 ) ) + { + f1 = m12; f2 = m22 - e; f3 = m23; + } + else if( ( m13 != 0.0 ) || ( m23 != 0.0 ) || ( m33 - e != 0.0 ) ) + { + f1 = m13; f2 = m23; f3 = m33 - e; + } + else + { + // error, we'll just make something up - we have NO context + f1 = 1.0; f2 = 0.0; f3 = 0.0; + } + + if( f1 == 0.0 ) + vTmp = XMVectorSetX( vTmp, 0.0f ); + else + vTmp = XMVectorSetX( vTmp, 1.0f ); + + if( f2 == 0.0 ) + vTmp = XMVectorSetY( vTmp, 0.0f ); + else + vTmp = XMVectorSetY( vTmp, 1.0f ); + + if( f3 == 0.0 ) + { + vTmp = XMVectorSetZ( vTmp, 0.0f ); + // recalculate y to make equation work + if( m12 != 0.0 ) + vTmp = XMVectorSetY( vTmp, ( float )( -f1 / f2 ) ); + } + else + { + vTmp = XMVectorSetZ( vTmp, ( float )( ( f2 - f1 ) / f3 ) ); + } + } + + if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) > 1e-5f ) + { + return XMVector3Normalize( vTmp ); + } + else + { + // Multiply by a value large enough to make the vector non-zero. 
+ vTmp *= 1e5f;
+ return XMVector3Normalize( vTmp );
+ }
+}
+
+//-----------------------------------------------------------------------------
+inline bool CalculateEigenVectors( _In_ float m11, _In_ float m12, _In_ float m13,
+ _In_ float m22, _In_ float m23, _In_ float m33,
+ _In_ float e1, _In_ float e2, _In_ float e3,
+ _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 )
+{
+ *pV1 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e1 );
+ *pV2 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e2 );
+ *pV3 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e3 );
+
+ bool v1z = false;
+ bool v2z = false;
+ bool v3z = false;
+
+ XMVECTOR Zero = XMVectorZero();
+
+ if ( XMVector3Equal( *pV1, Zero ) )
+ v1z = true;
+
+ if ( XMVector3Equal( *pV2, Zero ) )
+ v2z = true;
+
+ if ( XMVector3Equal( *pV3, Zero ) )
+ v3z = true;
+
+ bool e12 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV2 ) ) ) > 0.1f ); // check for non-orthogonal vectors
+ bool e13 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV3 ) ) ) > 0.1f );
+ bool e23 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV2, *pV3 ) ) ) > 0.1f );
+
+ if( ( v1z && v2z && v3z ) || ( e12 && e13 && e23 ) ||
+ ( e12 && v3z ) || ( e13 && v2z ) || ( e23 && v1z ) ) // degenerate eigenvectors - fall back to an arbitrary orthonormal basis
+ {
+ *pV1 = g_XMIdentityR0.v;
+ *pV2 = g_XMIdentityR1.v;
+ *pV3 = g_XMIdentityR2.v;
+ return true;
+ }
+
+ if( v1z && v2z )
+ {
+ XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV3 );
+ if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+ {
+ vTmp = XMVector3Cross( g_XMIdentityR0, *pV3 );
+ }
+ *pV1 = XMVector3Normalize( vTmp );
+ *pV2 = XMVector3Cross( *pV3, *pV1 );
+ return true;
+ }
+
+ if( v3z && v1z )
+ {
+ XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV2 );
+ if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+ {
+ vTmp = XMVector3Cross( g_XMIdentityR0, *pV2 );
+ }
+ *pV3 = XMVector3Normalize( vTmp );
+ *pV1 = XMVector3Cross( *pV2, *pV3 );
+ return true;
+ }
+
+ if( v2z && v3z )
+ {
+ XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV1 );
+ if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+ {
+ vTmp = XMVector3Cross( g_XMIdentityR0, *pV1 );
+ }
+ *pV2 = XMVector3Normalize( vTmp );
+ *pV3 = XMVector3Cross( *pV1, *pV2 );
+ return true;
+ }
+
+ if( ( v1z ) || e12 )
+ {
+ *pV1 = XMVector3Cross( *pV2, *pV3 );
+ return true;
+ }
+
+ if( ( v2z ) || e23 )
+ {
+ *pV2 = XMVector3Cross( *pV3, *pV1 );
+ return true;
+ }
+
+ if( ( v3z ) || e13 )
+ {
+ *pV3 = XMVector3Cross( *pV1, *pV2 );
+ return true;
+ }
+
+ return true;
+}
+
+//-----------------------------------------------------------------------------
+inline bool CalculateEigenVectorsFromCovarianceMatrix( _In_ float Cxx, _In_ float Cyy, _In_ float Czz,
+ _In_ float Cxy, _In_ float Cxz, _In_ float Cyz,
+ _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 )
+{
+ // Calculate the eigenvalues by solving a cubic equation.
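+ // For the symmetric covariance matrix C this is the characteristic
+ // polynomial det( C - lambda*I ) = 0, expanded into the monic cubic
+ // lambda^3 + e*lambda^2 + f*lambda + g = 0 with the coefficients below.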
+ float e = -( Cxx + Cyy + Czz );
+ float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz;
+ float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz;
+
+ float ev1, ev2, ev3;
+ if( !DirectX::Internal::SolveCubic( e, f, g, &ev1, &ev2, &ev3 ) )
+ {
+ // fall back to an arbitrary orthonormal basis
+ *pV1 = g_XMIdentityR0.v;
+ *pV2 = g_XMIdentityR1.v;
+ *pV3 = g_XMIdentityR2.v;
+ return false;
+ }
+
+ return DirectX::Internal::CalculateEigenVectors( Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3 );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectTrianglePlane( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane,
+ XMVECTOR& Outside, XMVECTOR& Inside )
+{
+ // Compute the signed distance of each vertex to the plane.
+ XMVECTOR Dist0 = XMVector4Dot( V0, Plane );
+ XMVECTOR Dist1 = XMVector4Dot( V1, Plane );
+ XMVECTOR Dist2 = XMVector4Dot( V2, Plane );
+
+ XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
+ MinDist = XMVectorMin( MinDist, Dist2 );
+
+ XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
+ MaxDist = XMVectorMax( MaxDist, Dist2 );
+
+ XMVECTOR Zero = XMVectorZero();
+
+ // Outside the plane?
+ Outside = XMVectorGreater( MinDist, Zero );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( MaxDist, Zero );
+}
+
+//-----------------------------------------------------------------------------
+inline void FastIntersectSpherePlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane,
+ _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Dist, Radius );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Dist, -Radius );
+}
+
+//-----------------------------------------------------------------------------
+inline void FastIntersectAxisAlignedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane,
+ _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ // Compute the distance to the center of the box.
+ XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+ // Project the axes of the box onto the normal of the plane. Half the
+ // length of the projection (sometimes called the "radius") is equal to
+ // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+ // where h(i) are extents of the box, n is the plane normal, and b(i) are the
+ // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)].
+ XMVECTOR Radius = XMVector3Dot( Extents, XMVectorAbs( Plane ) );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Dist, Radius );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Dist, -Radius );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectOrientedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, _In_ GXMVECTOR Axis1,
+ _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ // Compute the distance to the center of the box.
+ XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+ // Project the axes of the box onto the normal of the plane. Half the
+ // length of the projection (sometimes called the "radius") is equal to
+ // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+ // where h(i) are extents of the box, n is the plane normal, and b(i) are the
+ // axes of the box.
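+ // Gather n dot b(u), n dot b(v) and n dot b(w) into the x, y and z lanes,
+ // then take one more dot product against the extents to sum the projection.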
+ XMVECTOR Radius = XMVector3Dot( Plane, Axis0 );
+ Radius = XMVectorInsert<0, 0, 1, 0, 0>( Radius, XMVector3Dot( Plane, Axis1 ) );
+ Radius = XMVectorInsert<0, 0, 0, 1, 0>( Radius, XMVector3Dot( Plane, Axis2 ) );
+ Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Dist, Radius );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Dist, -Radius );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectFrustumPlane( _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, _In_ GXMVECTOR Point3,
+ _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7,
+ _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ // Find the min/max projection of the frustum onto the plane normal.
+ XMVECTOR Min, Max, Dist;
+
+ Min = Max = XMVector3Dot( Plane, Point0 );
+
+ Dist = XMVector3Dot( Plane, Point1 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point2 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point3 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point4 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point5 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point6 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point7 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ XMVECTOR PlaneDist = -XMVectorSplatW( Plane );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Min, PlaneDist );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Max, PlaneDist );
+}
+
+}; // namespace Internal
+
+
+/****************************************************************************
+ *
+ * BoundingSphere
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform a sphere by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, FXMMATRIX M ) const
+{
+ // Load the center of the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+
+ // Transform the center of the sphere.
+ XMVECTOR C = XMVector3Transform( vCenter, M );
+
+ XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] );
+ XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] );
+ XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] );
+
+ XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) );
+
+ // Store the center of the sphere.
+ XMStoreFloat3( &Out.Center, C );
+
+ // Scale the radius of the sphere.
+ float Scale = sqrtf( XMVectorGetX(d) );
+ Out.Radius = Radius * Scale;
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+ // Load the center of the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+
+ // Transform the center of the sphere.
+ vCenter = XMVector3Rotate( vCenter * XMVectorReplicate( Scale ), Rotation ) + Translation;
+
+ // Store the center of the sphere.
+ XMStoreFloat3( &Out.Center, vCenter );
+
+ // Scale the radius of the sphere.
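+ // (An angle-preserving transform with uniform scale multiplies every
+ // distance by the same factor, so scaling the radius by Scale is exact.)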
+ Out.Radius = Radius * Scale; +} + + +//----------------------------------------------------------------------------- +// Point in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + XMVECTOR DistanceSquared = XMVector3LengthSq( Point - vCenter ); + XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); + + return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); + + XMVECTOR DistanceSquared = XMVector3LengthSq( V0 - vCenter ); + XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); + + DistanceSquared = XMVector3LengthSq( V1 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); + + DistanceSquared = XMVector3LengthSq( V2 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR Center1 = XMLoadFloat3( &Center ); + float r1 = Radius; + + XMVECTOR Center2 = XMLoadFloat3( &sh.Center ); + float r2 = sh.Radius; + + XMVECTOR V = XMVectorSubtract( Center2, Center1 ); + + XMVECTOR Dist = XMVector3Length( V ); + + float d = XMVectorGetX( Dist ); + + return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingBox& box ) const +{ + if ( !box.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); + XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + XMVECTOR offset = boxCenter - vCenter; + + for( size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( boxExtents, g_BoxOffset[i], offset ); + XMVECTOR d = XMVector3LengthSq( C ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingOrientedBox& box ) const +{ + if ( !box.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); + XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR boxOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( boxOrientation ) ); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( boxExtents * g_BoxOffset[i], boxOrientation ) + boxCenter; + XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; + +} + + +//----------------------------------------------------------------------------- +// Frustum in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR vOrigin = XMLoadFloat3( &fr.Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &fr.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &fr.Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &fr.Far ); + + XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + XMVECTOR InsideAll = XMVectorTrueInt(); + for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( Corners[i], vOrientation ) + vOrigin; + XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingSphere& sh ) const +{ + // Load A. + XMVECTOR vCenterA = XMLoadFloat3( &Center ); + XMVECTOR vRadiusA = XMVectorReplicatePtr( &Radius ); + + // Load B. 
+ XMVECTOR vCenterB = XMLoadFloat3( &sh.Center );
+ XMVECTOR vRadiusB = XMVectorReplicatePtr( &sh.Radius );
+
+ // Distance squared between centers.
+ XMVECTOR Delta = vCenterB - vCenterA;
+ XMVECTOR DistanceSquared = XMVector3LengthSq( Delta );
+
+ // Sum of the radii, squared.
+ XMVECTOR RadiusSquared = XMVectorAdd( vRadiusA, vRadiusB );
+ RadiusSquared = XMVectorMultiply( RadiusSquared, RadiusSquared );
+
+ return XMVector3LessOrEqual( DistanceSquared, RadiusSquared );
+}
+
+
+//-----------------------------------------------------------------------------
+// Box vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingBox& box ) const
+{
+ return box.Intersects( *this );
+}
+
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingOrientedBox& box ) const
+{
+ return box.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingFrustum& fr ) const
+{
+ return fr.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+ // Load the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+ // Compute the plane of the triangle (has to be normalized).
+ XMVECTOR N = XMVector3Normalize( XMVector3Cross( V1 - V0, V2 - V0 ) );
+
+ // Assert that the triangle is not degenerate.
+ assert( !XMVector3Equal( N, XMVectorZero() ) );
+
+ // Find the nearest feature on the triangle to the sphere.
+ XMVECTOR Dist = XMVector3Dot( vCenter - V0, N );
+
+ // If the center of the sphere is farther from the plane of the triangle than
+ // the radius of the sphere, then there cannot be an intersection.
+ XMVECTOR NoIntersection = XMVectorLess( Dist, -vRadius );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Dist, vRadius ) );
+
+ // Project the center of the sphere onto the plane of the triangle.
+ XMVECTOR Point = vCenter - ( N * Dist );
+
+ // Is it inside all the edges? If so we intersect because the distance
+ // to the plane is less than the radius.
+ XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle( Point, V0, V1, V2 );
+
+ // Find the nearest point on each edge.
+ XMVECTOR RadiusSq = vRadius * vRadius;
+
+ // Edge 0,1
+ Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V0, V1, vCenter );
+
+ // If the distance from the center of the sphere to the point is less than
+ // the radius of the sphere then it must intersect.
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
+
+ // Edge 1,2
+ Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V1, V2, vCenter );
+
+ // If the distance from the center of the sphere to the point is less than
+ // the radius of the sphere then it must intersect.
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
+
+ // Edge 2,0
+ Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V2, V0, vCenter );
+
+ // If the distance from the center of the sphere to the point is less than
+ // the radius of the sphere then it must intersect.
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
+
+ return XMVector4EqualInt( XMVectorAndCInt( Intersection, NoIntersection ), XMVectorTrueInt() );
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere-plane intersection
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Plane ) const
+{
+ assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+ // Load the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+ XMVECTOR Outside, Inside;
+ DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane, Outside, Inside );
+
+ // If the sphere is fully in front of the plane it is in front.
+ if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+ return FRONT;
+
+ // If the sphere is fully behind the plane it is behind.
+ if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+ return BACK;
+
+ // The sphere straddles the plane, so it intersects.
+ return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with a sphere.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+ assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+ // l is the vector from the ray origin to the center of the sphere.
+ XMVECTOR l = vCenter - Origin;
+
+ // s is the projection of l onto the ray direction.
+ XMVECTOR s = XMVector3Dot( l, Direction );
+
+ XMVECTOR l2 = XMVector3Dot( l, l );
+
+ XMVECTOR r2 = vRadius * vRadius;
+
+ // m2 is the squared distance from the center of the sphere to the projection.
+ XMVECTOR m2 = l2 - s * s;
+
+ XMVECTOR NoIntersection;
+
+ // If the ray origin is outside the sphere and the center of the sphere is
+ // behind the ray origin there is no intersection.
+ NoIntersection = XMVectorAndInt( XMVectorLess( s, XMVectorZero() ), XMVectorGreater( l2, r2 ) );
+
+ // If the squared distance from the center of the sphere to the projection
+ // is greater than the radius squared the ray will miss the sphere.
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( m2, r2 ) );
+
+ // The ray hits the sphere, compute the nearest intersection point.
+ XMVECTOR q = XMVectorSqrt( r2 - m2 );
+ XMVECTOR t1 = s - q;
+ XMVECTOR t2 = s + q;
+
+ XMVECTOR OriginInside = XMVectorLessOrEqual( l2, r2 );
+ XMVECTOR t = XMVectorSelect( t1, t2, OriginInside );
+
+ if( XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) )
+ {
+ // Store the x-component to Dist.
+ XMStoreFloat( &Dist, t ); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a sphere vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the sphere is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the sphere is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The sphere is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Creates a bounding sphere that contains two other bounding spheres
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateMerged( BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2 )
+{
+ XMVECTOR Center1 = XMLoadFloat3( &S1.Center );
+ float r1 = S1.Radius;
+
+ XMVECTOR Center2 = XMLoadFloat3( &S2.Center );
+ float r2 = S2.Radius;
+
+ XMVECTOR V = XMVectorSubtract( Center2, Center1 );
+
+ XMVECTOR Dist = XMVector3Length( V );
+
+ float d = XMVectorGetX(Dist);
+
+ if ( r1 + r2 >= d )
+ {
+ if ( r1 - r2 >= d )
+ {
+ Out = S1;
+ return;
+ }
+ else if ( r2 - r1 >= d )
+ {
+ Out = S2;
+ return;
+ }
+ }
+
+ XMVECTOR N = XMVectorDivide( V, Dist );
+
+ float t1 = XMMin( -r1, d-r2 );
+ float t2 = XMMax( r1, d+r2 );
+ float t_5 = (t2 - t1) * 0.5f;
+
+ XMVECTOR NCenter = XMVectorAdd( Center1, XMVectorMultiply( N, XMVectorReplicate( t_5 + t1 ) ) );
+
+ XMStoreFloat3( &Out.Center, NCenter );
+ Out.Radius = t_5;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere circumscribing the bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingBox& box )
+{
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
+ Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
+}
+
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingOrientedBox& box )
+{
+ // Bounding box orientation is irrelevant because a sphere is rotationally invariant
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
+ Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate smallest enclosing bounding sphere for a set of
+// points. Exact computation of the smallest enclosing bounding sphere is
+// possible but is slower and requires a more complex algorithm.
+// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
+// Graphics Gems.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromPoints( BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+ assert( Count > 0 );
+ assert( pPoints );
+
+ // Find the points with minimum and maximum x, y, and z
+ XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
+
+ MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3( pPoints );
+
+ for( size_t i = 1; i < Count; ++i )
+ {
+ XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+ float px = XMVectorGetX( Point );
+ float py = XMVectorGetY( Point );
+ float pz = XMVectorGetZ( Point );
+
+ if( px < XMVectorGetX( MinX ) )
+ MinX = Point;
+
+ if( px > XMVectorGetX( MaxX ) )
+ MaxX = Point;
+
+ if( py < XMVectorGetY( MinY ) )
+ MinY = Point;
+
+ if( py > XMVectorGetY( MaxY ) )
+ MaxY = Point;
+
+ if( pz < XMVectorGetZ( MinZ ) )
+ MinZ = Point;
+
+ if( pz > XMVectorGetZ( MaxZ ) )
+ MaxZ = Point;
+ }
+
+ // Use the min/max pair that are farthest apart to form the initial sphere.
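+ // (Ritter's algorithm: this initial guess is then grown in the second
+ // pass below to enclose any point that falls outside the sphere.)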
+ XMVECTOR DeltaX = MaxX - MinX;
+ XMVECTOR DistX = XMVector3Length( DeltaX );
+
+ XMVECTOR DeltaY = MaxY - MinY;
+ XMVECTOR DistY = XMVector3Length( DeltaY );
+
+ XMVECTOR DeltaZ = MaxZ - MinZ;
+ XMVECTOR DistZ = XMVector3Length( DeltaZ );
+
+ XMVECTOR vCenter;
+ XMVECTOR vRadius;
+
+ if( XMVector3Greater( DistX, DistY ) )
+ {
+ if( XMVector3Greater( DistX, DistZ ) )
+ {
+ // Use min/max x.
+ vCenter = XMVectorLerp(MaxX,MinX,0.5f);
+ vRadius = DistX * 0.5f;
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
+ vRadius = DistZ * 0.5f;
+ }
+ }
+ else // Y >= X
+ {
+ if( XMVector3Greater( DistY, DistZ ) )
+ {
+ // Use min/max y.
+ vCenter = XMVectorLerp(MaxY,MinY,0.5f);
+ vRadius = DistY * 0.5f;
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
+ vRadius = DistZ * 0.5f;
+ }
+ }
+
+ // Add any points not inside the sphere.
+ for( size_t i = 0; i < Count; ++i )
+ {
+ XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+ XMVECTOR Delta = Point - vCenter;
+
+ XMVECTOR Dist = XMVector3Length( Delta );
+
+ if( XMVector3Greater( Dist, vRadius ) )
+ {
+ // Adjust sphere to include the new point.
+ vRadius = ( vRadius + Dist ) * 0.5f;
+ vCenter += ( XMVectorReplicate( 1.0f ) - XMVectorDivide(vRadius,Dist) ) * Delta;
+ }
+ }
+
+ XMStoreFloat3( &Out.Center, vCenter );
+ XMStoreFloat( &Out.Radius, vRadius );
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere containing frustum
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromFrustum( BoundingSphere& Out, const BoundingFrustum& fr )
+{
+ XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+ fr.GetCorners( Corners );
+ CreateFromPoints( Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3) );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an axis aligned box by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, FXMMATRIX M ) const
+{
+ // Load center and extents.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Compute and transform the corners and find new min/max bounds.
+ XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter );
+ Corner = XMVector3Transform( Corner, M );
+
+ XMVECTOR Min, Max;
+ Min = Max = Corner;
+
+ for( size_t i = 1; i < CORNER_COUNT; ++i )
+ {
+ Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter );
+ Corner = XMVector3Transform( Corner, M );
+
+ Min = XMVectorMin( Min, Corner );
+ Max = XMVectorMax( Max, Corner );
+ }
+
+ // Store center and extents.
+ XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+ XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+ assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
+
+ // Load center and extents.
+ XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); + Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; + + XMVECTOR Min, Max; + Min = Max = Corner; + + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; + + Min = XMVectorMin( Min, Corner ); + Max = XMVectorMax( Max, Corner ); + } + + // Store center and extents. + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != nullptr ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + return XMVector3InBounds( Point - vCenter, vExtents ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR d = XMVectorAbs( V0 - vCenter ); + XMVECTOR Inside = XMVectorLessOrEqual( d, vExtents ); + + d = XMVectorAbs( V1 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + d = XMVectorAbs( V2 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = BoxCenter - BoxExtents; + XMVECTOR BoxMax = BoxCenter + BoxExtents; + + // Find the distance to the nearest point on the box. 
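+ // (This is essentially Arvo's box-sphere test: clamp the sphere center to
+ // the box per axis and compare the squared distance to the squared radius.)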
+ // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = SphereCenter - BoxMin; + XMVECTOR MaxDelta = SphereCenter - BoxMax; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + + if ( XMVector3Greater( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual( BoxMin + SphereRadius, SphereCenter ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( SphereCenter, BoxMax - SphereRadius ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorGreater( BoxMax - BoxMin, SphereRadius ) ); + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = CenterA - ExtentsA; + XMVECTOR MaxA = CenterA + ExtentsA; + + XMVECTOR MinB = CenterB - ExtentsB; + XMVECTOR MaxB = CenterB + ExtentsB; + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + if ( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B + XMVECTOR Inside = XMVectorAndInt( XMVectorLessOrEqual( MinA, MinB ), XMVectorLessOrEqual( MaxB, MaxA ) ); + + return DirectX::Internal::XMVector3AllTrue( Inside ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingOrientedBox& box ) const +{ + if ( !box.Intersects( *this ) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Subtract off the AABB center to remove a subtract below + XMVECTOR oCenter = XMLoadFloat3( &box.Center ) - vCenter; + + XMVECTOR oExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR oOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( oOrientation ) ); + + XMVECTOR Inside = XMVectorTrueInt(); + + for( size_t i=0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( oExtents * g_BoxOffset[i], oOrientation ) + oCenter; + XMVECTOR d = XMVectorAbs(C); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + } + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Frustum in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects( *this ) ) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR Inside = XMVectorTrueInt(); + + for( size_t i=0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR Point = XMLoadFloat3( &Corners[i] ); + XMVECTOR d = XMVectorAbs( Point - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + } + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = BoxCenter - BoxExtents; + XMVECTOR BoxMax = BoxCenter + BoxExtents; + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = SphereCenter - BoxMin; + XMVECTOR MaxDelta = SphereCenter - BoxMax; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. 
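+ // d2 = d.x^2 + d.y^2 + d.z^2, the squared distance from the sphere center
+ // to the nearest point on the box, computed in a single dot product.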
+ XMVECTOR d2 = XMVector3Dot( d, d ); + + return XMVector3LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ); +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = CenterA - ExtentsA; + XMVECTOR MaxA = CenterA + ExtentsA; + + XMVECTOR MinB = CenterB - ExtentsB; + XMVECTOR MaxB = CenterB + ExtentsB; + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + return !DirectX::Internal::XMVector3AnyTrue( Disjoint ); +} + + +//----------------------------------------------------------------------------- +// Oriented box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingOrientedBox& box ) const +{ + return box.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. axis aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + XMVECTOR Zero = XMVectorZero(); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = vCenter - vExtents; + XMVECTOR BoxMax = vCenter + vExtents; + + // Test the axes of the box (in effect test the AAB against the minimal AAB + // around the triangle). + XMVECTOR TriMin = XMVectorMin( XMVectorMin( V0, V1 ), V2 ); + XMVECTOR TriMax = XMVectorMax( XMVectorMax( V0, V1 ), V2 ); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( TriMin, BoxMax ), XMVectorGreater( BoxMin, TriMax ) ); + if( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return false; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + // Assert that the triangle is not degenerate. 
+ assert( !XMVector3Equal( Normal, Zero ) );
+
+ // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
+ // else v_min(i)=b_max(i), v_max(i)=b_min(i)
+ XMVECTOR NormalSelect = XMVectorGreater( Normal, Zero );
+ XMVECTOR V_Min = XMVectorSelect( BoxMax, BoxMin, NormalSelect );
+ XMVECTOR V_Max = XMVectorSelect( BoxMin, BoxMax, NormalSelect );
+
+ // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
+ XMVECTOR MinDist = XMVector3Dot( V_Min, Normal );
+ XMVECTOR MaxDist = XMVector3Dot( V_Max, Normal );
+
+ XMVECTOR NoIntersection = XMVectorGreater( MinDist, Dist );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( MaxDist, Dist ) );
+
+ // Move the box center to zero to simplify the following tests.
+ XMVECTOR TV0 = V0 - vCenter;
+ XMVECTOR TV1 = V1 - vCenter;
+ XMVECTOR TV2 = V2 - vCenter;
+
+ // Test the edge/edge axes (3*3).
+ XMVECTOR e0 = TV1 - TV0;
+ XMVECTOR e1 = TV2 - TV1;
+ XMVECTOR e2 = TV0 - TV2;
+
+ // Make w zero.
+ e0 = XMVectorInsert<0, 0, 0, 0, 1>( e0, Zero );
+ e1 = XMVectorInsert<0, 0, 0, 0, 1>( e1, Zero );
+ e2 = XMVectorInsert<0, 0, 0, 0, 1>( e2, Zero );
+
+ XMVECTOR Axis;
+ XMVECTOR p0, p1, p2;
+ XMVECTOR Min, Max;
+ XMVECTOR Radius;
+
+ // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e0, -e0 );
+ p0 = XMVector3Dot( TV0, Axis );
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot( TV2, Axis );
+ Min = XMVectorMin( p0, p2 );
+ Max = XMVectorMax( p0, p2 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e1, -e1 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e2, -e2 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e0, -e0 );
+ p0 = XMVector3Dot( TV0, Axis );
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot( TV2, Axis );
+ Min = XMVectorMin( p0, p2 );
+ Max = XMVectorMax( p0, p2 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e1, -e1 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e2, -e2 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e0, -e0 );
+ p0 = XMVector3Dot( TV0, Axis );
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot( TV2, Axis );
+ Min = XMVectorMin( p0, p2 );
+ Max = XMVectorMax( p0, p2 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e1, -e1 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e2, -e2 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() );
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Plane ) const
+{
+ assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+ XMVECTOR Outside, Inside;
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane, Outside, Inside );
+
+ // If the box is fully in front of the plane it is in front.
+ if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+ return FRONT;
+
+ // If the box is fully behind the plane it is behind.
+ if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+ return BACK;
+
+ // The box straddles the plane, so it intersects.
+ return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an axis aligned
+// box using the slabs method.
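+// For each slab the ray enters at one of t1/t2 and exits at the other; the
+// ray hits the box iff the largest entry time is not greater than the
+// smallest exit time and the exit is not behind the ray origin (slabs the
+// ray is parallel to are handled separately).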
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+ assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Adjust ray origin to be relative to center of the box.
+ XMVECTOR TOrigin = vCenter - Origin;
+
+ // Compute the dot product against each axis of the box.
+ // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
+ XMVECTOR AxisDotOrigin = TOrigin;
+ XMVECTOR AxisDotDirection = Direction;
+
+ // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+ XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
+
+ // Test against all three axes simultaneously.
+ XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
+ XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
+ XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
+
+ // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
+ // use the results from any directions parallel to the slab.
+ XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
+ XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
+
+ // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+ // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+ t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) ); // x = max(x,y)
+ t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) ); // x = max(max(x,y),z)
+ t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) ); // x = min(x,y)
+ t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) ); // x = min(min(x,y),z)
+
+ // if ( t_min > t_max ) return false;
+ XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
+
+ // if ( t_max < 0.0f ) return false;
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
+
+ // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+ XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
+
+ if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
+ {
+ // Store the x-component to Dist
+ XMStoreFloat( &Dist, t_min );
+ return true;
+ }
+
+ Dist = 0.f;
+ return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an axis-aligned box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+ GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
+{
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+ XMVECTOR Outside, Inside;
+
+ // Test against each plane.
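+ // (AnyOutside accumulates with OR: outside any single plane means disjoint.
+ // AllInside accumulates with AND: inside every plane means fully contained.)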
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the box is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains two other bounding boxes +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateMerged( BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2 ) +{ + XMVECTOR b1Center = XMLoadFloat3( &b1.Center ); + XMVECTOR b1Extents = XMLoadFloat3( &b1.Extents ); + + XMVECTOR b2Center = XMLoadFloat3( &b2.Center ); + XMVECTOR b2Extents = XMLoadFloat3( &b2.Extents ); + + XMVECTOR Min = XMVectorSubtract( b1Center, b1Extents ); + Min = XMVectorMin( Min, XMVectorSubtract( b2Center, b2Extents ) ); + + XMVECTOR Max = XMVectorAdd( b1Center, b1Extents ); + Max = XMVectorMax( Max, XMVectorAdd( b2Center, b2Extents ) ); + + assert( XMVector3LessOrEqual( Min, Max ) ); + + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains a bounding sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromSphere( BoundingBox& Out, const BoundingSphere& sh ) +{ + XMVECTOR spCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR shRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR Min = XMVectorSubtract( spCenter, shRadius ); + XMVECTOR Max = XMVectorAdd( spCenter, shRadius ); + + assert( XMVector3LessOrEqual( Min, Max ) ); + + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box from min/max points +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV 
+{
+    XMVECTOR Min = XMVectorMin( pt1, pt2 );
+    XMVECTOR Max = XMVectorMax( pt1, pt2 );
+
+    // Store center and extents.
+    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the minimum axis aligned bounding box containing a set of points.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints( BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+    assert( Count > 0 );
+    assert( pPoints );
+
+    // Find the minimum and maximum x, y, and z
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMLoadFloat3( pPoints );
+
+    for( size_t i = 1; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+        vMin = XMVectorMin( vMin, Point );
+        vMax = XMVectorMax( vMax, Point );
+    }
+
+    // Store center and extents.
+    XMStoreFloat3( &Out.Center, ( vMin + vMax ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingOrientedBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an oriented box by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, FXMMATRIX M ) const
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Composite the box rotation and the transform rotation.
+    XMMATRIX nM;
+    nM.r[0] = XMVector3Normalize( M.r[0] );
+    nM.r[1] = XMVector3Normalize( M.r[1] );
+    nM.r[2] = XMVector3Normalize( M.r[2] );
+    nM.r[3] = g_XMIdentityR3;
+    XMVECTOR Rotation = XMQuaternionRotationMatrix( nM );
+    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );
+
+    // Transform the center.
+    vCenter = XMVector3Transform( vCenter, M );
+
+    // Scale the box extents.
+    XMVECTOR dX = XMVector3Length( M.r[0] );
+    XMVECTOR dY = XMVector3Length( M.r[1] );
+    XMVECTOR dZ = XMVector3Length( M.r[2] );
+
+    XMVECTOR VectorScale = XMVectorSelect( dY, dX, g_XMSelect1000 );
+    VectorScale = XMVectorSelect( dZ, VectorScale, g_XMSelect1100 );
+    vExtents = vExtents * VectorScale;
+
+    // Store the box.
+    XMStoreFloat3( &Out.Center, vCenter );
+    XMStoreFloat3( &Out.Extents, vExtents );
+    XMStoreFloat4( &Out.Orientation, vOrientation );
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Composite the box rotation and the transform rotation.
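+    // Note XMQuaternionMultiply( Q1, Q2 ) composes the rotation Q1 followed by
+    // Q2, so the box's own orientation is applied first and the transform's
+    // rotation second.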
+ vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + vCenter = XMVector3Rotate( vCenter * VectorScale, Rotation ) + Translation; + + // Scale the box extents. + vExtents = vExtents * VectorScale; + + // Store the box. + XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat3( &Out.Extents, vExtents ); + XMStoreFloat4( &Out.Orientation, vOrientation ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != 0 ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( vExtents * g_BoxOffset[i], vOrientation ) + vCenter; + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the point to be local to the box. + XMVECTOR TPoint = XMVector3InverseRotate( Point - vCenter, vOrientation ); + + return XMVector3InBounds( TPoint, vExtents ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation ); + + BoundingBox box; + box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. 
+ return box.Contains( TV0, TV1, TV2 ); +} + + +//----------------------------------------------------------------------------- +// Sphere in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents ); + + XMVECTOR MinDelta = SphereCenter + BoxExtents; + XMVECTOR MaxDelta = SphereCenter - BoxExtents; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + XMVECTOR SphereRadiusSq = XMVectorMultiply( SphereRadius, SphereRadius ); + + if ( XMVector4Greater( d2, SphereRadiusSq ) ) + return DISJOINT; + + // See if we are completely inside the box + XMVECTOR SMin = SphereCenter - SphereRadius; + XMVECTOR SMax = SphereCenter + SphereRadius; + + return ( XMVector3InBounds( SMin, BoxExtents ) && XMVector3InBounds( SMax, BoxExtents ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingBox& box ) const +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. 
+ BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); + return Contains( obox ); +} + + +//----------------------------------------------------------------------------- +// Oriented bounding box in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingOrientedBox& box ) const +{ + if ( !Intersects(box) ) + return DISJOINT; + + // Load the boxes + XMVECTOR aCenter = XMLoadFloat3( &Center ); + XMVECTOR aExtents = XMLoadFloat3( &Extents ); + XMVECTOR aOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( aOrientation ) ); + + XMVECTOR bCenter = XMLoadFloat3( &box.Center ); + XMVECTOR bExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR bOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( bOrientation ) ); + + XMVECTOR offset = bCenter - aCenter; + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter + // Ca = invrotate( Cb - aCenter, aOrientation ) + + XMVECTOR C = XMVector3Rotate( bExtents * g_BoxOffset[i], bOrientation ) + offset; + C = XMVector3InverseRotate( C , aOrientation ); + + if ( !XMVector3InBounds( C, aExtents ) ) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Frustum in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects(*this) ) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3InverseRotate( XMLoadFloat3( &Corners[i] ) - vCenter, vOrientation ); + + if ( !XMVector3InBounds( C, vExtents ) ) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. 
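+    // Scalar sketch of the per-component clamp being vectorized below
+    // (c = sphere center component, e = box extent component):
+    //   if      (c < -e) d = c + e;    // below the box minimum
+    //   else if (c >  e) d = c - e;    // above the box maximum
+    //   else             d = 0;        // inside the slab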
+    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents );
+    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents );
+
+    XMVECTOR MinDelta = SphereCenter + BoxExtents;
+    XMVECTOR MaxDelta = SphereCenter - BoxExtents;
+
+    // Choose value for each dimension based on the comparison.
+    d = XMVectorSelect( d, MinDelta, LessThanMin );
+    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
+
+    // Use a dot-product to square them and sum them together.
+    XMVECTOR d2 = XMVector3Dot( d, d );
+
+    return XMVector4LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Axis aligned box vs. oriented box. Constructs an oriented box and uses
+// the oriented box vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingBox& box ) const
+{
+    // Make the axis aligned box oriented and do an OBB vs OBB test.
+    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
+    return Intersects( obox );
+}
+
+
+//-----------------------------------------------------------------------------
+// Fast oriented box / oriented box intersection test using the separating axis
+// theorem.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingOrientedBox& box ) const
+{
+    // Build the 3x3 rotation matrix that defines the orientation of B relative to A.
+    XMVECTOR A_quat = XMLoadFloat4( &Orientation );
+    XMVECTOR B_quat = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( A_quat ) );
+    assert( DirectX::Internal::XMQuaternionIsUnit( B_quat ) );
+
+    XMVECTOR Q = XMQuaternionMultiply( A_quat, XMQuaternionConjugate( B_quat ) );
+    XMMATRIX R = XMMatrixRotationQuaternion( Q );
+
+    // Compute the translation of B relative to A.
+    XMVECTOR A_cent = XMLoadFloat3( &Center );
+    XMVECTOR B_cent = XMLoadFloat3( &box.Center );
+    XMVECTOR t = XMVector3InverseRotate( B_cent - A_cent, A_quat );
+
+    //
+    // h(A) = extents of A.
+    // h(B) = extents of B.
+    //
+    // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1)
+    // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22)
+    //
+    // For each possible separating axis l:
+    //   d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l )
+    //   d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l )
+    //   if abs( t dot l ) > d(A) + d(B) then disjoint
+    //
+
+    // Load extents of A and B.
+    XMVECTOR h_A = XMLoadFloat3( &Extents );
+    XMVECTOR h_B = XMLoadFloat3( &box.Extents );
+
+    // Rows. Note R[0,1,2]X.w = 0.
+    XMVECTOR R0X = R.r[0];
+    XMVECTOR R1X = R.r[1];
+    XMVECTOR R2X = R.r[2];
+
+    R = XMMatrixTranspose( R );
+
+    // Columns. Note RX[0,1,2].w = 0.
+    XMVECTOR RX0 = R.r[0];
+    XMVECTOR RX1 = R.r[1];
+    XMVECTOR RX2 = R.r[2];
+
+    // Absolute value of rows.
+    XMVECTOR AR0X = XMVectorAbs( R0X );
+    XMVECTOR AR1X = XMVectorAbs( R1X );
+    XMVECTOR AR2X = XMVectorAbs( R2X );
+
+    // Absolute value of columns.
+    XMVECTOR ARX0 = XMVectorAbs( RX0 );
+    XMVECTOR ARX1 = XMVectorAbs( RX1 );
+    XMVECTOR ARX2 = XMVectorAbs( RX2 );
+
+    // Test each of the 15 possible separating axes.
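+    // Scalar sketch of a single separating-axis test, for reference:
+    //   float dist = fabsf( dot( t, l ) );
+    //   float dA = hA.u*fabsf(dot(au,l)) + hA.v*fabsf(dot(av,l)) + hA.w*fabsf(dot(aw,l));
+    //   float dB = hB.u*fabsf(dot(bu,l)) + hB.v*fabsf(dot(bv,l)) + hB.w*fabsf(dot(bw,l));
+    //   if ( dist > dA + dB ) the axis l separates the boxes.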
+    XMVECTOR d, d_A, d_B;
+
+    // l = a(u) = (1, 0, 0)
+    // t dot l = t.x
+    // d(A) = h(A).x
+    // d(B) = h(B) dot abs(r00, r01, r02)
+    d = XMVectorSplatX( t );
+    d_A = XMVectorSplatX( h_A );
+    d_B = XMVector3Dot( h_B, AR0X );
+    XMVECTOR NoIntersection = XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) );
+
+    // l = a(v) = (0, 1, 0)
+    // t dot l = t.y
+    // d(A) = h(A).y
+    // d(B) = h(B) dot abs(r10, r11, r12)
+    d = XMVectorSplatY( t );
+    d_A = XMVectorSplatY( h_A );
+    d_B = XMVector3Dot( h_B, AR1X );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) = (0, 0, 1)
+    // t dot l = t.z
+    // d(A) = h(A).z
+    // d(B) = h(B) dot abs(r20, r21, r22)
+    d = XMVectorSplatZ( t );
+    d_A = XMVectorSplatZ( h_A );
+    d_B = XMVector3Dot( h_B, AR2X );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = b(u) = (r00, r10, r20)
+    // d(A) = h(A) dot abs(r00, r10, r20)
+    // d(B) = h(B).x
+    d = XMVector3Dot( t, RX0 );
+    d_A = XMVector3Dot( h_A, ARX0 );
+    d_B = XMVectorSplatX( h_B );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = b(v) = (r01, r11, r21)
+    // d(A) = h(A) dot abs(r01, r11, r21)
+    // d(B) = h(B).y
+    d = XMVector3Dot( t, RX1 );
+    d_A = XMVector3Dot( h_A, ARX1 );
+    d_B = XMVectorSplatY( h_B );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = b(w) = (r02, r12, r22)
+    // d(A) = h(A) dot abs(r02, r12, r22)
+    // d(B) = h(B).z
+    d = XMVector3Dot( t, RX2 );
+    d_A = XMVector3Dot( h_A, ARX2 );
+    d_B = XMVectorSplatZ( h_B );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(u) x b(u) = (0, -r20, r10)
+    // d(A) = h(A) dot abs(0, r20, r10)
+    // d(B) = h(B) dot abs(0, r02, r01)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX0, -RX0 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX0 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR0X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(u) x b(v) = (0, -r21, r11)
+    // d(A) = h(A) dot abs(0, r21, r11)
+    // d(B) = h(B) dot abs(r02, 0, r00)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX1, -RX1 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX1 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR0X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(u) x b(w) = (0, -r22, r12)
+    // d(A) = h(A) dot abs(0, r22, r12)
+    // d(B) = h(B) dot abs(r01, r00, 0)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX2, -RX2 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX2 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR0X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(v) x b(u) = (r20, 0, -r00)
+    // d(A) = h(A) dot abs(r20, 0, r00)
+    // d(B) = h(B) dot abs(0, r12, r11)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX0, -RX0 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX0 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR1X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(v) x b(v) = (r21, 0, -r01)
+    // d(A) = h(A) dot abs(r21, 0, r01)
+    // d(B) = h(B) dot abs(r12, 0, r10)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX1, -RX1 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX1 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR1X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(v) x b(w) = (r22, 0, -r02)
+    // d(A) = h(A) dot abs(r22, 0, r02)
+    // d(B) = h(B) dot abs(r11, r10, 0)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX2, -RX2 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX2 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR1X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) x b(u) = (-r10, r00, 0)
+    // d(A) = h(A) dot abs(r10, r00, 0)
+    // d(B) = h(B) dot abs(0, r22, r21)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX0, -RX0 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX0 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR2X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) x b(v) = (-r11, r01, 0)
+    // d(A) = h(A) dot abs(r11, r01, 0)
+    // d(B) = h(B) dot abs(r22, 0, r20)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX1, -RX1 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX1 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR2X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) x b(w) = (-r12, r02, 0)
+    // d(A) = h(A) dot abs(r12, r02, 0)
+    // d(B) = h(B) dot abs(r21, r20, 0)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX2, -RX2 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX2 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR2X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // No separating axis found, so the boxes must intersect.
+    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. oriented box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingFrustum& fr ) const
+{
+    return fr.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    // Load the box center & orientation.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    // Transform the triangle vertices into the space of the box.
+    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation );
+    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation );
+    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation );
+
+    BoundingBox box;
+    box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f );
+    box.Extents = Extents;
+
+    // Use the triangle vs axis aligned box intersection routine.
+    return box.Intersects( TV0, TV1, TV2 );
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Plane ) const
+{
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside );
+
+    // If the box is outside any plane it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the box is inside all planes it is inside.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The box is not inside all planes or outside a plane; it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an oriented box
+// using the slabs method.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    static const XMVECTORU32 SelectY =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
+    };
+    static const XMVECTORU32 SelectZ =
+    {
+        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
+    };
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Get the box's normalized side directions.
+    XMMATRIX R = XMMatrixRotationQuaternion( vOrientation );
+
+    // Adjust ray origin to be relative to center of the box.
+    XMVECTOR TOrigin = vCenter - Origin;
+
+    // Compute the dot product against each axis of the box.
+    XMVECTOR AxisDotOrigin = XMVector3Dot( R.r[0], TOrigin );
+    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[1], TOrigin ), SelectY );
+    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[2], TOrigin ), SelectZ );
+
+    XMVECTOR AxisDotDirection = XMVector3Dot( R.r[0], Direction );
+    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[1], Direction ), SelectY );
+    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[2], Direction ), SelectZ );
+
+    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
+
+    // Test against all three axes simultaneously.
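+    // Scalar sketch of the slab test performed below (one slab per box axis):
+    //   t1 = (axisDotOrigin - extent) / axisDotDirection;
+    //   t2 = (axisDotOrigin + extent) / axisDotDirection;
+    //   t_min = max over axes of min(t1, t2);
+    //   t_max = min over axes of max(t1, t2);
+    //   hit if t_min <= t_max and t_max >= 0.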
+    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
+    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
+    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
+
+    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
+    // use the results from any directions parallel to the slab.
+    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
+    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
+
+    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
+    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
+    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
+    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
+
+    // if ( t_min > t_max ) return false;
+    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
+
+    // if ( t_max < 0.0f ) return false;
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
+
+    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
+
+    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
+    {
+        // Store the x-component to Dist.
+        XMStoreFloat( &Dist, t_min );
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an oriented box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+                                                                     GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the box is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the box is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The box is not inside all planes or outside a plane, so it may intersect.
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create oriented bounding box from axis-aligned bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromBoundingBox( BoundingOrientedBox& Out, const BoundingBox& box )
+{
+    Out.Center = box.Center;
+    Out.Extents = box.Extents;
+    Out.Orientation = XMFLOAT4( 0.f, 0.f, 0.f, 1.f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate minimum oriented bounding box containing a set of
+// points.  Exact computation of the minimum oriented bounding box is possible
+// but is slower and requires a more complex algorithm.
+// The algorithm works by computing the inertia tensor of the points and then
+// using the eigenvectors of the inertia tensor as the axes of the box.
+// Computing the inertia tensor of the convex hull of the points will usually
+// result in a better bounding box, but the computation is more complex.
+// Exact computation of the minimum oriented bounding box is possible, but the
+// best known algorithm is O(N^3) and is significantly more complex to implement.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromPoints( BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+    assert( Count > 0 );
+    assert( pPoints != 0 );
+
+    XMVECTOR CenterOfMass = XMVectorZero();
+
+    // Compute the center of mass and inertia tensor of the points.
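+    // The two loops below first average the points, then accumulate the
+    // (unnormalized) covariance terms about that center:
+    //   XX_YY_ZZ = ( sum(x*x), sum(y*y), sum(z*z) )
+    //   XY_XZ_YZ = ( sum(x*y), sum(x*z), sum(y*z) )
+    // The eigenvectors of this symmetric 3x3 matrix become the box axes.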
+    for( size_t i = 0; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+        CenterOfMass += Point;
+    }
+
+    CenterOfMass *= XMVectorReciprocal( XMVectorReplicate( float( Count ) ) );
+
+    // Compute the inertia tensor of the points around the center of mass.
+    // Using the center of mass is not strictly necessary, but will hopefully
+    // improve the stability of finding the eigenvectors.
+    XMVECTOR XX_YY_ZZ = XMVectorZero();
+    XMVECTOR XY_XZ_YZ = XMVectorZero();
+
+    for( size_t i = 0; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ) - CenterOfMass;
+
+        XX_YY_ZZ += Point * Point;
+
+        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>( Point );
+        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>( Point );
+
+        XY_XZ_YZ += XXY * YZZ;
+    }
+
+    XMVECTOR v1, v2, v3;
+
+    // Compute the eigenvectors of the inertia tensor.
+    DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix( XMVectorGetX( XX_YY_ZZ ), XMVectorGetY( XX_YY_ZZ ),
+                                                                  XMVectorGetZ( XX_YY_ZZ ),
+                                                                  XMVectorGetX( XY_XZ_YZ ), XMVectorGetY( XY_XZ_YZ ),
+                                                                  XMVectorGetZ( XY_XZ_YZ ),
+                                                                  &v1, &v2, &v3 );
+
+    // Put them in a matrix.
+    XMMATRIX R;
+
+    R.r[0] = XMVectorSetW( v1, 0.f );
+    R.r[1] = XMVectorSetW( v2, 0.f );
+    R.r[2] = XMVectorSetW( v3, 0.f );
+    R.r[3] = g_XMIdentityR3.v;
+
+    // Multiply by -1 to convert the matrix into a right handed coordinate
+    // system (Det ~= 1) in case the eigenvectors form a left handed
+    // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
+    // works on right handed matrices.
+    XMVECTOR Det = XMMatrixDeterminant( R );
+
+    if( XMVector4Less( Det, XMVectorZero() ) )
+    {
+        R.r[0] *= g_XMNegativeOne.v;
+        R.r[1] *= g_XMNegativeOne.v;
+        R.r[2] *= g_XMNegativeOne.v;
+    }
+
+    // Get the rotation quaternion from the matrix.
+    XMVECTOR vOrientation = XMQuaternionRotationMatrix( R );
+
+    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
+    vOrientation = XMQuaternionNormalize( vOrientation );
+
+    // Rebuild the rotation matrix from the quaternion.
+    R = XMMatrixRotationQuaternion( vOrientation );
+
+    // Build the rotation into the rotated space.
+    XMMATRIX InverseR = XMMatrixTranspose( R );
+
+    // Find the minimum OBB using the eigenvectors as the axes.
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMVector3TransformNormal( XMLoadFloat3( pPoints ), InverseR );
+
+    for( size_t i = 1; i < Count; ++i )
+    {
+        XMVECTOR Point = XMVector3TransformNormal( XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ),
+                                                   InverseR );
+
+        vMin = XMVectorMin( vMin, Point );
+        vMax = XMVectorMax( vMax, Point );
+    }
+
+    // Rotate the center into world space.
+    XMVECTOR vCenter = ( vMin + vMax ) * 0.5f;
+    vCenter = XMVector3TransformNormal( vCenter, R );
+
+    // Store center, extents, and orientation.
+    XMStoreFloat3( &Out.Center, vCenter );
+    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
+    XMStoreFloat4( &Out.Orientation, vOrientation );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingFrustum
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform a frustum by an angle preserving transform.
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, FXMMATRIX M ) const +{ + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the frustum rotation and the transform rotation + XMMATRIX nM; + nM.r[0] = XMVector3Normalize( M.r[0] ); + nM.r[1] = XMVector3Normalize( M.r[1] ); + nM.r[2] = XMVector3Normalize( M.r[2] ); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix( nM ); + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + vOrigin = XMVector3Transform( vOrigin, M ); + + // Store the frustum. + XMStoreFloat3( &Out.Origin, vOrigin ); + XMStoreFloat4( &Out.Orientation, vOrientation ); + + // Scale the near and far distances (the slopes remain the same). + XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); + XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] ); + XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] ); + + XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) ); + float Scale = sqrtf( XMVectorGetX(d) ); + + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); + + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the frustum rotation and the transform rotation. + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the origin. + vOrigin = XMVector3Rotate( vOrigin * XMVectorReplicate( Scale ), Rotation ) + Translation; + + // Store the frustum. + XMStoreFloat3( &Out.Origin, vOrigin ); + XMStoreFloat4( &Out.Orientation, vOrientation ); + + // Scale the near and far distances (the slopes remain the same). + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != 0 ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Build the corners of the frustum. 
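+    // Each slope vector below is a corner direction at unit depth, i.e.
+    // ( HorizontalSlope, VerticalSlope, 1 ); scaling it by the Near and Far
+    // distances yields the 8 corner positions in frustum-local space.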
+    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    // Returns the positions of the 8 corners of the bounding frustum.
+    //     Near    Far
+    //    0----1  4----5
+    //    |    |  |    |
+    //    |    |  |    |
+    //    3----2  7----6
+
+    XMVECTOR vCorners[CORNER_COUNT];
+    vCorners[0] = vLeftTop * vNear;
+    vCorners[1] = vRightTop * vNear;
+    vCorners[2] = vRightBottom * vNear;
+    vCorners[3] = vLeftBottom * vNear;
+    vCorners[4] = vLeftTop * vFar;
+    vCorners[5] = vRightTop * vFar;
+    vCorners[6] = vRightBottom * vFar;
+    vCorners[7] = vLeftBottom * vFar;
+
+    for( size_t i=0; i < CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVector3Rotate( vCorners[i], vOrientation ) + vOrigin;
+        XMStoreFloat3( &Corners[i], C );
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Point in frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR Point ) const
+{
+    // Build frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Transform point into local space of frustum.
+    XMVECTOR TPoint = XMVector3InverseRotate( Point - vOrigin, vOrientation );
+
+    // Set w to one.
+    TPoint = XMVectorInsert<0, 0, 0, 0, 1>( TPoint, XMVectorSplatOne() );
+
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Outside = Zero;
+
+    // Test point against each plane of the frustum.
+    for( size_t i = 0; i < 6; ++i )
+    {
+        XMVECTOR Dot = XMVector4Dot( TPoint, Planes[i] );
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dot, Zero ) );
+    }
+
+    return XMVector4NotEqualInt( Outside, XMVectorTrueInt() ) ? CONTAINS : DISJOINT;
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    // Load origin and orientation of the frustum.
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return TriangleTests::ContainedBy( V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingSphere& sh ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return sh.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingBox& box ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingOrientedBox& box ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingFrustum& fr ) const +{ + // Load origin and orientation of the frustum. 
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    // Create 6 planes (do it inline to encourage use of registers)
+    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
+    NearPlane = XMPlaneNormalize( NearPlane );
+
+    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
+    FarPlane = XMPlaneNormalize( FarPlane );
+
+    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
+    RightPlane = XMPlaneNormalize( RightPlane );
+
+    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
+    LeftPlane = XMPlaneNormalize( LeftPlane );
+
+    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
+    TopPlane = XMPlaneNormalize( TopPlane );
+
+    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
+    BottomPlane = XMPlaneNormalize( BottomPlane );
+
+    return fr.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact sphere vs frustum test.  The algorithm first checks the sphere against
+// the planes of the frustum; if the plane checks are indeterminate, it then
+// finds the nearest feature (face, edge, or corner) on the frustum to the
+// center of the sphere and compares the distance to that feature with the
+// radius of the sphere.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingSphere& sh ) const
+{
+    XMVECTOR Zero = XMVectorZero();
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Normalize the planes so we can compare to the sphere radius.
+    Planes[2] = XMVector3Normalize( Planes[2] );
+    Planes[3] = XMVector3Normalize( Planes[3] );
+    Planes[4] = XMVector3Normalize( Planes[4] );
+    Planes[5] = XMVector3Normalize( Planes[5] );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Load the sphere.
+    XMVECTOR vCenter = XMLoadFloat3( &sh.Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &sh.Radius );
+
+    // Transform the center of the sphere into the local space of frustum.
+    vCenter = XMVector3InverseRotate( vCenter - vOrigin, vOrientation );
+
+    // Set w of the center to one so we can dot4 with the plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    // Check against each plane of the frustum.
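+    // For each plane, with the sphere center at signed distance Dist from it:
+    //   Dist >  Radius  -> the sphere is fully outside that plane;
+    //   Dist < -Radius  -> the sphere is fully inside that plane;
+    //   otherwise the sphere straddles the plane and the test is indeterminate.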
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for( size_t i = 0; i < 6; ++i ) + { + Dist[i] = XMVector4Dot( vCenter, Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist[i], vRadius ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist[i], -vRadius ) ); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist[i], Zero ) ); + } + + // If the sphere is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the sphere is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for( size_t i = 0; i < 6; ++i ) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = vCenter - (Planes[i] * Dist[i]); + + // Set w of the point to one. + Point = XMVectorInsert<0, 0, 0, 0, 1>( Point, XMVectorSplatOne() ); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. + XMVECTOR InsideFace = XMVectorTrueInt(); + + for ( size_t j = 0; j < 4; j++ ) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt( InsideFace, + XMVectorLessOrEqual( XMVector4Dot( Point, Planes[plane_index] ), Zero ) ); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt( Intersects, + XMVectorAndInt( XMVectorGreater( Dist[i], Zero ), InsideFace ) ); + } + + if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. 
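+    // The nearest feature was not a face, so it must be an edge or a corner;
+    // build the corners and test the sphere against the 12 frustum edges
+    // (the corners are covered as the segment endpoints).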
+    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    XMVECTOR Corners[CORNER_COUNT];
+    Corners[0] = vRightTop * vNear;
+    Corners[1] = vRightBottom * vNear;
+    Corners[2] = vLeftTop * vNear;
+    Corners[3] = vLeftBottom * vNear;
+    Corners[4] = vRightTop * vFar;
+    Corners[5] = vRightBottom * vFar;
+    Corners[6] = vLeftTop * vFar;
+    Corners[7] = vLeftBottom * vFar;
+
+    // The Edges are:
+    static const size_t edges[12][2] =
+    {
+        { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 },    // Near plane
+        { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 },    // Far plane
+        { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
+    }; // Near to far
+
+    XMVECTOR RadiusSq = vRadius * vRadius;
+
+    // Check to see if the nearest feature is one of the edges (or corners).
+    for( size_t i = 0; i < 12; ++i )
+    {
+        size_t ei0 = edges[i][0];
+        size_t ei1 = edges[i][1];
+
+        // Find the nearest point on the edge to the center of the sphere.
+        // The corners of the frustum are included as the endpoints of the edges.
+        XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint( Corners[ei0], Corners[ei1], vCenter );
+
+        XMVECTOR Delta = vCenter - Point;
+
+        XMVECTOR DistSq = XMVector3Dot( Delta, Delta );
+
+        // If the distance from the center of the sphere to the point is less than
+        // the radius of the sphere then it must intersect.
+        Intersects = XMVectorOrInt( Intersects, XMVectorLessOrEqual( DistSq, RadiusSq ) );
+    }
+
+    if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) )
+        return true;
+
+    // The sphere must be outside the frustum.
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact axis aligned box vs frustum test. Constructs an oriented box and uses
+// the oriented box vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingBox& box ) const
+{
+    // Make the axis aligned box oriented and do an OBB vs frustum test.
+    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
+    return Intersects( obox );
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact oriented box vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingOrientedBox& box ) const
+{
+    static const XMVECTORU32 SelectY =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
+    };
+    static const XMVECTORU32 SelectZ =
+    {
+        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
+    };
+
+    XMVECTOR Zero = XMVectorZero();
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR FrustumOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( FrustumOrientation ) );
+
+    // Load the box.
+    XMVECTOR Center = XMLoadFloat3( &box.Center );
+    XMVECTOR Extents = XMLoadFloat3( &box.Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Transform the oriented box into the space of the frustum in order to
+    // minimize the number of transforms we have to do.
+    Center = XMVector3InverseRotate( Center - vOrigin, FrustumOrientation );
+    BoxOrientation = XMQuaternionMultiply( BoxOrientation, XMQuaternionConjugate( FrustumOrientation ) );
+
+    // Set w of the center to one so we can dot4 with the plane.
+    Center = XMVectorInsert<0, 0, 0, 0, 1>( Center, XMVectorSplatOne() );
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
+
+    // Check against each plane of the frustum.
+    XMVECTOR Outside = XMVectorFalseInt();
+    XMVECTOR InsideAll = XMVectorTrueInt();
+    XMVECTOR CenterInsideAll = XMVectorTrueInt();
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Compute the distance to the center of the box.
+        XMVECTOR Dist = XMVector4Dot( Center, Planes[i] );
+
+        // Project the axes of the box onto the normal of the plane. Half the
+        // length of the projection (sometimes called the "radius") is equal to
+        // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+        // where h(i) are the extents of the box, n is the plane normal, and b(i) are
+        // the axes of the box.
+        XMVECTOR Radius = XMVector3Dot( Planes[i], R.r[0] );
+        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[1] ), SelectY );
+        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[2] ), SelectZ );
+        Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+        // Outside the plane?
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, Radius ) );
+
+        // Fully inside the plane?
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist, -Radius ) );
+
+        // Check if the center is inside the plane.
+        CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist, Zero ) );
+    }
+
+    // If the box is outside any of the planes it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If the box is inside all planes it is fully inside.
+    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
+        return true;
+
+    // If the center of the box is inside all planes and the box intersects
+    // one or more planes then it must intersect.
+    if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) )
+        return true;
+
+    // Build the corners of the frustum.
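+    // No trivial accept or reject so far: the remaining cases are resolved with
+    // a separating axis test, projecting both volumes onto the box axes (3) and
+    // the edge/edge cross products (3*6) and looking for an axis with a gap.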
+    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    XMVECTOR Corners[CORNER_COUNT];
+    Corners[0] = vRightTop * vNear;
+    Corners[1] = vRightBottom * vNear;
+    Corners[2] = vLeftTop * vNear;
+    Corners[3] = vLeftBottom * vNear;
+    Corners[4] = vRightTop * vFar;
+    Corners[5] = vRightBottom * vFar;
+    Corners[6] = vLeftTop * vFar;
+    Corners[7] = vLeftBottom * vFar;
+
+    // Test against box axes (3)
+    {
+        // Find the min/max values of the projection of the frustum onto each axis.
+        XMVECTOR FrustumMin, FrustumMax;
+
+        FrustumMin = XMVector3Dot( Corners[0], R.r[0] );
+        FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[1] ), SelectY );
+        FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[2] ), SelectZ );
+        FrustumMax = FrustumMin;
+
+        for( size_t i = 1; i < CORNER_COUNT; ++i )
+        {
+            XMVECTOR Temp = XMVector3Dot( Corners[i], R.r[0] );
+            Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[1] ), SelectY );
+            Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[2] ), SelectZ );
+
+            FrustumMin = XMVectorMin( FrustumMin, Temp );
+            FrustumMax = XMVectorMax( FrustumMax, Temp );
+        }
+
+        // Project the center of the box onto the axes.
+        XMVECTOR BoxDist = XMVector3Dot( Center, R.r[0] );
+        BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[1] ), SelectY );
+        BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[2] ), SelectZ );
+
+        // The projection of the box onto the axis is just its Center and Extents.
+        // if (min > box_max || max < box_min) reject;
+        XMVECTOR Result = XMVectorOrInt( XMVectorGreater( FrustumMin, BoxDist + Extents ),
+                                         XMVectorLess( FrustumMax, BoxDist - Extents ) );
+
+        if( DirectX::Internal::XMVector3AnyTrue( Result ) )
+            return false;
+    }
+
+    // Test against edge/edge axes (3*6).
+    XMVECTOR FrustumEdgeAxis[6];
+
+    FrustumEdgeAxis[0] = vRightTop;
+    FrustumEdgeAxis[1] = vRightBottom;
+    FrustumEdgeAxis[2] = vLeftTop;
+    FrustumEdgeAxis[3] = vLeftBottom;
+    FrustumEdgeAxis[4] = vRightTop - vLeftTop;
+    FrustumEdgeAxis[5] = vLeftBottom - vLeftTop;
+
+    for( size_t i = 0; i < 3; ++i )
+    {
+        for( size_t j = 0; j < 6; j++ )
+        {
+            // Compute the axis we are going to test.
+            XMVECTOR Axis = XMVector3Cross( R.r[i], FrustumEdgeAxis[j] );
+
+            // Find the min/max values of the projection of the frustum onto the axis.
+            XMVECTOR FrustumMin, FrustumMax;
+
+            FrustumMin = FrustumMax = XMVector3Dot( Axis, Corners[0] );
+
+            for( size_t k = 1; k < CORNER_COUNT; k++ )
+            {
+                XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] );
+                FrustumMin = XMVectorMin( FrustumMin, Temp );
+                FrustumMax = XMVectorMax( FrustumMax, Temp );
+            }
+
+            // Project the center of the box onto the axis.
+            XMVECTOR Dist = XMVector3Dot( Center, Axis );
+
+            // Project the axes of the box onto the axis to find the "radius" of the box.
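+            // As above, radius = h(u)*|Axis . b(u)| + h(v)*|Axis . b(v)| + h(w)*|Axis . b(w)|.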
+            XMVECTOR Radius = XMVector3Dot( Axis, R.r[0] );
+            Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[1] ), SelectY );
+            Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[2] ), SelectZ );
+            Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+            // if (center > max + radius || center < min - radius) reject;
+            Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, FrustumMax + Radius ) );
+            Outside = XMVectorOrInt( Outside, XMVectorLess( Dist, FrustumMin - Radius ) );
+        }
+    }
+
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If we did not find a separating plane then the box must intersect the frustum.
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact frustum vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingFrustum& fr ) const
+{
+    // Load origin and orientation of frustum B.
+    XMVECTOR OriginB = XMLoadFloat3( &Origin );
+    XMVECTOR OrientationB = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( OrientationB ) );
+
+    // Build the planes of frustum B.
+    XMVECTOR AxisB[6];
+    AxisB[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f );
+    AxisB[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f );
+    AxisB[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    AxisB[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    AxisB[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    AxisB[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    XMVECTOR PlaneDistB[6];
+    PlaneDistB[0] = -XMVectorReplicatePtr( &Near );
+    PlaneDistB[1] = XMVectorReplicatePtr( &Far );
+    PlaneDistB[2] = XMVectorZero();
+    PlaneDistB[3] = XMVectorZero();
+    PlaneDistB[4] = XMVectorZero();
+    PlaneDistB[5] = XMVectorZero();
+
+    // Load origin and orientation of frustum A.
+    XMVECTOR OriginA = XMLoadFloat3( &fr.Origin );
+    XMVECTOR OrientationA = XMLoadFloat4( &fr.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( OrientationA ) );
+
+    // Transform frustum A into the space of frustum B in order to
+    // minimize the number of transforms we have to do.
+    OriginA = XMVector3InverseRotate( OriginA - OriginB, OrientationB );
+    OrientationA = XMQuaternionMultiply( OrientationA, XMQuaternionConjugate( OrientationB ) );
+
+    // Build the corners of frustum A (in the local space of B).
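+    // Corner directions ( slope_x, slope_y, 1 ) are rotated into B's frame
+    // first; each corner is then OriginA + direction * distance for the near
+    // and far distances.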
+    XMVECTOR RightTopA = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottomA = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTopA = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottomA = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f );
+    XMVECTOR NearA = XMVectorReplicatePtr( &fr.Near );
+    XMVECTOR FarA = XMVectorReplicatePtr( &fr.Far );
+
+    RightTopA = XMVector3Rotate( RightTopA, OrientationA );
+    RightBottomA = XMVector3Rotate( RightBottomA, OrientationA );
+    LeftTopA = XMVector3Rotate( LeftTopA, OrientationA );
+    LeftBottomA = XMVector3Rotate( LeftBottomA, OrientationA );
+
+    XMVECTOR CornersA[CORNER_COUNT];
+    CornersA[0] = OriginA + RightTopA * NearA;
+    CornersA[1] = OriginA + RightBottomA * NearA;
+    CornersA[2] = OriginA + LeftTopA * NearA;
+    CornersA[3] = OriginA + LeftBottomA * NearA;
+    CornersA[4] = OriginA + RightTopA * FarA;
+    CornersA[5] = OriginA + RightBottomA * FarA;
+    CornersA[6] = OriginA + LeftTopA * FarA;
+    CornersA[7] = OriginA + LeftBottomA * FarA;
+
+    // Check frustum A against each plane of frustum B.
+    XMVECTOR Outside = XMVectorFalseInt();
+    XMVECTOR InsideAll = XMVectorTrueInt();
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Find the min/max projection of the frustum onto the plane normal.
+        XMVECTOR Min, Max;
+
+        Min = Max = XMVector3Dot( AxisB[i], CornersA[0] );
+
+        for( size_t j = 1; j < CORNER_COUNT; j++ )
+        {
+            XMVECTOR Temp = XMVector3Dot( AxisB[i], CornersA[j] );
+            Min = XMVectorMin( Min, Temp );
+            Max = XMVectorMax( Max, Temp );
+        }
+
+        // Outside the plane?
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistB[i] ) );
+
+        // Fully inside the plane?
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Max, PlaneDistB[i] ) );
+    }
+
+    // If frustum A is outside any of the planes of frustum B it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If frustum A is inside all planes of frustum B it is fully inside.
+    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
+        return true;
+
+    // Build the corners of frustum B.
+    XMVECTOR RightTopB = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottomB = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTopB = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottomB = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR NearB = XMVectorReplicatePtr( &Near );
+    XMVECTOR FarB = XMVectorReplicatePtr( &Far );
+
+    XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT];
+    CornersB[0] = RightTopB * NearB;
+    CornersB[1] = RightBottomB * NearB;
+    CornersB[2] = LeftTopB * NearB;
+    CornersB[3] = LeftBottomB * NearB;
+    CornersB[4] = RightTopB * FarB;
+    CornersB[5] = RightBottomB * FarB;
+    CornersB[6] = LeftTopB * FarB;
+    CornersB[7] = LeftBottomB * FarB;
+
+    // Build the planes of frustum A (in the local space of B).
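+    // The local plane normals are rotated into B's frame; each plane distance
+    // is then recovered by dotting the normal with a known point on that plane.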
+    XMVECTOR AxisA[6];
+    XMVECTOR PlaneDistA[6];
+
+    AxisA[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f );
+    AxisA[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f );
+    AxisA[2] = XMVectorSet( 1.0f, 0.0f, -fr.RightSlope, 0.0f );
+    AxisA[3] = XMVectorSet( -1.0f, 0.0f, fr.LeftSlope, 0.0f );
+    AxisA[4] = XMVectorSet( 0.0f, 1.0f, -fr.TopSlope, 0.0f );
+    AxisA[5] = XMVectorSet( 0.0f, -1.0f, fr.BottomSlope, 0.0f );
+
+    AxisA[0] = XMVector3Rotate( AxisA[0], OrientationA );
+    AxisA[1] = -AxisA[0];
+    AxisA[2] = XMVector3Rotate( AxisA[2], OrientationA );
+    AxisA[3] = XMVector3Rotate( AxisA[3], OrientationA );
+    AxisA[4] = XMVector3Rotate( AxisA[4], OrientationA );
+    AxisA[5] = XMVector3Rotate( AxisA[5], OrientationA );
+
+    PlaneDistA[0] = XMVector3Dot( AxisA[0], CornersA[0] );    // Re-use corner on near plane.
+    PlaneDistA[1] = XMVector3Dot( AxisA[1], CornersA[4] );    // Re-use corner on far plane.
+    PlaneDistA[2] = XMVector3Dot( AxisA[2], OriginA );
+    PlaneDistA[3] = XMVector3Dot( AxisA[3], OriginA );
+    PlaneDistA[4] = XMVector3Dot( AxisA[4], OriginA );
+    PlaneDistA[5] = XMVector3Dot( AxisA[5], OriginA );
+
+    // Check each axis of frustum A for a separating plane (5).
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Find the minimum projection of the frustum onto the plane normal.
+        XMVECTOR Min;
+
+        Min = XMVector3Dot( AxisA[i], CornersB[0] );
+
+        for( size_t j = 1; j < CORNER_COUNT; j++ )
+        {
+            XMVECTOR Temp = XMVector3Dot( AxisA[i], CornersB[j] );
+            Min = XMVectorMin( Min, Temp );
+        }
+
+        // Outside the plane?
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistA[i] ) );
+    }
+
+    // If frustum B is outside any of the planes of frustum A it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // Check edge/edge axes (6 * 6).
+    XMVECTOR FrustumEdgeAxisA[6];
+    FrustumEdgeAxisA[0] = RightTopA;
+    FrustumEdgeAxisA[1] = RightBottomA;
+    FrustumEdgeAxisA[2] = LeftTopA;
+    FrustumEdgeAxisA[3] = LeftBottomA;
+    FrustumEdgeAxisA[4] = RightTopA - LeftTopA;
+    FrustumEdgeAxisA[5] = LeftBottomA - LeftTopA;
+
+    XMVECTOR FrustumEdgeAxisB[6];
+    FrustumEdgeAxisB[0] = RightTopB;
+    FrustumEdgeAxisB[1] = RightBottomB;
+    FrustumEdgeAxisB[2] = LeftTopB;
+    FrustumEdgeAxisB[3] = LeftBottomB;
+    FrustumEdgeAxisB[4] = RightTopB - LeftTopB;
+    FrustumEdgeAxisB[5] = LeftBottomB - LeftTopB;
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        for( size_t j = 0; j < 6; j++ )
+        {
+            // Compute the axis we are going to test.
+            XMVECTOR Axis = XMVector3Cross( FrustumEdgeAxisA[i], FrustumEdgeAxisB[j] );
+
+            // Find the min/max values of the projection of both frustums onto the axis.
+            XMVECTOR MinA, MaxA;
+            XMVECTOR MinB, MaxB;
+
+            MinA = MaxA = XMVector3Dot( Axis, CornersA[0] );
+            MinB = MaxB = XMVector3Dot( Axis, CornersB[0] );
+
+            for( size_t k = 1; k < CORNER_COUNT; k++ )
+            {
+                XMVECTOR TempA = XMVector3Dot( Axis, CornersA[k] );
+                MinA = XMVectorMin( MinA, TempA );
+                MaxA = XMVectorMax( MaxA, TempA );
+
+                XMVECTOR TempB = XMVector3Dot( Axis, CornersB[k] );
+                MinB = XMVectorMin( MinB, TempB );
+                MaxB = XMVectorMax( MaxB, TempB );
+            }
+
+            // if (MinA > MaxB || MinB > MaxA) reject
+            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) );
+            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) );
+        }
+    }
+
+    // If there is a separating plane, then the frustums do not intersect.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If we did not find a separating plane then the frustums intersect.
+ return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, -Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Transform triangle into the local space of frustum. + XMVECTOR TV0 = XMVector3InverseRotate( V0 - vOrigin, vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( V1 - vOrigin, vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( V2 - vOrigin, vOrientation ); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + XMVECTOR Dist0 = XMVector3Dot( TV0, Planes[i] ); + XMVECTOR Dist1 = XMVector3Dot( TV1, Planes[i] ); + XMVECTOR Dist2 = XMVector3Dot( TV2, Planes[i] ); + + XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 ); + MinDist = XMVectorMin( MinDist, Dist2 ); + XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 ); + MaxDist = XMVectorMax( MaxDist, Dist2 ); + + XMVECTOR PlaneDist = XMVectorSplatW( Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinDist, PlaneDist ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( MaxDist, PlaneDist ) ); + } + + // If the triangle is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the triangle is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + // Test the plane of the triangle. 
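+    // The triangle lies in the plane Normal . X = Dist with
+    // Normal = (V1 - V0) x (V2 - V0). If every frustum corner projects to one
+    // side of that plane, the plane separates the triangle from the frustum.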
+ XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot( Corners[0], Normal ); + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + XMVECTOR Temp = XMVector3Dot( Corners[i], Normal ); + MinDist = XMVectorMin( MinDist, Temp ); + MaxDist = XMVectorMax( MaxDist, Temp ); + } + + Outside = XMVectorOrInt( XMVectorGreater( MinDist, Dist ), XMVectorLess( MaxDist, Dist ) ); + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // Check the edge/edge axes (3*6). + XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = V1 - V0; + TriangleEdgeAxis[1] = V2 - V1; + TriangleEdgeAxis[2] = V0 - V2; + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = vRightTop - vLeftTop; + FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; + + for( size_t i = 0; i < 3; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( TriangleEdgeAxis[i], FrustumEdgeAxis[j] ); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot( V0, Axis ); + XMVECTOR Dist1 = XMVector3Dot( V1, Axis ); + XMVECTOR Dist2 = XMVector3Dot( V2, Axis ); + + MinA = XMVectorMin( Dist0, Dist1 ); + MinA = XMVectorMin( MinA, Dist2 ); + MaxA = XMVectorMax( Dist0, Dist1 ); + MaxA = XMVectorMax( MaxA, Dist2 ); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot( Axis, Corners[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); + MinB = XMVectorMin( MinB, Temp ); + MaxB = XMVectorMax( MaxB, Temp ); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); + } + } + + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); + + // Build the corners of the frustum (in world space). 
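+    // Corner directions are rotated by the frustum orientation and offset by
+    // its origin, so the plane test below runs in world space.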
+    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    RightTop = XMVector3Rotate( RightTop, vOrientation );
+    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
+    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
+    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
+
+    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
+    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
+    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
+    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
+    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
+    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
+    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
+    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane, Outside, Inside );
+
+    // If the frustum is fully outside the plane it is in front of the plane.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the frustum is fully inside the plane it is behind the plane.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The frustum is neither fully inside nor fully outside the plane, so it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Ray vs. frustum test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist ) const
+{
+    // If the ray starts inside the frustum, return a distance of 0 for the hit.
+    if ( Contains(rayOrigin) == CONTAINS )
+    {
+        Dist = 0.0f;
+        return true;
+    }
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR frOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR frOrientation = XMLoadFloat4( &Orientation );
+
+    // This algorithm is based on "Fast Ray-Convex Polyhedron Intersection," in James Arvo, ed., Graphics Gems II, pp. 247-250.
+    float tnear = -FLT_MAX;
+    float tfar = FLT_MAX;
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        XMVECTOR Plane = DirectX::Internal::XMPlaneTransform( Planes[i], frOrientation, frOrigin );
+        Plane = XMPlaneNormalize( Plane );
+
+        XMVECTOR AxisDotOrigin = XMPlaneDotCoord( Plane, rayOrigin );
+        XMVECTOR AxisDotDirection = XMVector3Dot( Plane, Direction );
+
+        if ( XMVector3LessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ) )
+        {
+            // Ray is parallel to the plane - check if the ray origin is inside the plane's half-space.
+            if ( XMVector3Greater( AxisDotOrigin, g_XMZero ) )
+            {
+                // Ray origin is outside the half-space.
+                Dist = 0.f;
+                return false;
+            }
+        }
+        else
+        {
+            // Ray not parallel - get distance to plane.
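+            // t = -(Plane . rayOrigin) / (PlaneNormal . Direction); the slab test
+            // below keeps the running [tnear, tfar] interval.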
+            float vd = XMVectorGetX( AxisDotDirection );
+            float vn = XMVectorGetX( AxisDotOrigin );
+            float t = -vn / vd;
+            if (vd < 0.0f)
+            {
+                // Front face - t is a near point.
+                if (t > tfar)
+                {
+                    Dist = 0.f;
+                    return false;
+                }
+                if (t > tnear)
+                {
+                    // Hit near face.
+                    tnear = t;
+                }
+            }
+            else
+            {
+                // Back face - t is a far point.
+                if (t < tnear)
+                {
+                    Dist = 0.f;
+                    return false;
+                }
+                if (t < tfar)
+                {
+                    // Hit far face.
+                    tfar = t;
+                }
+            }
+        }
+    }
+
+    // Survived all tests.
+    // Note: if the ray originates on the polyhedron, you may want to change 0.0f to some
+    // epsilon to avoid intersecting the originating face.
+    float distance = ( tnear >= 0.0f ) ? tnear : tfar;
+    if (distance >= 0.0f)
+    {
+        Dist = distance;
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a frustum vs 6 planes (typically forming another frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+                                                                 GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
+{
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Set w of the origin to one so we can dot4 with a plane.
+    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() );
+
+    // Build the corners of the frustum (in world space).
+    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    RightTop = XMVector3Rotate( RightTop, vOrientation );
+    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
+    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
+    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
+
+    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
+    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
+    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
+    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
+    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
+    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
+    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
+    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
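+    // Accumulate an "outside any plane" mask and an "inside all planes" mask
+    // across the six planes; the final classification reads those masks.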
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane1, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane2, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane3, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane4, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane5, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the frustum is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the frustum is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The frustum is not fully inside all planes nor fully outside any plane, so it may intersect.
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Build the 6 frustum planes from a frustum.
+//
+// The intended use for these routines is for fast culling to a view frustum.
+// When the volume being tested against a view frustum is small relative to the
+// view frustum it is usually either inside all six planes of the frustum
+// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither
+// of these cases is true then it may or may not be intersecting the frustum
+// (INTERSECTS).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingFrustum::GetPlanes( XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane,
+                                        XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane ) const
+{
+    // Load origin and orientation of the frustum.
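+    // Each requested plane below is the frustum-local plane transformed by the
+    // origin and orientation, then renormalized.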
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    if (NearPlane)
+    {
+        XMVECTOR vNearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+        vNearPlane = DirectX::Internal::XMPlaneTransform( vNearPlane, vOrientation, vOrigin );
+        *NearPlane = XMPlaneNormalize( vNearPlane );
+    }
+
+    if (FarPlane)
+    {
+        XMVECTOR vFarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+        vFarPlane = DirectX::Internal::XMPlaneTransform( vFarPlane, vOrientation, vOrigin );
+        *FarPlane = XMPlaneNormalize( vFarPlane );
+    }
+
+    if (RightPlane)
+    {
+        XMVECTOR vRightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+        vRightPlane = DirectX::Internal::XMPlaneTransform( vRightPlane, vOrientation, vOrigin );
+        *RightPlane = XMPlaneNormalize( vRightPlane );
+    }
+
+    if (LeftPlane)
+    {
+        XMVECTOR vLeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+        vLeftPlane = DirectX::Internal::XMPlaneTransform( vLeftPlane, vOrientation, vOrigin );
+        *LeftPlane = XMPlaneNormalize( vLeftPlane );
+    }
+
+    if (TopPlane)
+    {
+        XMVECTOR vTopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+        vTopPlane = DirectX::Internal::XMPlaneTransform( vTopPlane, vOrientation, vOrigin );
+        *TopPlane = XMPlaneNormalize( vTopPlane );
+    }
+
+    if (BottomPlane)
+    {
+        XMVECTOR vBottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+        vBottomPlane = DirectX::Internal::XMPlaneTransform( vBottomPlane, vOrientation, vOrigin );
+        *BottomPlane = XMPlaneNormalize( vBottomPlane );
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Build a frustum from a perspective projection matrix. The matrix may only
+// contain a projection; any rotation, translation or scale will cause the
+// constructed frustum to be incorrect.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix( BoundingFrustum& Out, FXMMATRIX Projection )
+{
+    // Corners of the projection frustum in homogeneous space.
+    static XMVECTORF32 HomogenousPoints[6] =
+    {
+        {  1.0f,  0.0f, 1.0f, 1.0f },   // right (at far plane)
+        { -1.0f,  0.0f, 1.0f, 1.0f },   // left
+        {  0.0f,  1.0f, 1.0f, 1.0f },   // top
+        {  0.0f, -1.0f, 1.0f, 1.0f },   // bottom
+
+        {  0.0f,  0.0f, 0.0f, 1.0f },   // near
+        {  0.0f,  0.0f, 1.0f, 1.0f }    // far
+    };
+
+    XMVECTOR Determinant;
+    XMMATRIX matInverse = XMMatrixInverse( &Determinant, Projection );
+
+    // Compute the frustum corners in world space.
+    XMVECTOR Points[6];
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Transform point.
+        Points[i] = XMVector4Transform( HomogenousPoints[i], matInverse );
+    }
+
+    Out.Origin = XMFLOAT3( 0.0f, 0.0f, 0.0f );
+    Out.Orientation = XMFLOAT4( 0.0f, 0.0f, 0.0f, 1.0f );
+
+    // Compute the slopes.
+    Points[0] = Points[0] * XMVectorReciprocal( XMVectorSplatZ( Points[0] ) );
+    Points[1] = Points[1] * XMVectorReciprocal( XMVectorSplatZ( Points[1] ) );
+    Points[2] = Points[2] * XMVectorReciprocal( XMVectorSplatZ( Points[2] ) );
+    Points[3] = Points[3] * XMVectorReciprocal( XMVectorSplatZ( Points[3] ) );
+
+    Out.RightSlope = XMVectorGetX( Points[0] );
+    Out.LeftSlope = XMVectorGetX( Points[1] );
+    Out.TopSlope = XMVectorGetY( Points[2] );
+    Out.BottomSlope = XMVectorGetY( Points[3] );
+
+    // Compute near and far.
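+    // Perspective-divide the near and far points by w; their z components are
+    // the near and far plane distances.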
+    Points[4] = Points[4] * XMVectorReciprocal( XMVectorSplatW( Points[4] ) );
+    Points[5] = Points[5] * XMVectorReciprocal( XMVectorSplatW( Points[5] ) );
+
+    Out.Near = XMVectorGetZ( Points[4] );
+    Out.Far = XMVectorGetZ( Points[5] );
+}
+
+
+/****************************************************************************
+ *
+ * TriangleTests
+ *
+ ****************************************************************************/
+
+namespace TriangleTests
+{
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with a triangle
+// (V0, V1, V2). Return true if there is an intersection and also set Dist
+// to the distance along the ray to the intersection.
+//
+// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage
+// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1,
+// pp 21-28, 1997.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV Intersects( FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1, HXMVECTOR V2, float& Dist )
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR e1 = V1 - V0;
+    XMVECTOR e2 = V2 - V0;
+
+    // p = Direction ^ e2;
+    XMVECTOR p = XMVector3Cross( Direction, e2 );
+
+    // det = e1 * p;
+    XMVECTOR det = XMVector3Dot( e1, p );
+
+    XMVECTOR u, v, t;
+
+    if( XMVector3GreaterOrEqual( det, g_RayEpsilon ) )
+    {
+        // Determinant is positive (front side of the triangle).
+        XMVECTOR s = Origin - V0;
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorLess( u, Zero );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u + v, det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( t, Zero ) );
+
+        if( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else if( XMVector3LessOrEqual( det, g_RayNegEpsilon ) )
+    {
+        // Determinant is negative (back side of the triangle).
+        XMVECTOR s = Origin - V0;
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorGreater( u, Zero );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u + v, det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( t, Zero ) );
+
+        if ( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else
+    {
+        // Parallel ray.
+        Dist = 0.f;
+        return false;
+    }
+
+    t = XMVectorDivide( t, det );
+
+    // (u / det) and (v / det) are the barycentric coordinates of the intersection.
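+    // If the intersection point itself is needed, it can be recovered as
+    // (1 - u/det - v/det) * V0 + (u/det) * V1 + (v/det) * V2.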
+
+    // Store the x-component to Dist.
+    XMStoreFloat( &Dist, t );
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test if two triangles intersect.
+//
+// The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
+// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
+// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
+// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
+// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
+//
+// The final test could be considered an edge-edge separating plane test with
+// the 9 possible cases narrowed down to the only two pairs of edges that can
+// actually result in a separation.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV Intersects( FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2 )
+{
+    static const XMVECTORU32 SelectY =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
+    };
+    static const XMVECTORU32 SelectZ =
+    {
+        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
+    };
+    static const XMVECTORU32 Select0111 =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1
+    };
+    static const XMVECTORU32 Select1011 =
+    {
+        XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1
+    };
+    static const XMVECTORU32 Select1101 =
+    {
+        XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1
+    };
+
+    XMVECTOR Zero = XMVectorZero();
+
+    // Compute the normal of triangle A.
+    XMVECTOR N1 = XMVector3Cross( A1 - A0, A2 - A0 );
+
+    // Assert that the triangle is not degenerate.
+    assert( !XMVector3Equal( N1, Zero ) );
+
+    // Test points of B against the plane of A.
+    XMVECTOR BDist = XMVector3Dot( N1, B0 - A0 );
+    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B1 - A0 ), SelectY );
+    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B2 - A0 ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances.
+    uint32_t BDistIsZeroCR;
+    XMVECTOR BDistIsZero = XMVectorGreaterR( &BDistIsZeroCR, g_RayEpsilon, XMVectorAbs( BDist ) );
+    BDist = XMVectorSelect( BDist, Zero, BDistIsZero );
+
+    uint32_t BDistIsLessCR;
+    XMVECTOR BDistIsLess = XMVectorGreaterR( &BDistIsLessCR, Zero, BDist );
+
+    uint32_t BDistIsGreaterCR;
+    XMVECTOR BDistIsGreater = XMVectorGreaterR( &BDistIsGreaterCR, BDist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( BDistIsLessCR ) || XMComparisonAllTrue( BDistIsGreaterCR ) )
+        return false;
+
+    // Compute the normal of triangle B.
+    XMVECTOR N2 = XMVector3Cross( B1 - B0, B2 - B0 );
+
+    // Assert that the triangle is not degenerate.
+    assert( !XMVector3Equal( N2, Zero ) );
+
+    // Test points of A against the plane of B.
+    XMVECTOR ADist = XMVector3Dot( N2, A0 - B0 );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A1 - B0 ), SelectY );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A2 - B0 ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances.
+    uint32_t ADistIsZeroCR;
+    XMVECTOR ADistIsZero = XMVectorGreaterR( &ADistIsZeroCR, g_RayEpsilon, XMVectorAbs( ADist ) );
+    ADist = XMVectorSelect( ADist, Zero, ADistIsZero );
+
+    uint32_t ADistIsLessCR;
+    XMVECTOR ADistIsLess = XMVectorGreaterR( &ADistIsLessCR, Zero, ADist );
+
+    uint32_t ADistIsGreaterCR;
+    XMVECTOR ADistIsGreater = XMVectorGreaterR( &ADistIsGreaterCR, ADist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( ADistIsLessCR ) || XMComparisonAllTrue( ADistIsGreaterCR ) )
+        return false;
+
+    // Special case for co-planar triangles.
+    if( XMComparisonAllTrue( ADistIsZeroCR ) || XMComparisonAllTrue( BDistIsZeroCR ) )
+    {
+        XMVECTOR Axis, Dist, MinDist;
+
+        // Edge (A0, A1)
+        // Compute an axis perpendicular to the edge (points out).
+        Axis = XMVector3Cross( N1, A1 - A0 );
+        Dist = XMVector3Dot( Axis, A0 );
+
+        // Test points of B against the axis.
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (A1, A2)
+        Axis = XMVector3Cross( N1, A2 - A1 );
+        Dist = XMVector3Dot( Axis, A1 );
+
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (A2, A0)
+        Axis = XMVector3Cross( N1, A0 - A2 );
+        Dist = XMVector3Dot( Axis, A2 );
+
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B0, B1)
+        Axis = XMVector3Cross( N2, B1 - B0 );
+        Dist = XMVector3Dot( Axis, B0 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B1, B2)
+        Axis = XMVector3Cross( N2, B2 - B1 );
+        Dist = XMVector3Dot( Axis, B1 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B2, B0)
+        Axis = XMVector3Cross( N2, B0 - B2 );
+        Dist = XMVector3Dot( Axis, B2 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        return true;
+    }
+
+    //
+    // Find the single vertex of A and B (i.e. the vertex on the opposite side
+    // of the plane from the other two) and reorder the edges so we can compute
+    // the signed edge/edge distances.
+    //
+    // if ( (V0 >= 0 && V1 < 0 && V2 < 0) ||
+    //      (V0 > 0 && V1 <= 0 && V2 <= 0) ||
+    //      (V0 <= 0 && V1 > 0 && V2 > 0) ||
+    //      (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
+    //
+    // If our singular vertex is not on the positive side of the plane we reverse
+    // the triangle winding so that the overlap comparisons will compare the
+    // correct edges with the correct signs.
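+    //
+    // For example, signed distances of ( +0.5, -0.2, -0.1 ) make V0 the
+    // singular vertex, lying on the positive side of the other triangle's plane.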
+ // + XMVECTOR ADistIsLessEqual = XMVectorOrInt( ADistIsLess, ADistIsZero ); + XMVECTOR ADistIsGreaterEqual = XMVectorOrInt( ADistIsGreater, ADistIsZero ); + + XMVECTOR AA0, AA1, AA2; + bool bPositiveA; + + if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select0111 ) ) ) + { + // A0 is singular, crossing from positive to negative. + AA0 = A0; AA1 = A1; AA2 = A2; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select0111 ) ) ) + { + // A0 is singular, crossing from negative to positive. + AA0 = A0; AA1 = A2; AA2 = A1; + bPositiveA = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1011 ) ) ) + { + // A1 is singular, crossing from positive to negative. + AA0 = A1; AA1 = A2; AA2 = A0; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1011 ) ) ) + { + // A1 is singular, crossing from negative to positive. + AA0 = A1; AA1 = A0; AA2 = A2; + bPositiveA = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1101 ) ) ) + { + // A2 is singular, crossing from positive to negative. + AA0 = A2; AA1 = A0; AA2 = A1; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1101 ) ) ) + { + // A2 is singular, crossing from negative to positive. + AA0 = A2; AA1 = A1; AA2 = A0; + bPositiveA = false; + } + else + { + assert( false ); + return false; + } + + XMVECTOR BDistIsLessEqual = XMVectorOrInt( BDistIsLess, BDistIsZero ); + XMVECTOR BDistIsGreaterEqual = XMVectorOrInt( BDistIsGreater, BDistIsZero ); + + XMVECTOR BB0, BB1, BB2; + bool bPositiveB; + + if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select0111 ) ) ) + { + // B0 is singular, crossing from positive to negative. + BB0 = B0; BB1 = B1; BB2 = B2; + bPositiveB = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select0111 ) ) ) + { + // B0 is singular, crossing from negative to positive. + BB0 = B0; BB1 = B2; BB2 = B1; + bPositiveB = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1011 ) ) ) + { + // B1 is singular, crossing from positive to negative. 
+        BB0 = B1; BB1 = B2; BB2 = B0;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1011 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1011 ) ) )
+    {
+        // B1 is singular, crossing from negative to positive.
+        BB0 = B1; BB1 = B0; BB2 = B2;
+        bPositiveB = false;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1101 ) ) )
+    {
+        // B2 is singular, crossing from positive to negative.
+        BB0 = B2; BB1 = B0; BB2 = B1;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1101 ) ) )
+    {
+        // B2 is singular, crossing from negative to positive.
+        BB0 = B2; BB1 = B1; BB2 = B0;
+        bPositiveB = false;
+    }
+    else
+    {
+        assert( false );
+        return false;
+    }
+
+    XMVECTOR Delta0, Delta1;
+
+    // Reverse the direction of the test depending on whether the singular vertices are
+    // the same sign or different signs.
+    if( bPositiveA ^ bPositiveB )
+    {
+        Delta0 = ( BB0 - AA0 );
+        Delta1 = ( AA0 - BB0 );
+    }
+    else
+    {
+        Delta0 = ( AA0 - BB0 );
+        Delta1 = ( BB0 - AA0 );
+    }
+
+    // Check if the triangles overlap on the line of intersection between the
+    // planes of the two triangles by finding the signed line distances.
+    XMVECTOR Dist0 = XMVector3Dot( Delta0, XMVector3Cross( ( BB2 - BB0 ), ( AA2 - AA0 ) ) );
+    if( XMVector4Greater( Dist0, Zero ) )
+        return false;
+
+    XMVECTOR Dist1 = XMVector3Dot( Delta1, XMVector3Cross( ( BB1 - BB0 ), ( AA1 - AA0 ) ) );
+    if( XMVector4Greater( Dist1, Zero ) )
+        return false;
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Plane-triangle test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane, Outside, Inside );
+
+    // If the triangle is fully outside the plane it is in front of the plane.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the triangle is fully inside the plane it is behind the plane.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The triangle is neither fully inside nor fully outside the plane, so it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a triangle vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV ContainedBy( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2,
+                                                GXMVECTOR Plane0, HXMVECTOR Plane1, HXMVECTOR Plane2,
+                                                CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane1, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane2, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane3, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane4, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane5, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the triangle is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the triangle is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The triangle is not fully inside all planes nor fully outside any plane, so it may intersect.
+    return INTERSECTS;
+}
+
+}; // namespace TriangleTests
+
diff --git a/Inc/DirectXColors.h b/Inc/DirectXColors.h
index c0ca2b3..13e33e7 100644
--- a/Inc/DirectXColors.h
+++ b/Inc/DirectXColors.h
@@ -1,169 +1,169 @@
-//-------------------------------------------------------------------------------------
-// DirectXColors.h -- C++ Color Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#include "DirectXMath.h" - -namespace DirectX -{ - -namespace Colors -{ - // Standard colors (Red/Green/Blue/Alpha) - XMGLOBALCONST XMVECTORF32 AliceBlue = {0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 AntiqueWhite = {0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Aqua = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Aquamarine = {0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Azure = {0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Beige = {0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Bisque = {1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Black = {0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Blue = {0.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 BlueViolet = {0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Brown = {0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 BurlyWood = {0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 CadetBlue = {0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Chartreuse = {0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Chocolate = {0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Coral = {1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 CornflowerBlue = {0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Cornsilk = {1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Crimson = {0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Cyan = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkBlue = {0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkCyan = {0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkGray = {0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkGreen = {0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkKhaki = {0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkMagenta = {0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkOrange = {1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkOrchid = {0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkRed = {0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSalmon = {0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {0.560784340f, 
0.737254918f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSlateGray = {0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkTurquoise = {0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkViolet = {0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DeepPink = {1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DimGray = {0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DodgerBlue = {0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Firebrick = {0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 FloralWhite = {1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 ForestGreen = {0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Fuchsia = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Gainsboro = {0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 GhostWhite = {0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Gold = {1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Goldenrod = {0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Gray = {0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Green = {0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 GreenYellow = {0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Honeydew = {0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 HotPink = {1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 IndianRed = {0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Indigo = {0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Ivory = {1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Khaki = {0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Lavender = {0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LavenderBlush = {1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LawnGreen = {0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LemonChiffon = {1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightBlue = {0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightCoral = {0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightCyan = {0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightGreen = {0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightGray = {0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightPink = {1.000000000f, 0.713725507f, 
0.756862819f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSalmon = {1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSeaGreen = {0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSkyBlue = {0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSlateGray = {0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSteelBlue = {0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightYellow = {1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Lime = {0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LimeGreen = {0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Linen = {0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Magenta = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Maroon = {0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumAquamarine = {0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumBlue = {0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumOrchid = {0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumPurple = {0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumTurquoise = {0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumVioletRed = {0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MidnightBlue = {0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MintCream = {0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MistyRose = {1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Moccasin = {1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 NavajoWhite = {1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Navy = {0.000000000f, 0.000000000f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 OldLace = {0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Olive = {0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 OliveDrab = {0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Orange = {1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 OrangeRed = {1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Orchid = {0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleGreen = {0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleTurquoise = {0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleVioletRed = 
{0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PapayaWhip = {1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PeachPuff = {1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Peru = {0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Pink = {1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Plum = {0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PowderBlue = {0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Purple = {0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Red = {1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 RosyBrown = {0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 RoyalBlue = {0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SaddleBrown = {0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Salmon = {0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SandyBrown = {0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SeaGreen = {0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SeaShell = {1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Sienna = {0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Silver = {0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SkyBlue = {0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SlateBlue = {0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SlateGray = {0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Snow = {1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SpringGreen = {0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SteelBlue = {0.274509817f, 0.509803951f, 0.705882370f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Tan = {0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Teal = {0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Thistle = {0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Tomato = {1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Transparent = {0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f}; - XMGLOBALCONST XMVECTORF32 Turquoise = {0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Violet = {0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Wheat = {0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 White = {1.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 WhiteSmoke = {0.960784376f, 0.960784376f, 0.960784376f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Yellow = {1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 YellowGreen = {0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f}; - -}; // namespace Colors - -}; // namespace DirectX - 
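These Colors constants are XMVECTORF32 values, so each converts implicitly to an XMVECTOR for math work and to a const float* for APIs that take a FLOAT[4]. A minimal usage sketch, assuming a hypothetical D3D11 device context and render-target view:

#include <d3d11.h>
#include <DirectXColors.h>

// Clear the back buffer to a named color; XMVECTORF32's
// operator const float*() supplies the FLOAT[4] the API expects.
void ClearBackBuffer(ID3D11DeviceContext* context, ID3D11RenderTargetView* rtv)
{
    context->ClearRenderTargetView(rtv, DirectX::Colors::CornflowerBlue);
}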
+//------------------------------------------------------------------------------------- +// DirectXColors.h -- C++ Color Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + +namespace Colors +{ + // Standard colors (Red/Green/Blue/Alpha) + XMGLOBALCONST XMVECTORF32 AliceBlue = {0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = {0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Aqua = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Aquamarine = {0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Azure = {0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Beige = {0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Bisque = {1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Black = {0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Blue = {0.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BlueViolet = {0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Brown = {0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BurlyWood = {0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 CadetBlue = {0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Chartreuse = {0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Chocolate = {0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Coral = {1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = {0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Cornsilk = {1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Crimson = {0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Cyan = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkBlue = {0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkCyan = {0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkGray = {0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkGreen = {0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkKhaki = {0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkMagenta = {0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f}; + 
XMGLOBALCONST XMVECTORF32 DarkOrange = {1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkOrchid = {0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkRed = {0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSalmon = {0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {0.560784340f, 0.737254918f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = {0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = {0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkViolet = {0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DeepPink = {1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DimGray = {0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DodgerBlue = {0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Firebrick = {0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 FloralWhite = {1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 ForestGreen = {0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Fuchsia = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Gainsboro = {0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 GhostWhite = {0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Gold = {1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Goldenrod = {0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Gray = {0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Green = {0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 GreenYellow = {0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Honeydew = {0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 HotPink = {1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 IndianRed = {0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Indigo = {0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Ivory = {1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Khaki = {0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Lavender = {0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LavenderBlush = {1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LawnGreen = {0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LemonChiffon = {1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightBlue = {0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightCoral = {0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 
LightCyan = {0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightGreen = {0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightGray = {0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightPink = {1.000000000f, 0.713725507f, 0.756862819f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSalmon = {1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = {0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = {0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSlateGray = {0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSteelBlue = {0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightYellow = {1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Lime = {0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LimeGreen = {0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Linen = {0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Magenta = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Maroon = {0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = {0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumBlue = {0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumOrchid = {0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumPurple = {0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = {0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = {0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MidnightBlue = {0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MintCream = {0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MistyRose = {1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Moccasin = {1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 NavajoWhite = {1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Navy = {0.000000000f, 0.000000000f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 OldLace = {0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Olive = {0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 OliveDrab = {0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Orange = {1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 OrangeRed = {1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f}; 
+ XMGLOBALCONST XMVECTORF32 Orchid = {0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleGreen = {0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = {0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = {0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PapayaWhip = {1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PeachPuff = {1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Peru = {0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Pink = {1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Plum = {0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PowderBlue = {0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Purple = {0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Red = {1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 RosyBrown = {0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 RoyalBlue = {0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SaddleBrown = {0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Salmon = {0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SandyBrown = {0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SeaGreen = {0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SeaShell = {1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Sienna = {0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Silver = {0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SkyBlue = {0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SlateBlue = {0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SlateGray = {0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Snow = {1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SpringGreen = {0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SteelBlue = {0.274509817f, 0.509803951f, 0.705882370f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Tan = {0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Teal = {0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Thistle = {0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Tomato = {1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Transparent = {0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f}; + XMGLOBALCONST XMVECTORF32 Turquoise = {0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Violet = {0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Wheat = {0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 White = {1.000000000f, 1.000000000f, 
1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 WhiteSmoke = {0.960784376f, 0.960784376f, 0.960784376f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Yellow = {1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 YellowGreen = {0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f};
+
+}; // namespace Colors
+
+}; // namespace DirectX
+
diff --git a/Inc/DirectXMath.h b/Inc/DirectXMath.h
index a9a0d1b..36b6c0d 100644
--- a/Inc/DirectXMath.h
+++ b/Inc/DirectXMath.h
@@ -1,1992 +1,1992 @@
-//-------------------------------------------------------------------------------------
-// DirectXMath.h -- SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#ifndef __cplusplus
-#error DirectX Math requires C++
-#endif
-
-#define DIRECTX_MATH_VERSION 309
-
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-#error DirectX Math requires Visual C++ 2013 or later.
-#endif
-
-#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_)
-#define _XM_VECTORCALL_ 1
-#endif
-
-#if _XM_VECTORCALL_
-#define XM_CALLCONV __vectorcall
-#else
-#define XM_CALLCONV __fastcall
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-#define XM_CTOR_DEFAULT {}
-#else
-#define XM_CTOR_DEFAULT =default;
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1900)
-#define XM_CONSTEXPR const
-#else
-#define XM_CONSTEXPR constexpr
-#endif
-
-#ifndef XM_DEPRECATED
-#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
-#endif
-
-#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
-#define _XM_F16C_INTRINSICS_
-#endif
-
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
-#define _XM_SSE4_INTRINSICS_
-#endif
-
-#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_)
-#define _XM_SSE3_INTRINSICS_
-#endif
-
-#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
-#define _XM_SSE_INTRINSICS_
-#endif
-
-#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-#if defined(_M_IX86) || defined(_M_X64)
-#define _XM_SSE_INTRINSICS_
-#elif defined(_M_ARM) || defined(_M_ARM64)
-#define _XM_ARM_NEON_INTRINSICS_
-#elif !defined(_XM_NO_INTRINSICS_)
-#error DirectX Math does not support this target
-#endif
-#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-#pragma warning(push)
-#pragma warning(disable:4514 4820)
-// C4514/4820: Off by default noise
-#include <math.h>
-#include <float.h>
-#include <malloc.h>
-#pragma warning(pop)
-
-#ifndef _XM_NO_INTRINSICS_
-#pragma warning(push)
-#pragma warning(disable : 4987)
-// C4987: Off by default noise
-#include <intrin.h>
-#pragma warning(pop)
-
-#ifdef _XM_SSE_INTRINSICS_
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-#ifdef _XM_SSE3_INTRINSICS_
-#include <pmmintrin.h>
-#endif
-
-#ifdef _XM_SSE4_INTRINSICS_
-#include <smmintrin.h>
-#endif
-
-#ifdef _XM_AVX_INTRINSICS_
-#include <immintrin.h>
-#endif
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#ifdef _M_ARM64
-#include <arm64_neon.h>
-#else
-#include <arm_neon.h>
-#endif
-#endif
-#endif // !_XM_NO_INTRINSICS_
-
-#include <sal.h>
-#include <assert.h>
-
-#ifndef _XM_NO_ROUNDF_
-#ifdef _MSC_VER
-#include <yvals.h>
-#if defined(_CPPLIB_VER) && ( _CPPLIB_VER < 610 )
-#define _XM_NO_ROUNDF_
-#endif
-#endif
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4005 4668)
-// C4005/4668: Old header issue
-#include <stdint.h>
-#pragma warning(pop)
-
-/****************************************************************************
- *
- * Conditional intrinsics
- *
- ****************************************************************************/
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-#if defined(_XM_NO_MOVNT_)
-#define XM_STREAM_PS( p, a ) _mm_store_ps( p, a )
-#define XM_SFENCE()
-#else
-#define XM_STREAM_PS( p, a ) _mm_stream_ps( p, a )
-#define XM_SFENCE() _mm_sfence()
-#endif
-
-#if defined(_XM_AVX_INTRINSICS_)
-#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
-#else
-#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
-#endif
-
-#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-namespace DirectX
-{
-
-/****************************************************************************
- *
- * Constant definitions
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XM_PI)
-#undef XM_PI
-#undef XM_2PI
-#undef XM_1DIVPI
-#undef XM_1DIV2PI
-#undef XM_PIDIV2
-#undef XM_PIDIV4
-#undef XM_SELECT_0
-#undef XM_SELECT_1
-#undef XM_PERMUTE_0X
-#undef XM_PERMUTE_0Y
-#undef XM_PERMUTE_0Z
-#undef XM_PERMUTE_0W
-#undef XM_PERMUTE_1X
-#undef XM_PERMUTE_1Y
-#undef XM_PERMUTE_1Z
-#undef XM_PERMUTE_1W
-#undef XM_CRMASK_CR6
-#undef XM_CRMASK_CR6TRUE
-#undef XM_CRMASK_CR6FALSE
-#undef XM_CRMASK_CR6BOUNDS
-#undef XM_CACHE_LINE_SIZE
-#endif
-
-XM_CONSTEXPR float XM_PI = 3.141592654f;
-XM_CONSTEXPR float XM_2PI = 6.283185307f;
-XM_CONSTEXPR float XM_1DIVPI = 0.318309886f;
-XM_CONSTEXPR float XM_1DIV2PI = 0.159154943f;
-XM_CONSTEXPR float XM_PIDIV2 = 1.570796327f;
-XM_CONSTEXPR float XM_PIDIV4 = 0.785398163f;
-
-XM_CONSTEXPR uint32_t XM_SELECT_0 = 0x00000000;
-XM_CONSTEXPR uint32_t XM_SELECT_1 = 0xFFFFFFFF;
-
-XM_CONSTEXPR uint32_t XM_PERMUTE_0X = 0;
-XM_CONSTEXPR uint32_t XM_PERMUTE_0Y = 1;
-XM_CONSTEXPR uint32_t XM_PERMUTE_0Z = 2;
-XM_CONSTEXPR uint32_t XM_PERMUTE_0W = 3;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1X = 4;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1Y = 5;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1Z = 6;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1W = 7;
-
-XM_CONSTEXPR uint32_t XM_SWIZZLE_X = 0;
-XM_CONSTEXPR uint32_t XM_SWIZZLE_Y = 1;
-XM_CONSTEXPR uint32_t XM_SWIZZLE_Z = 2;
-XM_CONSTEXPR uint32_t XM_SWIZZLE_W = 3;
-
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6 = 0x000000F0;
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6TRUE = 0x00000080;
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6FALSE = 0x00000020;
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE;
-
-XM_CONSTEXPR size_t XM_CACHE_LINE_SIZE = 64;
-
-
-/****************************************************************************
- *
- * Macros
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue)
-#undef XMComparisonAllTrue
-#undef XMComparisonAnyTrue
-#undef XMComparisonAllFalse
-#undef XMComparisonAnyFalse
-#undef XMComparisonMixed
-#undef XMComparisonAllInBounds
-#undef XMComparisonAnyOutOfBounds
-#endif
-
-// Unit conversion
-
-inline XM_CONSTEXPR float XMConvertToRadians(float fDegrees) { return fDegrees * (XM_PI / 180.0f); }
-inline XM_CONSTEXPR float XMConvertToDegrees(float fRadians) { return fRadians * (180.0f / XM_PI); }
-
-// Condition register evaluation following a recording (R) comparison
-
-inline bool XMComparisonAllTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); }
-inline bool XMComparisonAnyTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); }
-inline bool XMComparisonAllFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); }
-inline bool XMComparisonAnyFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE); }
-inline bool XMComparisonMixed(uint32_t CR) { return (((CR) & XM_CRMASK_CR6) == 0); }
-inline bool XMComparisonAllInBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); }
-inline bool XMComparisonAnyOutOfBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); }
-
-
-/****************************************************************************
- *
- * Data types
- *
- ****************************************************************************/
-
-#pragma warning(push)
-#pragma warning(disable:4068 4201 4365 4324 4820)
-// C4068: ignore unknown pragmas
-// C4201: nonstandard extension used : nameless struct/union
-// C4365: Off by default noise
-// C4324/4820: padding warnings
-
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-
-//------------------------------------------------------------------------------
-#if defined(_XM_NO_INTRINSICS_)
-struct __vector4
-{
-    union
-    {
-        float vector4_f32[4];
-        uint32_t vector4_u32[4];
-    };
-};
-#endif // _XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
-// boundary and mapped to hardware vector registers
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-typedef __m128 XMVECTOR;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-typedef float32x4_t XMVECTOR;
-#else
-typedef __vector4 XMVECTOR;
-#endif
-
-// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise
-#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR FXMVECTOR;
-#else
-typedef const XMVECTOR& FXMVECTOR;
-#endif
-
-// Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and x64 vector call; by reference otherwise
-#if ( defined(_M_ARM) || defined(_M_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR GXMVECTOR;
-#else
-typedef const XMVECTOR& GXMVECTOR;
-#endif
-
-// Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise
-#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR HXMVECTOR;
-#else
-typedef const XMVECTOR& HXMVECTOR;
-#endif
-
-// Fix-up for (7th+) XMVECTOR parameters to pass by reference
-typedef const XMVECTOR& CXMVECTOR;
-
-//------------------------------------------------------------------------------
-// Conversion types for constants
-__declspec(align(16)) struct XMVECTORF32
-{
-    union
-    {
-        float f[4];
-        XMVECTOR
v; - }; - - inline operator XMVECTOR() const { return v; } - inline operator const float*() const { return f; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -__declspec(align(16)) struct XMVECTORI32 -{ - union - { - int32_t i[4]; - XMVECTOR v; - }; - - inline operator XMVECTOR() const { return v; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -__declspec(align(16)) struct XMVECTORU8 -{ - union - { - uint8_t u[16]; - XMVECTOR v; - }; - - inline operator XMVECTOR() const { return v; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -__declspec(align(16)) struct XMVECTORU32 -{ - union - { - uint32_t u[4]; - XMVECTOR v; - }; - - inline operator XMVECTOR() const { return v; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -//------------------------------------------------------------------------------ -// Vector operators -XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V); -XMVECTOR XM_CALLCONV operator- (FXMVECTOR V); - -XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2); -XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2); -XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2); -XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2); - -XMVECTOR& operator*= (XMVECTOR& V, float S); -XMVECTOR& operator/= (XMVECTOR& V, float S); - -XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S); -XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V); -XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S); - -//------------------------------------------------------------------------------ -// Matrix type: Sixteen 32 bit floating point components aligned on a -// 16 byte boundary and mapped to four hardware vector registers - -struct XMMATRIX; - -// Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise -#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) -typedef const XMMATRIX FXMMATRIX; -#else -typedef const XMMATRIX& FXMMATRIX; -#endif - -// Fix-up for (2nd+) XMMATRIX parameters to pass by reference -typedef const XMMATRIX& CXMMATRIX; - -#ifdef _XM_NO_INTRINSICS_ -struct XMMATRIX -#else -__declspec(align(16)) struct XMMATRIX -#endif -{ -#ifdef _XM_NO_INTRINSICS_ - union - { - XMVECTOR r[4]; - struct - { - float _11, _12, _13, _14; - float _21, _22, _23, _24; - float _31, _32, _33, _34; - float _41, _42, _43, _44; - }; - float m[4][4]; - }; -#else - XMVECTOR r[4]; -#endif - - XMMATRIX() XM_CTOR_DEFAULT -#if defined(_MSC_VER) && _MSC_VER >= 1900 - constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) : r{ R0,R1,R2,R3 } {} -#else - XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) { 
r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; } -#endif - XMMATRIX(float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33); - explicit XMMATRIX(_In_reads_(16) const float *pArray); - -#ifdef _XM_NO_INTRINSICS_ - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } -#endif - - XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } - - XMMATRIX operator+ () const { return *this; } - XMMATRIX operator- () const; - - XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M); - XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M); - XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M); - XMMATRIX& operator*= (float S); - XMMATRIX& operator/= (float S); - - XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const; - XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const; - XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const; - XMMATRIX operator* (float S) const; - XMMATRIX operator/ (float S) const; - - friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M); -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 32 bit floating point components -struct XMFLOAT2 -{ - float x; - float y; - - XMFLOAT2() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT2(float _x, float _y) : x(_x), y(_y) {} - explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {} - - XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; } -}; - -// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2 -{ - XMFLOAT2A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {} - explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {} - - XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; } -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 32 bit signed integer components -struct XMINT2 -{ - int32_t x; - int32_t y; - - XMINT2() XM_CTOR_DEFAULT - XM_CONSTEXPR XMINT2(int32_t _x, int32_t _y) : x(_x), y(_y) {} - explicit XMINT2(_In_reads_(2) const int32_t *pArray) : x(pArray[0]), y(pArray[1]) {} - - XMINT2& operator= (const XMINT2& Int2) { x = Int2.x; y = Int2.y; return *this; } -}; - -// 2D Vector; 32 bit unsigned integer components -struct XMUINT2 -{ - uint32_t x; - uint32_t y; - - XMUINT2() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUINT2(uint32_t _x, uint32_t _y) : x(_x), y(_y) {} - explicit XMUINT2(_In_reads_(2) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]) {} - - XMUINT2& operator= (const XMUINT2& UInt2) { x = UInt2.x; y = UInt2.y; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D Vector; 32 bit floating point components -struct XMFLOAT3 -{ - float x; - float y; - float z; - - XMFLOAT3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} - explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - - XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } -}; - -// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT3A : 
public XMFLOAT3 -{ - XMFLOAT3A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {} - explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {} - - XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D Vector; 32 bit signed integer components -struct XMINT3 -{ - int32_t x; - int32_t y; - int32_t z; - - XMINT3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {} - explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - - XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; } -}; - -// 3D Vector; 32 bit unsigned integer components -struct XMUINT3 -{ - uint32_t x; - uint32_t y; - uint32_t z; - - XMUINT3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {} - explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - - XMUINT3& operator= (const XMUINT3& u3) { x = u3.x; y = u3.y; z = u3.z; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 32 bit floating point components -struct XMFLOAT4 -{ - float x; - float y; - float z; - float w; - - XMFLOAT4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - - XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } -}; - -// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4 -{ - XMFLOAT4A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {} - explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {} - - XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 32 bit signed integer components -struct XMINT4 -{ - int32_t x; - int32_t y; - int32_t z; - int32_t w; - - XMINT4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMINT4(_In_reads_(4) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - - XMINT4& operator= (const XMINT4& Int4) { x = Int4.x; y = Int4.y; z = Int4.z; w = Int4.w; return *this; } -}; - -// 4D Vector; 32 bit unsigned integer components -struct XMUINT4 -{ - uint32_t x; - uint32_t y; - uint32_t z; - uint32_t w; - - XMUINT4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUINT4(_In_reads_(4) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - - XMUINT4& operator= (const XMUINT4& UInt4) { x = UInt4.x; y = UInt4.y; z = UInt4.z; w = UInt4.w; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3x3 Matrix: 32 bit floating point components -struct XMFLOAT3X3 -{ - union - { - struct - { - float _11, _12, _13; - 
float _21, _22, _23; - float _31, _32, _33; - }; - float m[3][3]; - }; - - XMFLOAT3X3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT3X3(float m00, float m01, float m02, - float m10, float m11, float m12, - float m20, float m21, float m22) - : _11(m00), _12(m01), _13(m02), - _21(m10), _22(m11), _23(m12), - _31(m20), _32(m21), _33(m22) {} - explicit XMFLOAT3X3(_In_reads_(9) const float *pArray); - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT3X3& operator= (const XMFLOAT3X3& Float3x3); -}; - -//------------------------------------------------------------------------------ -// 4x3 Matrix: 32 bit floating point components -struct XMFLOAT4X3 -{ - union - { - struct - { - float _11, _12, _13; - float _21, _22, _23; - float _31, _32, _33; - float _41, _42, _43; - }; - float m[4][3]; - }; - - XMFLOAT4X3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X3(float m00, float m01, float m02, - float m10, float m11, float m12, - float m20, float m21, float m22, - float m30, float m31, float m32) - : _11(m00), _12(m01), _13(m02), - _21(m10), _22(m11), _23(m12), - _31(m20), _32(m21), _33(m22), - _41(m30), _42(m31), _43(m32) {} - explicit XMFLOAT4X3(_In_reads_(12) const float *pArray); - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT4X3& operator= (const XMFLOAT4X3& Float4x3); - -}; - -// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3 -{ - XMFLOAT4X3A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X3A(float m00, float m01, float m02, - float m10, float m11, float m12, - float m20, float m21, float m22, - float m30, float m31, float m32) : - XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {} - explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {} - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT4X3A& operator= (const XMFLOAT4X3A& Float4x3); -}; - -//------------------------------------------------------------------------------ -// 4x4 Matrix: 32 bit floating point components -struct XMFLOAT4X4 -{ - union - { - struct - { - float _11, _12, _13, _14; - float _21, _22, _23, _24; - float _31, _32, _33, _34; - float _41, _42, _43, _44; - }; - float m[4][4]; - }; - - XMFLOAT4X4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X4(float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33) - : _11(m00), _12(m01), _13(m02), _14(m03), - _21(m10), _22(m11), _23(m12), _24(m13), - _31(m20), _32(m21), _33(m22), _34(m23), - _41(m30), _42(m31), _43(m32), _44(m33) {} - explicit XMFLOAT4X4(_In_reads_(16) const float *pArray); - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT4X4& operator= (const XMFLOAT4X4& Float4x4); -}; - -// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4 -{ - XMFLOAT4X4A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X4A(float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, 
float m22, float m23,
-        float m30, float m31, float m32, float m33)
-        : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {}
-    explicit XMFLOAT4X4A(_In_reads_(16) const float *pArray) : XMFLOAT4X4(pArray) {}
-
-    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
-    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }
-
-    XMFLOAT4X4A& operator= (const XMFLOAT4X4A& Float4x4);
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-#pragma prefast(pop)
-#pragma warning(pop)
-
-/****************************************************************************
- *
- * Data conversion operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent);
-XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent);
-XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent);
-XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent);
-
-#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant)
-#undef XMVectorSetBinaryConstant
-#undef XMVectorSplatConstant
-#undef XMVectorSplatConstantInt
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3);
-XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent);
-XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant);
-
-/****************************************************************************
- *
- * Load operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource);
-
-XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource);
-XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource);
-XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource);
-
-XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource);
-XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource);
-XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource);
-
-XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource);
-XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource);
-XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource);
-
-XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource);
-
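These load functions pair with the store functions declared below: typical DirectXMath code unpacks a storage type (XMFLOAT3, XMFLOAT4X4, ...) into the XMVECTOR/XMMATRIX register types, does all arithmetic there, then stores the result back. A minimal sketch of that round trip, using a hypothetical helper named NormalizeStored:

#include <DirectXMath.h>

DirectX::XMFLOAT3 NormalizeStored(const DirectX::XMFLOAT3& value)
{
    using namespace DirectX;
    XMVECTOR v = XMLoadFloat3(&value); // unpack 12-byte storage into a 16-byte register
    v = XMVector3Normalize(v);         // compute only on the register type
    XMFLOAT3 result;
    XMStoreFloat3(&result, v);         // pack the result back into the storage type
    return result;
}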
-/**************************************************************************** - * - * Store operations - * - ****************************************************************************/ - -void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M); - -/**************************************************************************** - * - * General vector operations - * - ****************************************************************************/ - -XMVECTOR XM_CALLCONV XMVectorZero(); -XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w); -XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w); -XMVECTOR XM_CALLCONV XMVectorReplicate(float Value); -XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float *pValue); -XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value); -XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t *pValue); -XMVECTOR XM_CALLCONV XMVectorTrueInt(); -XMVECTOR XM_CALLCONV XMVectorFalseInt(); -XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatOne(); -XMVECTOR XM_CALLCONV XMVectorSplatInfinity(); -XMVECTOR XM_CALLCONV XMVectorSplatQNaN(); -XMVECTOR XM_CALLCONV XMVectorSplatEpsilon(); -XMVECTOR XM_CALLCONV XMVectorSplatSignMask(); - -float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i); -float XM_CALLCONV 
XMVectorGetX(FXMVECTOR V); -float XM_CALLCONV XMVectorGetY(FXMVECTOR V); -float XM_CALLCONV XMVectorGetZ(FXMVECTOR V); -float XM_CALLCONV XMVectorGetW(FXMVECTOR V); - -void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float *f, _In_ FXMVECTOR V, _In_ size_t i); -void XM_CALLCONV XMVectorGetXPtr(_Out_ float *x, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V); - -uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i); -uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V); -uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V); -uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V); -uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V); - -void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V, _In_ size_t i); -void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V); - -XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V,float f, size_t i); -XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x); -XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y); -XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z); -XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w); - -XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float *f, _In_ size_t i); -XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float *x); -XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float *y); -XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float *z); -XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float *w); - -XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i); -XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x); -XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y); -XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z); -XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w); - -XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x, _In_ size_t i); -XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x); -XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t *y); -XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t *z); -XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t *w); - -#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) -#undef XMVectorSwizzle -#endif - -XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3); -XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW); -XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3); -XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); -XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2); - -#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) -#undef XMVectorShiftLeft -#undef XMVectorRotateLeft -#undef XMVectorRotateRight -#undef 
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements);
-XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements);
-XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements);
-XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
-    uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3);
-
-XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds);
-XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds);
-
-XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1,FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max);
-XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2);
-
-XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
-XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
-XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor);
-XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles);
-XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V);
-void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V);
-void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X);
-XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X);
-XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t);
-XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T);
-XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t);
-XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T);
-XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t);
-XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T);
-XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g);
-XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G);
-
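The R-suffixed comparisons above return a per-component mask and additionally record summary flags through pCR; the XMComparison* helpers (declared later in this diff) decode them. A hedged sketch of the intended pattern (not part of the patch; v1/v2 are hypothetical):

    uint32_t cr = 0;
    XMVECTOR mask = XMVectorGreaterR(&cr, v1, v2); // lane masks plus CR6 summary bits

    if (XMComparisonAllTrue(cr))
    {
        // every component of v1 compared greater than the matching component of v2
    }
    else if (XMComparisonAnyTrue(cr))
    {
        // at least one component did; consult 'mask' per lane
    }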
-/****************************************************************************
- *
- * 2D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds);
-
-bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V);
-bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
-XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
-XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
-XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2);
-XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-
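The *Stream variants above walk arrays whose element spacing is given in bytes, so interleaved vertex data can be processed in place. A minimal sketch with tightly packed, hypothetical buffers (illustrative only):

    XMFLOAT2 input[16] = {};   // hypothetical source positions
    XMFLOAT4 output[16];
    XMMATRIX m = XMMatrixIdentity();

    // Stride = sizeof(element) means "tightly packed"; a larger stride would
    // step over interleaved attributes between consecutive vectors.
    XMVector2TransformStream(output, sizeof(XMFLOAT4),
                             input, sizeof(XMFLOAT2), 16, m);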
-/****************************************************************************
- *
- * 3D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds);
-
-bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V);
-bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
-XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
-XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
-void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
-XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
-XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
-    FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
-XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ,
-    _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World);
-XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
-    FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
-XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ,
-    _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World);
-
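The three 3D transform flavors above differ only in how the missing w component is treated; a short illustrative contrast (not part of the patch; matrix m is hypothetical):

    XMVECTOR p = XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f);

    XMVECTOR raw    = XMVector3Transform(p, m);       // w treated as 1; raw 4-vector result
    XMVECTOR coord  = XMVector3TransformCoord(p, m);  // w treated as 1; result divided by w
    XMVECTOR normal = XMVector3TransformNormal(p, m); // w treated as 0; rotation/scale only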
-/****************************************************************************
- *
- * 4D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds);
-
-bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V);
-bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
-XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
-XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
-XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) const XMFLOAT4* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-
-/****************************************************************************
- *
- * Matrix operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M);
-bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M);
-bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M);
-
-XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2);
-XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2);
-XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M);
-XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M);
-_Success_(return)
-bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, _In_ FXMMATRIX M);
-
-XMMATRIX XM_CALLCONV XMMatrixIdentity();
-XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03,
-    float m10, float m11, float m12, float m13,
-    float m20, float m21, float m22, float m23,
-    float m30, float m31, float m32, float m33);
-XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ);
-XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset);
-XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ);
-XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale);
-XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll);
-XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles);
-XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion);
-XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling,
-    FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling,
-    GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane);
-XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition);
-
-XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-
-
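A hedged sketch of composing a scale/rotate/translate matrix from the builders above and recovering its parts with XMMatrixDecompose (illustrative only; the values are hypothetical):

    XMMATRIX world = XMMatrixMultiply(
        XMMatrixScaling(2.0f, 2.0f, 2.0f),
        XMMatrixMultiply(XMMatrixRotationY(XM_PIDIV2),
                         XMMatrixTranslation(10.0f, 0.0f, 0.0f)));

    XMVECTOR scale, rotQuat, translation;
    if (XMMatrixDecompose(&scale, &rotQuat, &translation, world))
    {
        // scale ~ (2,2,2), rotQuat ~ quarter-turn about Y, translation ~ (10,0,0)
    }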
-/****************************************************************************
- *
- * Quaternion operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2);
-bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2);
-
-bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q);
-bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q);
-bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q);
-
-XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2);
-XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2);
-XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t);
-XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T);
-XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t);
-XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T);
-void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3);
-XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g);
-XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G);
-
-XMVECTOR XM_CALLCONV XMQuaternionIdentity();
-XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll);
-XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles);
-XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle);
-XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle);
-XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M);
-
-void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q);
-
-/****************************************************************************
- *
- * Plane operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2);
-bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2);
-
-bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P);
-bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P);
-
-XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P);
-XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P);
-XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2);
-void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2);
-XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) const XMFLOAT4* pInputStream,
-    _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX M);
-
-XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3);
-
-/****************************************************************************
- *
- * Color operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2);
-
-bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C);
-bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C);
-
-XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C);
-XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2);
-XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation);
-XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast);
-
-XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl );
-
-XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv );
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv );
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv );
-
-XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz );
-
-XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz );
-XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb );
-
-XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb );
-
-
-/****************************************************************************
- *
- * Miscellaneous operations
- *
- ****************************************************************************/
-
-bool XMVerifyCPUSupport();
-
-XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex);
-
-bool XMScalarNearEqual(float S1, float S2, float Epsilon);
-float XMScalarModAngle(float Value);
-
-float XMScalarSin(float Value);
-float XMScalarSinEst(float Value);
-
-float XMScalarCos(float Value);
-float XMScalarCosEst(float Value);
-
-void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value);
-void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value);
-
-float XMScalarASin(float Value);
-float XMScalarASinEst(float Value);
-
-float XMScalarACos(float Value);
-float XMScalarACosEst(float Value);
-
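The scalar helpers above mirror their vector counterparts; the Est forms trade accuracy for speed. Illustrative use (not part of the patch):

    float s, c;
    XMScalarSinCos(&s, &c, XM_PIDIV4);          // s ~ 0.7071f, c ~ 0.7071f

    float a = XMScalarModAngle(XM_2PI + 1.0f);  // wraps into -XM_PI..XM_PI, ~1.0f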
-/****************************************************************************
- *
- * Templates
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XMMin)
-#undef XMMin
-#undef XMMax
-#endif
-
-template<class T> inline T XMMin(T a, T b) { return (a < b) ? a : b; }
-template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; }
-
-//------------------------------------------------------------------------------
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-// PermuteHelper internal template (SSE only)
-namespace Internal
-{
-    // Slow path fallback for permutes that do not map to a single SSE shuffle opcode.
-    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
-        {
-            static const XMVECTORU32 selectMask =
-            {
-                WhichX ? 0xFFFFFFFF : 0,
-                WhichY ? 0xFFFFFFFF : 0,
-                WhichZ ? 0xFFFFFFFF : 0,
-                WhichW ? 0xFFFFFFFF : 0,
-            };
-
-            XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
-            XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
-
-            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-            return _mm_or_ps(masked1, masked2);
-        }
-    };
-
-    // Fast path for permutes that only read from the first vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); }
-    };
-
-    // Fast path for permutes that only read from the second vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the first vector, ZW from the second.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the second vector, ZW from the first.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
-    };
-};
-
-#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-// General permute template
-template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
-    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    const bool WhichX = PermuteX > 3;
-    const bool WhichY = PermuteY > 3;
-    const bool WhichZ = PermuteZ > 3;
-    const bool WhichW = PermuteW > 3;
-
-    return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
-#else
-
-    return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW );
-
-#endif
-}
-
-// Special-case permute templates
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movelh_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6,7,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movehl_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpacklo_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpackhi_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); }
-#endif
-
-#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
-#endif
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead
-// The mirror cases are not spelled out here as the programmer can always swap the arguments
-// (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector)
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); }
-
-#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
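Unlike the runtime overload, the template form fixes the indices at compile time, so the specializations above can lower a well-known pattern to a single shuffle, unpack, or blend instruction. Illustrative (not part of the patch; v1/v2 are hypothetical):

    XMVECTOR lo = XMVectorPermute<0, 1, 4, 5>(v1, v2); // -> _mm_movelh_ps on SSE
    XMVECTOR hi = XMVectorPermute<2, 6, 3, 7>(v1, v2); // -> _mm_unpackhi_ps on SSE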
-//------------------------------------------------------------------------------
-
-// General swizzle template
-template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
-    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
-{
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
-#else
-
-    return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
-
-#endif
-}
-
-// Specialized swizzles
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { return _mm_movelh_ps(V,V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { return _mm_movehl_ps(V,V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return _mm_unpacklo_ps(V,V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return _mm_unpackhi_ps(V,V); }
-#endif
-
-#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
-#endif
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); }
-
-#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3>
-    inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS)
-{
-    XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
-    return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements & 3>(VS), Control );
-}
-
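XMVectorInsert rotates VS left by VSLeftRotateElements, then keeps VD in the lanes whose Select flag is 0 and takes the rotated VS where it is 1. Illustrative (not part of the patch; v1/v2 are hypothetical):

    // Replace the y and w lanes of v1 with the y and w lanes of v2:
    XMVECTOR r = XMVectorInsert<0, 0, 1, 0, 1>(v1, v2); // {v1.x, v2.y, v1.z, v2.w}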
-/****************************************************************************
- *
- * Globals
- *
- ****************************************************************************/
-
-// The purpose of the following global constants is to prevent redundant
-// reloading of the constants when they are referenced by more than one
-// separate inline math routine called within the same function. Declaring
-// a constant locally within a routine is sufficient to prevent redundant
-// reloads of that constant when that single routine is called multiple
-// times in a function, but if the constant is used (and declared) in a
-// separate math routine it would be reloaded.
-
-#ifndef XMGLOBALCONST
-#define XMGLOBALCONST extern const __declspec(selectany)
-#endif
-
-XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f};
-XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/};
-XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f};
-XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f};
-XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f};
-XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f};
-XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f};
-XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f};
-XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {+0.999866f, +0.999866f, +0.999866f, +0.999866f};
-XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {-0.3302995f, +0.180141f, -0.085133f, +0.0208351f};
-XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI};
-XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {+1.5707288f,-0.2121144f,+0.0742610f,-0.0187293f};
-XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f};
-XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-XMGLOBALCONST XMVECTORU32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskXY = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF};
-XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMTwo = { 2.f, 2.f, 2.f, 2.f };
-XMGLOBALCONST XMVECTORF32 g_XMFour = { 4.f, 4.f, 4.f, 4.f };
-XMGLOBALCONST XMVECTORF32 g_XMSix = { 6.f, 6.f, 6.f, 6.f };
-XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI};
-XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI};
-XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2};
-XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI};
-XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI};
-XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI};
-XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI};
-XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f};
-XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
-XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000};
-XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
-XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000};
-XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF};
-XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
-XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))};
-XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 0x80000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000};
-XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = {-32768.0f,-32768.0f,0.0f,0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)};
-XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f};
-XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF};
-XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f};
-XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0};
-XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 };
-XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f};
-XMGLOBALCONST XMVECTORU32 g_XMFlipY = {0,0x80000000,0,0};
-XMGLOBALCONST XMVECTORU32 g_XMFlipZ = {0,0,0x80000000,0};
-XMGLOBALCONST XMVECTORU32 g_XMFlipW = {0,0,0,0x80000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = {0,0x80000000,0x80000000,0};
-XMGLOBALCONST XMVECTORU32 g_XMFlipZW = {0,0,0x80000000,0x80000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipYW = {0,0x80000000,0,0x80000000};
-XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
-XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0};
-XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f};
-XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0};
-XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
-XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000};
-XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000};
-XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0};
-XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
-XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f};
-XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f};
-XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
-XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { 12.92f, 12.92f, 12.92f, 1.0f };
-XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { 0.055f, 0.055f, 0.055f, 0.0f };
-XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { 1.055f, 1.055f, 1.055f, 1.0f };
-XMGLOBALCONST XMVECTORI32 g_XMExponentBias = {127, 127, 127, 127};
-XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = {-126, -126, -126, -126};
-XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = {23, 23, 23, 23};
-XMGLOBALCONST XMVECTORI32 g_XMMinNormal = {0x00800000, 0x00800000, 0x00800000, 0x00800000};
-XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = {0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000};
-XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = {0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000};
-XMGLOBALCONST XMVECTORI32 g_XMBin128 = {0x43000000, 0x43000000, 0x43000000, 0x43000000};
-XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = {0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000};
-XMGLOBALCONST XMVECTORI32 g_XM253 = {253, 253, 253, 253};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = {-6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = {+2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = {-5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = {+9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = {-1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = {+1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = {-1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = {+1.442693f, +1.442693f, +1.442693f, +1.442693f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = {-0.721242f, -0.721242f, -0.721242f, -0.721242f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = {+0.479384f, +0.479384f, +0.479384f, +0.479384f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = {-0.350295f, -0.350295f, -0.350295f, -0.350295f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = {+0.248590f, +0.248590f, +0.248590f, +0.248590f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = {-0.145700f, -0.145700f, -0.145700f, -0.145700f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = {+0.057148f, +0.057148f, +0.057148f, +0.057148f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = {-0.010578f, -0.010578f, -0.010578f, -0.010578f};
-XMGLOBALCONST XMVECTORF32 g_XMLgE = {+1.442695f, +1.442695f, +1.442695f, +1.442695f};
-XMGLOBALCONST XMVECTORF32 g_XMInvLgE = {+6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f};
-XMGLOBALCONST XMVECTORF32 g_UByteMax = {255.0f, 255.0f, 255.0f, 255.0f};
-XMGLOBALCONST XMVECTORF32 g_ByteMin = {-127.0f, -127.0f, -127.0f, -127.0f};
-XMGLOBALCONST XMVECTORF32 g_ByteMax = {127.0f, 127.0f, 127.0f, 127.0f};
-XMGLOBALCONST XMVECTORF32 g_ShortMin = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
-XMGLOBALCONST XMVECTORF32 g_ShortMax = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
-XMGLOBALCONST XMVECTORF32 g_UShortMax = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
-
-/****************************************************************************
- *
- * Implementation
- *
- ****************************************************************************/
-
-#pragma warning(push)
-#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101)
-// C4068/4616: ignore unknown pragmas
-// C4214/4204: nonstandard extension used
-// C4365/4640: Off by default noise
-// C6001/6101: False positives
-
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = (0-(C0&1)) & 0x3F800000;
-    vResult.u[1] = (0-(C1&1)) & 0x3F800000;
-    vResult.u[2] = (0-(C2&1)) & 0x3F800000;
-    vResult.u[3] = (0-(C3&1)) & 0x3F800000;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = (0-(C0&1)) & 0x3F800000;
-    vResult.u[1] = (0-(C1&1)) & 0x3F800000;
-    vResult.u[2] = (0-(C2&1)) & 0x3F800000;
-    vResult.u[3] = (0-(C3&1)) & 0x3F800000;
-    return vResult.v;
-#else // XM_SSE_INTRINSICS_
-    static const XMVECTORU32 g_vMask1 = {1,1,1,1};
-    // Move the parms to a vector
-    __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0);
-    // Mask off the low bits
-    vTemp = _mm_and_si128(vTemp,g_vMask1);
-    // 0xFFFFFFFF on true bits
-    vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
-    // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
-    vTemp = _mm_and_si128(vTemp,g_XMOne);
-    return _mm_castsi128_ps(vTemp);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent)
-{
-    assert( IntConstant >= -16 && IntConstant <= 15 );
-    assert( DivExponent < 32 );
-#if defined(_XM_NO_INTRINSICS_)
-
-    using DirectX::XMConvertVectorIntToFloat;
-
-    XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
-    return XMConvertVectorIntToFloat( V.v, DivExponent);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Splat the int
-    int32x4_t vScale = vdupq_n_s32(IntConstant);
-    // Convert to a float
-    XMVECTOR vResult = vcvtq_f32_s32(vScale);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value (it's really a float)
-    vScale = vdupq_n_u32(uScale);
-    // Multiply by the reciprocal (vResult = vResult * (1.0f/(1<<DivExponent)))
-    vResult = vmulq_f32(vResult, reinterpret_cast<const XMVECTOR*>(&vScale)[0]);
-    return vResult;
-#else // XM_SSE_INTRINSICS_
-    // Splat the int
-    __m128i vScale = _mm_set1_epi32(IntConstant);
-    // Convert to a float
-    XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value (it's really a float)
-    vScale = _mm_set1_epi32(static_cast<int>(uScale));
-    // Multiply by the reciprocal (vResult = vResult * (1.0f/(1<<DivExponent)))
-    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant)
-{
-    assert( IntConstant >= -16 && IntConstant <= 15 );
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
-    return V.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t V = vdupq_n_s32( IntConstant );
-    return reinterpret_cast<XMVECTOR*>(&V)[0];
-#else // XM_SSE_INTRINSICS_
-    __m128i V = _mm_set1_epi32( IntConstant );
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-#include "DirectXMathConvert.inl"
-#include "DirectXMathVector.inl"
-#include "DirectXMathMatrix.inl"
-#include "DirectXMathMisc.inl"
-
-#pragma prefast(pop)
-#pragma warning(pop)
-
-}; // namespace DirectX
-
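XMVectorSplatConstant above builds IntConstant / 2^DivExponent without a divide: subtracting DivExponent << 23 from 0x3F800000 (the bit pattern of 1.0f) lowers the float exponent, producing the reciprocal scale directly. Illustrative (not part of the patch):

    XMVECTOR eighth = XMVectorSplatConstant(1, 3); // 1 / 2^3 = 0.125f in every lane
    XMVECTOR threes = XMVectorSplatConstantInt(3); // integer 3 in every lane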
+#endif + +#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) +#define _XM_VECTORCALL_ 1 +#endif + +#if _XM_VECTORCALL_ +#define XM_CALLCONV __vectorcall +#else +#define XM_CALLCONV __fastcall +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1800) +#define XM_CTOR_DEFAULT {} +#else +#define XM_CTOR_DEFAULT =default; +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define XM_CONST const +#define XM_CONSTEXPR +#else +#define XM_CONST constexpr +#define XM_CONSTEXPR constexpr +#endif + +#ifndef XM_DEPRECATED +#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version.")) +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_) +#define _XM_SSE4_INTRINSICS_ +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_) +#define _XM_SSE3_INTRINSICS_ +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) +#define _XM_SSE_INTRINSICS_ +#endif + +#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#if defined(_M_IX86) || defined(_M_X64) +#define _XM_SSE_INTRINSICS_ +#elif defined(_M_ARM) || defined(_M_ARM64) +#define _XM_ARM_NEON_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error DirectX Math does not support this target +#endif +#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#pragma warning(push) +#pragma warning(disable:4514 4820) +// C4514/4820: Off by default noise +#include <math.h> +#include <float.h> +#include <malloc.h> +#pragma warning(pop) + +#ifndef _XM_NO_INTRINSICS_ +#pragma warning(push) +#pragma warning(disable : 4987) +// C4987: Off by default noise +#include <intrin.h> +#pragma warning(pop) + +#ifdef _XM_SSE_INTRINSICS_ +#include <xmmintrin.h> +#include <emmintrin.h> + +#ifdef _XM_SSE3_INTRINSICS_ +#include <pmmintrin.h> +#endif + +#ifdef _XM_SSE4_INTRINSICS_ +#include <smmintrin.h> +#endif + +#ifdef _XM_AVX_INTRINSICS_ +#include <immintrin.h> +#endif + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#ifdef _M_ARM64 +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif +#endif +#endif // !_XM_NO_INTRINSICS_ + +#include <sal.h> +#include <assert.h> + +#ifndef _XM_NO_ROUNDF_ +#ifdef _MSC_VER +#include <yvals.h> +#if defined(_CPPLIB_VER) && ( _CPPLIB_VER < 610 ) +#define _XM_NO_ROUNDF_ +#endif +#endif +#endif + +#pragma warning(push) +#pragma warning(disable : 4005 4668) +// C4005/4668: Old header issue +#include <stdint.h> +#pragma warning(pop) + +/**************************************************************************** + * + * Conditional intrinsics + * + ****************************************************************************/ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(_XM_NO_MOVNT_) +#define XM_STREAM_PS( p, a ) _mm_store_ps( p, a ) +#define XM_SFENCE() +#else +#define XM_STREAM_PS( p, a ) _mm_stream_ps( p, a ) +#define XM_SFENCE() _mm_sfence() +#endif + +#if defined(_XM_AVX_INTRINSICS_) +#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c ) +#else +#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c ) +#endif + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +namespace DirectX +{ +
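As a quick illustration of the conditional intrinsics above: XM_PERMUTE_PS is the header's portable lane shuffle, expanding to the one-register _mm_permute_ps when AVX is available and to _mm_shuffle_ps of a vector with itself otherwise, so both forms take the same _MM_SHUFFLE immediate. A minimal caller-side sketch (SplatZ is a hypothetical helper, not part of this header):

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
// Broadcast lane 2 (z) of v into all four lanes; a single shuffle either way.
inline __m128 SplatZ(__m128 v)
{
    return XM_PERMUTE_PS(v, _MM_SHUFFLE(2, 2, 2, 2));
}
#endif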
+/**************************************************************************** + * + * Constant definitions + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XM_PI) +#undef XM_PI +#undef XM_2PI +#undef XM_1DIVPI +#undef XM_1DIV2PI +#undef XM_PIDIV2 +#undef XM_PIDIV4 +#undef XM_SELECT_0 +#undef XM_SELECT_1 +#undef XM_PERMUTE_0X +#undef XM_PERMUTE_0Y +#undef XM_PERMUTE_0Z +#undef XM_PERMUTE_0W +#undef XM_PERMUTE_1X +#undef XM_PERMUTE_1Y +#undef XM_PERMUTE_1Z +#undef XM_PERMUTE_1W +#undef XM_CRMASK_CR6 +#undef XM_CRMASK_CR6TRUE +#undef XM_CRMASK_CR6FALSE +#undef XM_CRMASK_CR6BOUNDS +#undef XM_CACHE_LINE_SIZE +#endif + +XM_CONST float XM_PI = 3.141592654f; +XM_CONST float XM_2PI = 6.283185307f; +XM_CONST float XM_1DIVPI = 0.318309886f; +XM_CONST float XM_1DIV2PI = 0.159154943f; +XM_CONST float XM_PIDIV2 = 1.570796327f; +XM_CONST float XM_PIDIV4 = 0.785398163f; + +XM_CONST uint32_t XM_SELECT_0 = 0x00000000; +XM_CONST uint32_t XM_SELECT_1 = 0xFFFFFFFF; + +XM_CONST uint32_t XM_PERMUTE_0X = 0; +XM_CONST uint32_t XM_PERMUTE_0Y = 1; +XM_CONST uint32_t XM_PERMUTE_0Z = 2; +XM_CONST uint32_t XM_PERMUTE_0W = 3; +XM_CONST uint32_t XM_PERMUTE_1X = 4; +XM_CONST uint32_t XM_PERMUTE_1Y = 5; +XM_CONST uint32_t XM_PERMUTE_1Z = 6; +XM_CONST uint32_t XM_PERMUTE_1W = 7; + +XM_CONST uint32_t XM_SWIZZLE_X = 0; +XM_CONST uint32_t XM_SWIZZLE_Y = 1; +XM_CONST uint32_t XM_SWIZZLE_Z = 2; +XM_CONST uint32_t XM_SWIZZLE_W = 3; + +XM_CONST uint32_t XM_CRMASK_CR6 = 0x000000F0; +XM_CONST uint32_t XM_CRMASK_CR6TRUE = 0x00000080; +XM_CONST uint32_t XM_CRMASK_CR6FALSE = 0x00000020; +XM_CONST uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE; + +XM_CONST size_t XM_CACHE_LINE_SIZE = 64; + + +/**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue) +#undef XMComparisonAllTrue +#undef XMComparisonAnyTrue +#undef XMComparisonAllFalse +#undef XMComparisonAnyFalse +#undef XMComparisonMixed +#undef XMComparisonAllInBounds +#undef XMComparisonAnyOutOfBounds +#endif + +// Unit conversion + +inline XM_CONSTEXPR float XMConvertToRadians(float fDegrees) { return fDegrees * (XM_PI / 180.0f); } +inline XM_CONSTEXPR float XMConvertToDegrees(float fRadians) { return fRadians * (180.0f / XM_PI); } + +// Condition register evaluation proceeding a recording (R) comparison + +inline bool XMComparisonAllTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); } +inline bool XMComparisonAnyTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAllFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAnyFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE); } +inline bool XMComparisonMixed(uint32_t CR) { return (((CR) & XM_CRMASK_CR6) == 0); } +inline bool XMComparisonAllInBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); } +inline bool XMComparisonAnyOutOfBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); } + + +/**************************************************************************** + * + * Data types + * +
****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4201 4365 4324 4820) +// C4068: ignore unknown pragmas +// C4201: nonstandard extension used : nameless struct/union +// C4365: Off by default noise +// C4324/4820: padding warnings + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) +struct __vector4 +{ + union + { + float vector4_f32[4]; + uint32_t vector4_u32[4]; + }; +}; +#endif // _XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ +// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte +// boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef __m128 XMVECTOR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef float32x4_t XMVECTOR; +#else +typedef __vector4 XMVECTOR; +#endif + +// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR FXMVECTOR; +#else +typedef const XMVECTOR& FXMVECTOR; +#endif + +// Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and x64 vector call; by reference otherwise +#if ( defined(_M_ARM) || defined(_M_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR GXMVECTOR; +#else +typedef const XMVECTOR& GXMVECTOR; +#endif + +// Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR HXMVECTOR; +#else +typedef const XMVECTOR& HXMVECTOR; +#endif + +// Fix-up for (7th+) XMVECTOR parameters to pass by reference +typedef const XMVECTOR& CXMVECTOR; + +//------------------------------------------------------------------------------ +// Conversion types for constants +__declspec(align(16)) struct XMVECTORF32 +{ + union + { + float f[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } + inline operator const float*() const { return f; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORI32 +{ + union + { + int32_t i[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORU8 +{ + union + { + uint8_t u[16]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORU32 +{ + union + { + uint32_t u[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && 
defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +//------------------------------------------------------------------------------ +// Vector operators +XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V); +XMVECTOR XM_CALLCONV operator- (FXMVECTOR V); + +XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2); + +XMVECTOR& operator*= (XMVECTOR& V, float S); +XMVECTOR& operator/= (XMVECTOR& V, float S); + +XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S); +XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V); +XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S); + +//------------------------------------------------------------------------------ +// Matrix type: Sixteen 32 bit floating point components aligned on a +// 16 byte boundary and mapped to four hardware vector registers + +struct XMMATRIX; + +// Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMMATRIX FXMMATRIX; +#else +typedef const XMMATRIX& FXMMATRIX; +#endif + +// Fix-up for (2nd+) XMMATRIX parameters to pass by reference +typedef const XMMATRIX& CXMMATRIX; + +#ifdef _XM_NO_INTRINSICS_ +struct XMMATRIX +#else +__declspec(align(16)) struct XMMATRIX +#endif +{ +#ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + + XMMATRIX() XM_CTOR_DEFAULT +#if defined(_MSC_VER) && _MSC_VER >= 1900 + constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) : r{ R0,R1,R2,R3 } {} +#else + XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) { r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; } +#endif + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); + explicit XMMATRIX(_In_reads_(16) const float *pArray); + +#ifdef _XM_NO_INTRINSICS_ + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +#endif + + XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } + + XMMATRIX operator+ () const { return *this; } + XMMATRIX operator- () const; + + XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M); + XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M); + XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M); + XMMATRIX& operator*= (float S); + XMMATRIX& operator/= (float S); + + XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const; + XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const; + XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const; + XMMATRIX operator* (float S) const; + XMMATRIX operator/ (float S) const; + + friend XMMATRIX XM_CALLCONV operator* 
(float S, FXMMATRIX M); +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit floating point components +struct XMFLOAT2 +{ + float x; + float y; + + XMFLOAT2() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT2(float _x, float _y) : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; } +}; + +// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2 +{ + XMFLOAT2A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {} + explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {} + + XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit signed integer components +struct XMINT2 +{ + int32_t x; + int32_t y; + + XMINT2() XM_CTOR_DEFAULT + XM_CONSTEXPR XMINT2(int32_t _x, int32_t _y) : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMINT2& operator= (const XMINT2& Int2) { x = Int2.x; y = Int2.y; return *this; } +}; + +// 2D Vector; 32 bit unsigned integer components +struct XMUINT2 +{ + uint32_t x; + uint32_t y; + + XMUINT2() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUINT2(uint32_t _x, uint32_t _y) : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMUINT2& operator= (const XMUINT2& UInt2) { x = UInt2.x; y = UInt2.y; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit floating point components +struct XMFLOAT3 +{ + float x; + float y; + float z; + + XMFLOAT3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } +}; + +// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3 +{ + XMFLOAT3A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {} + explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {} + + XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit signed integer components +struct XMINT3 +{ + int32_t x; + int32_t y; + int32_t z; + + XMINT3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; } +}; + +// 3D Vector; 32 bit unsigned integer components +struct XMUINT3 +{ + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMUINT3& operator= (const XMUINT3& u3) { x = 
u3.x; y = u3.y; z = u3.z; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit floating point components +struct XMFLOAT4 +{ + float x; + float y; + float z; + float w; + + XMFLOAT4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } +}; + +// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4 +{ + XMFLOAT4A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {} + explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {} + + XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit signed integer components +struct XMINT4 +{ + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMINT4& operator= (const XMINT4& Int4) { x = Int4.x; y = Int4.y; z = Int4.z; w = Int4.w; return *this; } +}; + +// 4D Vector; 32 bit unsigned integer components +struct XMUINT4 +{ + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMUINT4& operator= (const XMUINT4& UInt4) { x = UInt4.x; y = UInt4.y; z = UInt4.z; w = UInt4.w; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3x3 Matrix: 32 bit floating point components +struct XMFLOAT3X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) {} + explicit XMFLOAT3X3(_In_reads_(9) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT3X3& operator= (const XMFLOAT3X3& Float3x3); +}; + +//------------------------------------------------------------------------------ +// 4x3 Matrix: 32 bit floating point components +struct XMFLOAT4X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + }; + + XMFLOAT4X3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + 
_41(m30), _42(m31), _43(m32) {} + explicit XMFLOAT4X3(_In_reads_(12) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X3& operator= (const XMFLOAT4X3& Float4x3); + +}; + +// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3 +{ + XMFLOAT4X3A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X3A(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) : + XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {} + explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {} + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X3A& operator= (const XMFLOAT4X3A& Float4x3); +}; + +//------------------------------------------------------------------------------ +// 4x4 Matrix: 32 bit floating point components +struct XMFLOAT4X4 +{ + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) {} + explicit XMFLOAT4X4(_In_reads_(16) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X4& operator= (const XMFLOAT4X4& Float4x4); +}; + +// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4 +{ + XMFLOAT4X4A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X4A(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {} + explicit XMFLOAT4X4A(_In_reads_(16) const float *pArray) : XMFLOAT4X4(pArray) {} + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X4A& operator= (const XMFLOAT4X4A& Float4x4); +}; + +//////////////////////////////////////////////////////////////////////////////// + +#pragma prefast(pop) +#pragma warning(pop) + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent); +XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent); + +#if defined(__XNAMATH_H__) && 
defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + +XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3); +XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant); + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource); +XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource); + +XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource); + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, 
_In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMVectorZero(); +XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w); +XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w); +XMVECTOR XM_CALLCONV XMVectorReplicate(float Value); +XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float *pValue); +XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value); +XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t *pValue); +XMVECTOR XM_CALLCONV XMVectorTrueInt(); +XMVECTOR XM_CALLCONV XMVectorFalseInt(); +XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatOne(); +XMVECTOR XM_CALLCONV XMVectorSplatInfinity(); +XMVECTOR XM_CALLCONV XMVectorSplatQNaN(); +XMVECTOR XM_CALLCONV XMVectorSplatEpsilon(); +XMVECTOR XM_CALLCONV XMVectorSplatSignMask(); + +float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i); +float XM_CALLCONV XMVectorGetX(FXMVECTOR V); +float XM_CALLCONV XMVectorGetY(FXMVECTOR V); +float XM_CALLCONV XMVectorGetZ(FXMVECTOR V); +float XM_CALLCONV XMVectorGetW(FXMVECTOR V); + +void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float *f, _In_ FXMVECTOR V, _In_ size_t i); +void XM_CALLCONV XMVectorGetXPtr(_Out_ float *x, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V); + +uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i); +uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V); + +void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V, _In_ size_t i); +void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V,float f, 
size_t i); +XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x); +XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y); +XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z); +XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w); + +XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float *f, _In_ size_t i); +XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float *x); +XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float *y); +XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float *z); +XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float *w); + +XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i); +XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x); +XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y); +XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z); +XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w); + +XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x, _In_ size_t i); +XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x); +XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t *y); +XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t *z); +XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t *w); + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + +XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3); +XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW); +XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3); +XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); +XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2); + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef XMVectorRotateRight +#undef XMVectorInsert +#endif + +XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3); + +XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV 
XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds); +XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds); + +XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1,FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max); +XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2); + +XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor); +XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles); +XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V); +void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR 
V); +XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t); +XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t); +XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t); +XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g); +XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G); + +/**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR 
XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2); +XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M); +XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M); +XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V); +XMVECTOR 
XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M); +XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M); +XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World); +XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XM_CALLCONV 
XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World); + +/**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); + 
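Taken together, the load, operate, and store groups above form the library's basic calling pattern: application data lives in the XMFLOAT*/XMINT*/XMUINT* structs, is loaded into XMVECTOR registers, transformed there, and written back. A minimal sketch of that round trip, assuming only the declarations above (NormalizeInPlace is a hypothetical helper name, not part of the header):

#include <DirectXMath.h>
using namespace DirectX;

// Load a 3D vector from user memory, normalize it in registers,
// store the result back, and return the original length.
inline float NormalizeInPlace(XMFLOAT3& value)
{
    XMVECTOR v = XMLoadFloat3(&value);               // memory -> SIMD register
    float length = XMVectorGetX(XMVector3Length(v)); // length is replicated in every lane
    XMStoreFloat3(&value, XMVector3Normalize(v));    // register -> memory
    return length;
}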
+/**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M); +bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M); +bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M); + +XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M); +XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M); +_Success_(return) +bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, _In_ FXMMATRIX M); + +XMMATRIX XM_CALLCONV XMMatrixIdentity(); +XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); +XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ); +XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset); +XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ); +XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale); +XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion); +XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane); +XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition); + +XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, 
float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); + + +/**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2); +bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2); + +bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q); +bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q); +bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q); + +XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t); +XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T); +XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t); +XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T); +void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3); +XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g); +XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G); + +XMVECTOR XM_CALLCONV XMQuaternionIdentity(); +XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle); +XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M); + +void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q); + +/**************************************************************************** + * + * Plane operations + * + 
****************************************************************************/ + +bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2); +bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2); + +bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P); +bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P); + +XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P); +XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P); +XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2); +void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2); +XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX M); + +XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3); + +/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2); + +bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C); +bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C); + +XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C); +XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2); +XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation); +XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast); + +XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ); + +XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv ); + +XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ); + +XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ); + +XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ); + +XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ); +XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ); + +XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ); + + +/**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + +bool XMVerifyCPUSupport(); + +XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR 
RefractionIndex); + +bool XMScalarNearEqual(float S1, float S2, float Epsilon); +float XMScalarModAngle(float Value); + +float XMScalarSin(float Value); +float XMScalarSinEst(float Value); + +float XMScalarCos(float Value); +float XMScalarCosEst(float Value); + +void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value); +void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value); + +float XMScalarASin(float Value); +float XMScalarASinEst(float Value); + +float XMScalarACos(float Value); +float XMScalarACosEst(float Value); + +/**************************************************************************** + * + * Templates + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMMin) +#undef XMMin +#undef XMMax +#endif + +template<class T> inline T XMMin(T a, T b) { return (a < b) ? a : b; } +template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; } + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// PermuteHelper internal template (SSE only) +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE shuffle opcode. + template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle); + XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ +
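+// Usage sketch (illustrative, not part of the original header; assumes an SSE build): +// given a = (1,2,3,4) and b = (5,6,7,8), the compile-time permute below collapses to a +// single instruction via PermuteHelper: +// XMVectorPermute<0,1,4,5>(a, b) -> (1,2,5,6) // specialized to _mm_movelh_ps +// XMVectorPermute<3,0,7,4>(a, b) -> (4,1,8,5) // WhichZ/WhichW true: one _mm_shuffle_ps +// whereas the runtime XMVectorPermute(a, b, 3, 0, 7, 4) must build a control vector and +// blend two variable shuffles. +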
+// General permute template +template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW> + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2); +#else + + return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW ); + +#endif +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movelh_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6,7,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movehl_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpacklo_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpackhi_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); } +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1,
FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead +// The mirror cases are not spelled out here as the programmer can always swap the arguments +// (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV 
XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ + +// General swizzle template +template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW> + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +#else + + return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW ); + +#endif +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { return _mm_movelh_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { return _mm_movehl_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return _mm_unpacklo_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return _mm_unpackhi_ps(V,V); } +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
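+// Usage sketch (illustrative, not part of the original header): with v = (1,2,3,4), +// XMVectorSwizzle<3,2,1,0>(v) -> (4,3,2,1) // one XM_PERMUTE_PS on SSE +// XMVectorSwizzle<0,0,2,2>(v) -> (1,1,3,3) // _mm_moveldup_ps on SSE3, vtrnq_f32 on NEON +// The specializations above exist so common patterns map to cheaper opcodes than the +// generic shuffle.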
+ +//------------------------------------------------------------------------------ + +template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2); +} + +template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V); +} + +template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3> + inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control ); +} + +/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + +// The purpose of the following global constants is to
prevent redundant +// reloading of the constants when they are referenced by more than one +// separate inline math routine called within the same function. Declaring +// a constant locally within a routine is sufficient to prevent redundant +// reloads of that constant when that single routine is called multiple +// times in a function, but if the constant is used (and declared) in a +// separate math routine it would be reloaded. + +#ifndef XMGLOBALCONST +#define XMGLOBALCONST extern const __declspec(selectany) +#endif + +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f}; +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f}; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f}; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {+0.999866f, +0.999866f, +0.999866f, +0.999866f}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {-0.3302995f, +0.180141f, -0.085133f, +0.0208351f}; +XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {+1.5707288f,-0.2121144f,+0.0742610f,-0.0187293f}; +XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f}; +XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; +XMGLOBALCONST XMVECTORU32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskXY = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000}; 
+XMGLOBALCONST XMVECTORU32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMTwo = { 2.f, 2.f, 2.f, 2.f }; +XMGLOBALCONST XMVECTORF32 g_XMFour = { 4.f, 4.f, 4.f, 4.f }; +XMGLOBALCONST XMVECTORF32 g_XMSix = { 6.f, 6.f, 6.f, 6.f }; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2}; +XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f}; +XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000}; +XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF}; +XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))}; +XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))}; +XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f}; +XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = 
{-32768.0f,-32768.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)}; +XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f}; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; +XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f}; +XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 }; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f}; +XMGLOBALCONST XMVECTORU32 g_XMFlipY = {0,0x80000000,0,0}; +XMGLOBALCONST XMVECTORU32 g_XMFlipZ = {0,0,0x80000000,0}; +XMGLOBALCONST XMVECTORU32 g_XMFlipW = {0,0,0,0x80000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = {0,0x80000000,0x80000000,0}; +XMGLOBALCONST XMVECTORU32 g_XMFlipZW = {0,0,0x80000000,0x80000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipYW = {0,0x80000000,0,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; +XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; +XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000}; +XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f}; +XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { 12.92f, 12.92f, 12.92f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { 0.055f, 0.055f, 0.055f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { 1.055f, 1.055f, 1.055f, 1.0f }; +XMGLOBALCONST XMVECTORI32 g_XMExponentBias = {127, 127, 127, 127}; +XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = {-126, -126, -126, -126}; +XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = {23, 23, 23, 23}; +XMGLOBALCONST XMVECTORI32 g_XMMinNormal = {0x00800000, 
0x00800000, 0x00800000, 0x00800000}; +XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = {0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000}; +XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = {0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000}; +XMGLOBALCONST XMVECTORI32 g_XMBin128 = {0x43000000, 0x43000000, 0x43000000, 0x43000000}; +XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = {0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000}; +XMGLOBALCONST XMVECTORI32 g_XM253 = {253, 253, 253, 253}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = {-6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = {+2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = {-5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = {+9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = {-1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = {+1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = {-1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = {+1.442693f, +1.442693f, +1.442693f, +1.442693f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = {-0.721242f, -0.721242f, -0.721242f, -0.721242f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = {+0.479384f, +0.479384f, +0.479384f, +0.479384f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = {-0.350295f, -0.350295f, -0.350295f, -0.350295f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = {+0.248590f, +0.248590f, +0.248590f, +0.248590f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = {-0.145700f, -0.145700f, -0.145700f, -0.145700f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = {+0.057148f, +0.057148f, +0.057148f, +0.057148f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = {-0.010578f, -0.010578f, -0.010578f, -0.010578f}; +XMGLOBALCONST XMVECTORF32 g_XMLgE = {+1.442695f, +1.442695f, +1.442695f, +1.442695f}; +XMGLOBALCONST XMVECTORF32 g_XMInvLgE = {+6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f}; +XMGLOBALCONST XMVECTORF32 g_UByteMax = {255.0f, 255.0f, 255.0f, 255.0f}; +XMGLOBALCONST XMVECTORF32 g_ByteMin = {-127.0f, -127.0f, -127.0f, -127.0f}; +XMGLOBALCONST XMVECTORF32 g_ByteMax = {127.0f, 127.0f, 127.0f, 127.0f}; +XMGLOBALCONST XMVECTORF32 g_ShortMin = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; +XMGLOBALCONST XMVECTORF32 g_ShortMax = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; +XMGLOBALCONST XMVECTORF32 g_UShortMax = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101) +// C4068/4616: ignore unknown pragmas +// C4214/4204: nonstandard extension used +// C4365/4640: Off by default noise +// C6001/6101: False positives + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = 
(0-(C3&1)) & 0x3F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = (0-(C3&1)) & 0x3F800000; + return vResult.v; +#else // XM_SSE_INTRINSICS_ + static const XMVECTORU32 g_vMask1 = {1,1,1,1}; + // Move the parms to a vector + __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0); + // Mask off the low bits + vTemp = _mm_and_si128(vTemp,g_vMask1); + // 0xFFFFFFFF on true bits + vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1); + // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f + vTemp = _mm_and_si128(vTemp,g_XMOne); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); + assert( DivExponent < 32 ); +#if defined(_XM_NO_INTRINSICS_) + + using DirectX::XMConvertVectorIntToFloat; + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return XMConvertVectorIntToFloat( V.v, DivExponent); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Splat the int + int32x4_t vScale = vdupq_n_s32(IntConstant); + // Convert to a float + XMVECTOR vResult = vcvtq_f32_s32(vScale); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value (it's really a float) + vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale)); + // Multiply by the reciprocal (performs a right shift by DivExponent) + vResult = vmulq_f32(vResult,reinterpret_cast<const float32x4_t*>(&vScale)[0]); + return vResult; +#else // XM_SSE_INTRINSICS_ + // Splat the int + __m128i vScale = _mm_set1_epi32(IntConstant); + // Convert to a float + XMVECTOR vResult = _mm_cvtepi32_ps(vScale); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value (it's really a float) + vScale = _mm_set1_epi32(static_cast<int>(uScale)); + // Multiply by the reciprocal (performs a right shift by DivExponent) + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return V.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t V = vdupq_n_s32( IntConstant ); + return reinterpret_cast<XMVECTOR*>(&V)[0]; +#else // XM_SSE_INTRINSICS_ + __m128i V = _mm_set1_epi32( IntConstant ); + return _mm_castsi128_ps(V); +#endif +}
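+ +// Usage sketch (illustrative, not part of the original header): XMVectorSplatConstant +// builds small fixed-point constants without a memory load. Subtracting +// (DivExponent << 23) from 0x3F800000 (the bit pattern of 1.0f) decrements the float +// exponent field, producing 1.0f/(1<<DivExponent); multiplying by it performs the divide: +// XMVectorSplatConstant(3, 8) -> (0.01171875, 0.01171875, 0.01171875, 0.01171875) // 3/256 +// XMVectorSplatConstantInt(-16) -> (-16, -16, -16, -16) // raw integer lanes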
+ +#include "DirectXMathConvert.inl" +#include "DirectXMathVector.inl" +#include "DirectXMathMatrix.inl" +#include "DirectXMathMisc.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace DirectX + diff --git a/Inc/DirectXMathConvert.inl b/Inc/DirectXMathConvert.inl index 342397b..c7ab705 100644 --- a/Inc/DirectXMathConvert.inl +++ b/Inc/DirectXMathConvert.inl @@ -1,1899 +1,1899 @@ -//------------------------------------------------------------------------------------- -// DirectXMathConvert.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Data conversion - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -#pragma warning(push) -#pragma warning(disable:4701) -// C4701: false positives - -inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat -( - FXMVECTOR VInt, - uint32_t DivExponent -) -{ - assert(DivExponent<32); -#if defined(_XM_NO_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex]; - Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - float32x4_t vResult = vcvtq_f32_s32( VInt ); - return vmulq_n_f32( vResult, fScale ); -#else // _XM_SSE_INTRINSICS_ - // Convert to floats - XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt)); - // Convert DivExponent into 1.0f/(1<<DivExponent) - uint32_t uScale = 0x3F800000U - (DivExponent << 23); - // Splat the scalar value - __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale)); - vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt -( - FXMVECTOR VFloat, - uint32_t MulExponent -) -{ - assert(MulExponent<32); -#if defined(_XM_NO_INTRINSICS_) - // Get the scalar factor. - float fScale = (float)(1U << MulExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - int32_t iResult; - float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; - if (fTemp <= -(65536.0f*32768.0f)) { - iResult = (-0x7FFFFFFF)-1; - } else if (fTemp > (65536.0f*32768.0f)-128.0f) { - iResult = 0x7FFFFFFF; - } else { - iResult = (int32_t)fTemp; - } - Result.vector4_u32[ElementIndex] = (uint32_t)iResult; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent)); - // In case of positive overflow, detect it - uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt); - // Float to int conversion - int32x4_t vResulti = vcvtq_s32_f32(vResult); - // If there was positive overflow, set to 0x7FFFFFFF - vResult = vandq_u32(vOverflow,g_XMAbsMask); - vOverflow = vbicq_u32(vResulti,vOverflow); - vOverflow = vorrq_u32(vOverflow,vResult); - return vOverflow; -#else // _XM_SSE_INTRINSICS_ - XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent)); - vResult = _mm_mul_ps(vResult,VFloat); - // In case of positive overflow, detect it - XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt); - // Float to int conversion - __m128i vResulti = _mm_cvttps_epi32(vResult); - // If there was positive overflow, set to 0x7FFFFFFF - vResult = _mm_and_ps(vOverflow,g_XMAbsMask); - vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); - vOverflow = _mm_or_ps(vOverflow,vResult); - return vOverflow; -#endif -} - -//------------------------------------------------------------------------------ -
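-// Behavior sketch (illustrative, not from the original source): these routines treat -// vectors as fixed point. XMConvertVectorIntToFloat on {512,512,512,512} with -// DivExponent = 8 yields 2.0f per lane (512/256); XMConvertVectorFloatToInt saturates, -// clamping inputs at or below -2^31 to INT32_MIN and inputs above 2^31-128 to -// 0x7FFFFFFF before truncating. -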
-inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat -( - FXMVECTOR VUInt, - uint32_t DivExponent -) -{ - assert(DivExponent<32); -#if defined(_XM_NO_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - float32x4_t vResult = vcvtq_f32_u32( VUInt ); - return vmulq_n_f32( vResult, fScale ); -#else // _XM_SSE_INTRINSICS_ - // For the values that are higher than 0x7FFFFFFF, a fixup is needed - // Determine which ones need the fix. - XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero); - // Force all values positive - XMVECTOR vResult = _mm_xor_ps(VUInt,vMask); - // Convert to floats - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Convert 0x80000000 -> 0xFFFFFFFF - __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); - // For only the ones that are too big, add the fixup - vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); - vResult = _mm_add_ps(vResult,vMask); - // Convert DivExponent into 1.0f/(1<<DivExponent) - uint32_t uScale = 0x3F800000U - (DivExponent << 23); - // Splat the scalar value - __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale)); - vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt -( - FXMVECTOR VFloat, - uint32_t MulExponent -) -{ - assert(MulExponent<32); -#if defined(_XM_NO_INTRINSICS_) - // Get the scalar factor. - float fScale = (float)(1U << MulExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - uint32_t uResult; - float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; - if (fTemp <= 0.0f) { - uResult = 0; - } else if (fTemp >= (65536.0f*65536.0f)) { - uResult = 0xFFFFFFFFU; - } else { - uResult = (uint32_t)fTemp; - } - Result.vector4_u32[ElementIndex] = uResult; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent)); - // In case of overflow, detect it - uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt); - // Float to int conversion - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - // If there was overflow, set to 0xFFFFFFFFU - vResult = vbicq_u32(vResulti,vOverflow); - vOverflow = vorrq_u32(vOverflow,vResult); - return vOverflow; -#else // _XM_SSE_INTRINSICS_ - XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent)); - vResult = _mm_mul_ps(vResult,VFloat); - // Clamp to >=0 - vResult = _mm_max_ps(vResult,g_XMZero); - // Any numbers that are too big, set to 0xFFFFFFFFU - XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); - XMVECTOR vValue = g_XMUnsignedFix; - // Too large for a signed integer? - XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); - // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise - vValue = _mm_and_ps(vValue,vMask); - // Perform fixup only on numbers too large (keeps low bit precision) - vResult = _mm_sub_ps(vResult,vValue); - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Convert from signed to unsigned only if greater than 0x80000000 - vMask = _mm_and_ps(vMask,g_XMNegativeZero); - vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); - // On those that are too large, set to 0xFFFFFFFF - vResult = _mm_or_ps(vResult,vOverflow); - return vResult; -#endif -} - -#pragma warning(pop) - -/**************************************************************************** - * - * Vector and matrix load operations - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = *pSource; - V.vector4_u32[1] = 0; - V.vector4_u32[2] = 0; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t zero = vdupq_n_u32(0); - return vld1q_lane_u32( pSource, zero, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ss( reinterpret_cast<const float*>(pSource) ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = *pSource; - V.vector4_f32[1] = 0.f; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t zero = vdupq_n_f32(0); - return vld1q_lane_f32( pSource, zero, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ss( pSource ); -#endif -} -
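-// Behavior sketch (illustrative, not from the original source): the scalar loads fill -// only lane x and zero the rest, e.g. float f = 5.0f; XMLoadFloat(&f) yields -// (5,0,0,0), and XMLoadInt does the same with an untyped 32-bit value. -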
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt2 -( - const uint32_t* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = 0; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( pSource ); - uint32x2_t zero = vdup_n_u32(0); - return vcombine_u32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); - return _mm_unpacklo_ps( x, y ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt2A -( - const uint32_t* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = 0; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32_ex( pSource, 64 ); - uint32x2_t zero = vdup_n_u32(0); - return vcombine_u32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat2 -( - const XMFLOAT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( &pSource->x ); - __m128 y = _mm_load_ss( &pSource->y ); - return _mm_unpacklo_ps( x, y ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat2A -( - const XMFLOAT2A* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} -
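-// Behavior sketch (illustrative, not from the original source): the A-suffixed loads -// require the 16-byte alignment asserted above, letting them use aligned forms: -// XMFLOAT2A v(1.f, 2.f); // XMFLOAT2A is declared with 16-byte alignment -// XMVECTOR r = XMLoadFloat2A(&v); // (1,2,0,0) via a single 64-bit load -// while XMLoadFloat2 accepts any address at the cost of two scalar loads on SSE. -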
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadSInt2 -( - const XMINT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); - float32x2_t v = vcvt_f32_s32( x ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( v, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 V = _mm_unpacklo_ps( x, y ); - return _mm_cvtepi32_ps(_mm_castps_si128(V)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadUInt2 -( - const XMUINT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); - float32x2_t v = vcvt_f32_u32( x ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( v, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 V = _mm_unpacklo_ps( x, y ); - // For the values that are higher than 0x7FFFFFFF, a fixup is needed - // Determine which ones need the fix. - XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); - // Force all values positive - XMVECTOR vResult = _mm_xor_ps(V,vMask); - // Convert to floats - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Convert 0x80000000 -> 0xFFFFFFFF - __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); - // For only the ones that are too big, add the fixup - vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); - vResult = _mm_add_ps(vResult,vMask); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt3 -( - const uint32_t* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( pSource ); - uint32x2_t zero = vdup_n_u32(0); - uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 ); - return vcombine_u32( x, y ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); - __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) ); - __m128 xy = _mm_unpacklo_ps( x, y ); - return _mm_movelh_ps( xy, z ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt3A -( - const uint32_t* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Reads an extra integer which is zero'd - uint32x4_t V = vld1q_u32_ex( pSource, 128 ); - return vsetq_lane_u32( 0, V, 3 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Reads an extra integer which is zero'd - __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); - V = _mm_and_si128( V, g_XMMask3 ); - return _mm_castsi128_ps(V); -#endif -} -
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat3 -( - const XMFLOAT3* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 ); - return vcombine_f32( x, y ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( &pSource->x ); - __m128 y = _mm_load_ss( &pSource->y ); - __m128 z = _mm_load_ss( &pSource->z ); - __m128 xy = _mm_unpacklo_ps( x, y ); - return _mm_movelh_ps( xy, z ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat3A -( - const XMFLOAT3A* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Reads an extra float which is zero'd - float32x4_t V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); - return vsetq_lane_f32( 0, V, 3 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Reads an extra float which is zero'd - __m128 V = _mm_load_ps( &pSource->x ); - return _mm_and_ps( V, g_XMMask3 ); -#endif -} -
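-// Behavior sketch (illustrative, not from the original source): as the comments note, -// XMLoadFloat3A reads a full 16-byte row and then zeroes w, so it over-reads one float -// past z; the 16-byte alignment keeps that read inside the same 16-byte block, which is -// why the aligned variant can do this safely while XMLoadFloat3 composes three scalar -// loads instead. -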
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadSInt3 -( - const XMINT3* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = (float)pSource->z; - V.vector4_f32[3] = 0.f; - return V; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); - int32x2_t zero = vdup_n_s32(0); - int32x2_t y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 ); - int32x4_t v = vcombine_s32( x, y ); - return vcvtq_f32_s32( v ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); - __m128 xy = _mm_unpacklo_ps( x, y ); - __m128 V = _mm_movelh_ps( xy, z ); - return _mm_cvtepi32_ps(_mm_castps_si128(V)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadUInt3 -( - const XMUINT3* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = (float)pSource->z; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); - uint32x2_t zero = vdup_n_u32(0); - uint32x2_t y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 ); - uint32x4_t v = vcombine_u32( x, y ); - return vcvtq_f32_u32( v ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); - __m128 xy = _mm_unpacklo_ps( x, y ); - __m128 V = _mm_movelh_ps( xy, z ); - // For the values that are higher than 0x7FFFFFFF, a fixup is needed - // Determine which ones need the fix. - XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); - // Force all values positive - XMVECTOR vResult = _mm_xor_ps(V,vMask); - // Convert to floats - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Convert 0x80000000 -> 0xFFFFFFFF - __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); - // For only the ones that are too big, add the fixup - vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); - vResult = _mm_add_ps(vResult,vMask); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt4 -( - const uint32_t* pSource -) -{ - assert(pSource); - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = pSource[3]; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_u32( pSource ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt4A -( - const uint32_t* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = pSource[3]; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_u32_ex( pSource, 128 ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat4 -( - const XMFLOAT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = pSource->w; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_f32( reinterpret_cast<const float*>(pSource) ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_loadu_ps( &pSource->x ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat4A -( - const XMFLOAT4A* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = pSource->w; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ps( &pSource->x ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadSInt4 -( - const XMINT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = (float)pSource->z; - V.vector4_f32[3] = (float)pSource->w; - return V; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) ); - return vcvtq_f32_s32( v ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_cvtepi32_ps(V); -#endif -} -
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat3x3
-(
-    const XMFLOAT3X3* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-    M.r[3].vector4_f32[0] = 0.0f;
-    M.r[3].vector4_f32[1] = 0.0f;
-    M.r[3].vector4_f32[2] = 0.0f;
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
-    float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
-    float32x2_t v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] );
-    float32x4_t T = vextq_f32( v0, v1, 3 );
-
-    XMMATRIX M;
-    M.r[0] = vandq_u32( v0, g_XMMask3 );
-    M.r[1] = vandq_u32( T, g_XMMask3 );
-    M.r[2] = vcombine_f32( vget_high_f32(v1), v2 );
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 Z = _mm_setzero_ps();
-
-    __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] );
-    __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] );
-    __m128 V3 = _mm_load_ss( &pSource->m[2][2] );
-
-    __m128 T1 = _mm_unpackhi_ps( V1, Z );
-    __m128 T2 = _mm_unpacklo_ps( V2, Z );
-    __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
-    __m128 T4 = _mm_movehl_ps( T2, T3 );
-    __m128 T5 = _mm_movehl_ps( Z, T1 );
-
-    XMMATRIX M;
-    M.r[0] = _mm_movelh_ps( V1, T1 );
-    M.r[1] = _mm_add_ps( T4, T5 );
-    M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x3
-(
-    const XMFLOAT4X3* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
-    float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
-    float32x4_t v2 = vld1q_f32( &pSource->m[2][2] );
-
-    float32x4_t T1 = vextq_f32( v0, v1, 3 );
-    float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
-    float32x4_t T3 = vextq_f32( v2, v2, 1 );
-
-    XMMATRIX M;
-    M.r[0] = vandq_u32( v0, g_XMMask3 );
-    M.r[1] = vandq_u32( T1, g_XMMask3 );
-    M.r[2] = vandq_u32( T2, g_XMMask3 );
-    M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Use unaligned load instructions to
-    // load the 12 floats
-    // vTemp1 = x1,y1,z1,x2
-    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
-    // vTemp2 = y2,z2,x3,y3
-    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
-    // vTemp4 = z3,x4,y4,z4
-    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
-    // vTemp3 = x3,y3,z3,z3
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
-    // vTemp2 = y2,z2,x2,x2
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
-    // vTemp2 = x2,y2,z2,z2
-    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
-    // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
-    // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
-    // vTemp3 = x3,y3,z3,0
-    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
-    // vTemp4i = x4,y4,z4,0
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
-    // vTemp4i = x4,y4,z4,1.0f
-    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
-    XMMATRIX M(vTemp1,
-               vTemp2,
-               vTemp3,
-               _mm_castsi128_ps(vTemp4i));
-    return M;
-#endif
-}
-
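Editor's note: the shuffle sequence above is easier to follow once you see that XMFLOAT4X3 is 12 contiguous floats, so the three 4-wide loads straddle row boundaries. This sketch (assumed row-major layout; the helper name is invented) is a scalar reference for what the shuffles reassemble.

    // The three 4-float loads map onto the packed rows like this:
    //   vTemp1 = { m00, m01, m02, m10 }   // from &m[0][0]
    //   vTemp2 = { m11, m12, m20, m21 }   // from &m[1][1]
    //   vTemp4 = { m22, m30, m31, m32 }   // from &m[2][2]
    void LoadFloat4x3Rows(const float* m, float rows[4][4])
    {
        for (int r = 0; r < 4; ++r) {
            for (int c = 0; c < 3; ++c)
                rows[r][c] = m[r*3 + c];          // gather each 3-float row
            rows[r][3] = (r == 3) ? 1.0f : 0.0f;  // pad w; last row gets 1
        }
    }
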
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A
-(
-    const XMFLOAT4X3A* pSource
-)
-{
-    assert(pSource);
-    assert(((uintptr_t)pSource & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 );
-    float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 );
-    float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 );
-
-    float32x4_t T1 = vextq_f32( v0, v1, 3 );
-    float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
-    float32x4_t T3 = vextq_f32( v2, v2, 1 );
-
-    XMMATRIX M;
-    M.r[0] = vandq_u32( v0, g_XMMask3 );
-    M.r[1] = vandq_u32( T1, g_XMMask3 );
-    M.r[2] = vandq_u32( T2, g_XMMask3 );
-    M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Use aligned load instructions to
-    // load the 12 floats
-    // vTemp1 = x1,y1,z1,x2
-    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
-    // vTemp2 = y2,z2,x3,y3
-    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
-    // vTemp4 = z3,x4,y4,z4
-    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
-    // vTemp3 = x3,y3,z3,z3
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
-    // vTemp2 = y2,z2,x2,x2
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
-    // vTemp2 = x2,y2,z2,z2
-    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
-    // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
-    // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
-    // vTemp3 = x3,y3,z3,0
-    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
-    // vTemp4i = x4,y4,z4,0
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
-    // vTemp4i = x4,y4,z4,1.0f
-    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
-    XMMATRIX M(vTemp1,
-               vTemp2,
-               vTemp3,
-               _mm_castsi128_ps(vTemp4i));
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x4
-(
-    const XMFLOAT4X4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = pSource->m[0][3];
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = pSource->m[1][3];
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = pSource->m[2][3];
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = pSource->m[3][3];
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) );
-    M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) );
-    M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) );
-    M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_loadu_ps( &pSource->_11 );
-    M.r[1] = _mm_loadu_ps( &pSource->_21 );
-    M.r[2] = _mm_loadu_ps( &pSource->_31 );
-    M.r[3] = _mm_loadu_ps( &pSource->_41 );
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A
-(
-    const XMFLOAT4X4A* pSource
-)
-{
-    assert(pSource);
-    assert(((uintptr_t)pSource & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = pSource->m[0][3];
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = pSource->m[1][3];
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = pSource->m[2][3];
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = pSource->m[3][3];
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 );
-    M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 );
-    M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 );
-    M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_load_ps( &pSource->_11 );
-    M.r[1] = _mm_load_ps( &pSource->_21 );
-    M.r[2] = _mm_load_ps( &pSource->_31 );
-    M.r[3] = _mm_load_ps( &pSource->_41 );
-    return M;
-#endif
-}
-
-/****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    *pDestination = XMVectorGetIntX( V );
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32( pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat
-(
-    float* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    *pDestination = XMVectorGetX( V );
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32( pDestination, V, 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss( pDestination, V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt2
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32( pDestination, VL );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt2A
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32_ex( pDestination, VL, 64 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
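Editor's note: the A-suffixed variants assert the same 16-byte contract throughout (`((uintptr_t)p & 0xF) == 0`). A minimal sketch of how a caller can satisfy it with standard C++11 alignment, independent of the library's own aligned types; the struct name is invented for illustration.

    #include <cassert>
    #include <cstdint>

    struct alignas(16) AlignedQuad { uint32_t v[4]; };

    void Demo()
    {
        AlignedQuad data = {};
        // Same check the *A load/store functions assert before the aligned op.
        assert((reinterpret_cast<uintptr_t>(data.v) & 0xF) == 0);
    }
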
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat2
-(
-    XMFLOAT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat2A
-(
-    XMFLOAT2A* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreSInt2
-(
-    XMINT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (int32_t)V.vector4_f32[0];
-    pDestination->y = (int32_t)V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x2_t v = vget_low_s32(V);
-    v = vcvt_s32_f32( v );
-    vst1_s32( reinterpret_cast<int32_t*>(pDestination), v );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow,vResult);
-    // Write two ints
-    XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreUInt2
-(
-    XMUINT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (uint32_t)V.vector4_f32[0];
-    pDestination->y = (uint32_t)V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t v = vget_low_f32(V);
-    uint32x2_t iv = vcvt_u32_f32( v );
-    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), iv );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
-    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue,vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult,vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned only if greater than 0x80000000
-    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult,vOverflow);
-    // Write two uints
-    XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
-#endif
-}
-
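Editor's note: the float-to-unsigned store above works around x86 having only a signed truncating conversion (cvttps). A minimal scalar sketch of the per-lane behavior, ignoring NaN edge cases; the function name is invented and g_XMUnsignedFix is 2^31.

    #include <cstdint>

    uint32_t FloatToUIntSaturate(float f)
    {
        if (f <= 0.0f)             return 0u;           // clamp negatives to 0
        if (f >= 4294967296.0f)    return 0xFFFFFFFFu;  // too big: saturate
        if (f >= 2147483648.0f)                         // above signed range:
            return (uint32_t)(int32_t)(f - 2147483648.0f) ^ 0x80000000u; // fixup
        return (uint32_t)(int32_t)f;                    // fits in signed range
    }

The subtract keeps the value inside cvttps's signed range, and the final xor adds 2^31 back as a bit operation, which is what the vMask/g_XMNegativeZero dance does vector-wide.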
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt3
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32( pDestination, VL );
-    vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt3A
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32_ex( pDestination, VL, 64 );
-    vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat3
-(
-    XMFLOAT3* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
-    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T1 );
-    _mm_store_ss( &pDestination->z, T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat3A
-(
-    XMFLOAT3A* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
-    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( &pDestination->z, T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreSInt3
-(
-    XMINT3* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (int32_t)V.vector4_f32[0];
-    pDestination->y = (int32_t)V.vector4_f32[1];
-    pDestination->z = (int32_t)V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vcvtq_s32_f32(V);
-    int32x2_t vL = vget_low_s32(v);
-    vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL );
-    vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow,vResult);
-    // Write 3 ints
-    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreUInt3
-(
-    XMUINT3* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (uint32_t)V.vector4_f32[0];
-    pDestination->y = (uint32_t)V.vector4_f32[1];
-    pDestination->z = (uint32_t)V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vcvtq_u32_f32(V);
-    uint32x2_t vL = vget_low_u32(v);
-    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL );
-    vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
-    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue,vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult,vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned only if greater than 0x80000000
-    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult,vOverflow);
-    // Write 3 uints
-    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt4
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-    pDestination[3] = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_u32( pDestination, V );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt4A
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-    pDestination[3] = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_u32_ex( pDestination, V, 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4
-(
-    XMFLOAT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-    pDestination->w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32( reinterpret_cast<float*>(pDestination), V );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_ps( &pDestination->x, V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4A
-(
-    XMFLOAT4A* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-    pDestination->w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ps( &pDestination->x, V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreSInt4
-(
-    XMINT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (int32_t)V.vector4_f32[0];
-    pDestination->y = (int32_t)V.vector4_f32[1];
-    pDestination->z = (int32_t)V.vector4_f32[2];
-    pDestination->w = (int32_t)V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vcvtq_s32_f32(V);
-    vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow,vResult);
-    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
-#endif
-}
-
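Editor's note: the signed stores only patch *positive* overflow because _mm_cvttps_epi32 returns the "integer indefinite" value 0x80000000 for any out-of-range input, and that bit pattern already equals INT32_MIN, which is the correct negative saturation. A scalar sketch of the resulting per-lane behavior (the function name is invented; g_XMMaxInt is 65536.0f*32768.0f-128.0f):

    #include <cstdint>

    int32_t FloatToIntSaturate(float f)
    {
        if (f > 2147483520.0f)        // above g_XMMaxInt: positive overflow
            return 0x7FFFFFFF;        // patched in with the cmpgt mask
        if (f < -2147483648.0f)
            return (-0x7FFFFFFF) - 1; // cvttps already yields 0x80000000 here
        return (int32_t)f;            // truncate toward zero, like cvttps
    }
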
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreUInt4
-(
-    XMUINT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (uint32_t)V.vector4_f32[0];
-    pDestination->y = (uint32_t)V.vector4_f32[1];
-    pDestination->z = (uint32_t)V.vector4_f32[2];
-    pDestination->w = (uint32_t)V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vcvtq_u32_f32(V);
-    vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
-    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue,vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult,vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned only if greater than 0x80000000
-    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult,vOverflow);
-    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat3x3
-(
-    XMFLOAT3X3* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
-    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
-    vst1q_f32( &pDestination->m[0][0], T2 );
-
-    T1 = vextq_f32( M.r[1], M.r[1], 1 );
-    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
-    vst1q_f32( &pDestination->m[1][1], T2 );
-
-    vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = M.r[0];
-    XMVECTOR vTemp2 = M.r[1];
-    XMVECTOR vTemp3 = M.r[2];
-    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
-    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
-    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
-    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
-    vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss(&pDestination->m[2][2],vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x3
-(
-    XMFLOAT4X3* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
-    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
-    vst1q_f32( &pDestination->m[0][0], T2 );
-
-    T1 = vextq_f32( M.r[1], M.r[1], 1 );
-    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
-    vst1q_f32( &pDestination->m[1][1], T2 );
-
-    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
-    T2 = vextq_f32( T1, M.r[3], 3 );
-    vst1q_f32( &pDestination->m[2][2], T2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = M.r[0];
-    XMVECTOR vTemp2 = M.r[1];
-    XMVECTOR vTemp3 = M.r[2];
-    XMVECTOR vTemp4 = M.r[3];
-    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
-    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
-    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
-    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
-    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x3A
-(
-    XMFLOAT4X3A* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
-    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
-    vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );
-
-    T1 = vextq_f32( M.r[1], M.r[1], 1 );
-    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
-    vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );
-
-    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
-    T2 = vextq_f32( T1, M.r[3], 3 );
-    vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // x1,y1,z1,w1
-    XMVECTOR vTemp1 = M.r[0];
-    // x2,y2,z2,w2
-    XMVECTOR vTemp2 = M.r[1];
-    // x3,y3,z3,w3
-    XMVECTOR vTemp3 = M.r[2];
-    // x4,y4,z4,w4
-    XMVECTOR vTemp4 = M.r[3];
-    // z1,z1,x2,y2
-    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
-    // y2,z2,x3,y3 (Final)
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
-    // x1,y1,z1,x2 (Final)
-    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
-    // z3,z3,x4,x4
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
-    // z3,x4,y4,z4 (Final)
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
-    // Store in 3 operations
-    _mm_store_ps(&pDestination->m[0][0],vTemp1);
-    _mm_store_ps(&pDestination->m[1][1],vTemp2);
-    _mm_store_ps(&pDestination->m[2][2],vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x4
-(
-    XMFLOAT4X4* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-    pDestination->m[0][3] = M.r[0].vector4_f32[3];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-    pDestination->m[1][3] = M.r[1].vector4_f32[3];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[2].vector4_f32[3];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-    pDestination->m[3][3] = M.r[3].vector4_f32[3];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_ps( &pDestination->_11, M.r[0] );
-    _mm_storeu_ps( &pDestination->_21, M.r[1] );
-    _mm_storeu_ps( &pDestination->_31, M.r[2] );
-    _mm_storeu_ps( &pDestination->_41, M.r[3] );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x4A
-(
-    XMFLOAT4X4A* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-    pDestination->m[0][3] = M.r[0].vector4_f32[3];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-    pDestination->m[1][3] = M.r[1].vector4_f32[3];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[2].vector4_f32[3];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-    pDestination->m[3][3] = M.r[3].vector4_f32[3];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ps( &pDestination->_11, M.r[0] );
-    _mm_store_ps( &pDestination->_21, M.r[1] );
-    _mm_store_ps( &pDestination->_31, M.r[2] );
-    _mm_store_ps( &pDestination->_41, M.r[3] );
-#endif
-}
-
+//-------------------------------------------------------------------------------------
+// DirectXMathConvert.inl -- SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+#pragma warning(push)
+#pragma warning(disable:4701)
+// C4701: false positives
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t  DivExponent
+)
+{
+    assert(DivExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
+        Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_s32( VInt );
+    return vmulq_n_f32( vResult, fScale );
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+)
+{
+    assert(MulExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    float fScale = (float)(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+        if (fTemp <= -(65536.0f*32768.0f)) {
+            iResult = (-0x7FFFFFFF)-1;
+        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
+            iResult = 0x7FFFFFFF;
+        } else {
+            iResult = (int32_t)fTemp;
+        }
+        Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent));
+    // In case of positive overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
+    // Float to int conversion
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vandq_u32(vOverflow,g_XMAbsMask);
+    vOverflow = vbicq_u32(vResulti,vOverflow);
+    vOverflow = vorrq_u32(vOverflow,vResult);
+    return vOverflow;
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult,VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow,vResult);
+    return vOverflow;
+#endif
+}
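Editor's note: the `0x3F800000U - (DivExponent << 23)` line above builds the scale 1.0f/(1<<n) directly in the float's bit pattern: 0x3F800000 is 1.0f, and subtracting n from the biased exponent field (bits 23..30) halves the value n times. A minimal standalone sketch verifying the trick (the function name is invented):

    #include <cstdint>
    #include <cstring>

    float ReciprocalPow2(uint32_t n)   // valid for n < 32, matching the assert
    {
        uint32_t bits = 0x3F800000U - (n << 23); // lower the biased exponent by n
        float f;
        std::memcpy(&f, &bits, sizeof(f));       // reinterpret without aliasing UB
        return f;                                // equals 1.0f / (float)(1U << n)
    }

This avoids a divide and keeps the scale exact, since a power of two only touches the exponent field.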
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t  DivExponent
+)
+{
+    assert(DivExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_u32( VUInt );
+    return vmulq_n_f32( vResult, fScale );
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+)
+{
+    assert(MulExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    float fScale = (float)(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        uint32_t uResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+        if (fTemp <= 0.0f) {
+            uResult = 0;
+        } else if (fTemp >= (65536.0f*65536.0f)) {
+            uResult = 0xFFFFFFFFU;
+        } else {
+            uResult = (uint32_t)fTemp;
+        }
+        Result.vector4_u32[ElementIndex] = uResult;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent));
+    // In case of overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
+    // Float to int conversion
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    // If there was overflow, set to 0xFFFFFFFFU
+    vResult = vbicq_u32(vResulti,vOverflow);
+    vOverflow = vorrq_u32(vOverflow,vResult);
+    return vOverflow;
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult,VFloat);
+    // Clamp to >=0
+    vResult = _mm_max_ps(vResult,g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue,vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult,vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult,vOverflow);
+    return vResult;
+#endif
+}
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * Vector and matrix load operations
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = *pSource;
+    V.vector4_u32[1] = 0;
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t zero = vdupq_n_u32(0);
+    return vld1q_lane_u32( pSource, zero, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = *pSource;
+    V.vector4_f32[1] = 0.f;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t zero = vdupq_n_f32(0);
+    return vld1q_lane_f32( pSource, zero, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss( pSource );
+#endif
+}
+
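Editor's note: these scalar loads guarantee the unused lanes are zero, which callers can rely on. A short usage sketch (variable names are local to this example):

    #include <DirectXMath.h>
    using namespace DirectX;

    XMVECTOR SplatTimeStep()
    {
        float timeStep = 0.016f;
        XMVECTOR v = XMLoadFloat(&timeStep);  // { 0.016f, 0, 0, 0 }
        return XMVectorSplatX(v);             // broadcast x to all four lanes
    }
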
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( pSource );
+    uint32x2_t zero = vdup_n_u32(0);
+    return vcombine_u32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
+    return _mm_unpacklo_ps( x, y );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2A
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32_ex( pSource, 64 );
+    uint32x2_t zero = vdup_n_u32(0);
+    return vcombine_u32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat2
+(
+    const XMFLOAT2* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( &pSource->x );
+    __m128 y = _mm_load_ss( &pSource->y );
+    return _mm_unpacklo_ps( x, y );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat2A
+(
+    const XMFLOAT2A* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt2
+(
+    const XMINT2* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
+    float32x2_t v = vcvt_f32_s32( x );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( v, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 V = _mm_unpacklo_ps( x, y );
+    return _mm_cvtepi32_ps(_mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt2
+(
+    const XMUINT2* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
+    float32x2_t v = vcvt_f32_u32( x );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( v, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 V = _mm_unpacklo_ps( x, y );
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt3
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( pSource );
+    uint32x2_t zero = vdup_n_u32(0);
+    uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
+    return vcombine_u32( x, y );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt3A
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Reads an extra integer which is zero'd
+    uint32x4_t V = vld1q_u32_ex( pSource, 128 );
+    return vsetq_lane_u32( 0, V, 3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Reads an extra integer which is zero'd
+    __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
+    V = _mm_and_si128( V, g_XMMask3 );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat3
+(
+    const XMFLOAT3* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
+    float32x2_t zero = vdup_n_f32(0);
+    float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
+    return vcombine_f32( x, y );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( &pSource->x );
+    __m128 y = _mm_load_ss( &pSource->y );
+    __m128 z = _mm_load_ss( &pSource->z );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat3A
+(
+    const XMFLOAT3A* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Reads an extra float which is zero'd
+    float32x4_t V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
+    return vsetq_lane_f32( 0, V, 3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Reads an extra float which is zero'd
+    __m128 V = _mm_load_ps( &pSource->x );
+    return _mm_and_ps( V, g_XMMask3 );
+#endif
+}
+
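Editor's note: XMLoadFloat3A deliberately reads 16 bytes and masks off the fourth lane, while the unaligned XMLoadFloat3 composes three scalar loads instead. The over-read is safe because a 16-byte read from a 16-byte-aligned address cannot cross a page boundary; it still requires the aligned type's contract, as this sketch notes:

    #include <DirectXMath.h>
    using namespace DirectX;

    XMVECTOR LoadPosition(const XMFLOAT3A& p)
    {
        // Safe: the 16-byte alignment of XMFLOAT3A keeps the extra lane's
        // read inside the same 16-byte line; a bare 12-byte float[3] passed
        // through a cast would not carry that guarantee.
        return XMLoadFloat3A(&p);
    }
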
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt3
+(
+    const XMINT3* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = (float)pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
+    int32x2_t zero = vdup_n_s32(0);
+    int32x2_t y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 );
+    int32x4_t v = vcombine_s32( x, y );
+    return vcvtq_f32_s32( v );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_movelh_ps( xy, z );
+    return _mm_cvtepi32_ps(_mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt3
+(
+    const XMUINT3* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = (float)pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
+    uint32x2_t zero = vdup_n_u32(0);
+    uint32x2_t y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 );
+    uint32x4_t v = vcombine_u32( x, y );
+    return vcvtq_f32_u32( v );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_movelh_ps( xy, z );
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt4
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = pSource[3];
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_u32( pSource );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt4A
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = pSource[3];
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_u32_ex( pSource, 128 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat4
+(
+    const XMFLOAT4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = pSource->w;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_f32( reinterpret_cast<const float*>(pSource) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_loadu_ps( &pSource->x );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat4A
+(
+    const XMFLOAT4A* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = pSource->w;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps( &pSource->x );
+#endif
+}
+
reinterpret_cast<const __m128i*>(pSource) ); + return _mm_cvtepi32_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt4 +( + const XMUINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x3 +( + const XMFLOAT3X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32( &pSource->m[0][0] ); + float32x4_t v1 = vld1q_f32( &pSource->m[1][1] ); + float32x2_t v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] ); + float32x4_t T = vextq_f32( v0, v1, 3 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T, g_XMMask3 ); + M.r[2] = vcombine_f32( vget_high_f32(v1), v2 ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] ); + __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] ); + __m128 V3 = _mm_load_ss( &pSource->m[2][2] ); + + __m128 T1 = _mm_unpackhi_ps( V1, Z ); + __m128 T2 = _mm_unpacklo_ps( V2, Z ); + __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); + __m128 T4 = _mm_movehl_ps( T2, T3 ); + __m128 T5 = _mm_movehl_ps( Z, T1 ); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps( V1, T1 ); + M.r[1] = _mm_add_ps( T4, T5 ); + M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3 +( + const XMFLOAT4X3* pSource +) +{ + assert(pSource); +#if 
defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32( &pSource->m[0][0] ); + float32x4_t v1 = vld1q_f32( &pSource->m[1][1] ); + float32x4_t v2 = vld1q_f32( &pSource->m[2][2] ); + + float32x4_t T1 = vextq_f32( v0, v1, 3 ); + float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + float32x4_t T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A +( + const XMFLOAT4X3A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 ); + float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 ); + float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 ); + + float32x4_t T1 = 
vextq_f32( v0, v1, 3 ); + float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + float32x4_t T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4 +( + const XMFLOAT4X4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) ); + M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) ); + M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) ); + M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps( &pSource->_11 ); + M.r[1] = _mm_loadu_ps( &pSource->_21 ); + M.r[2] = _mm_loadu_ps( &pSource->_31 ); + M.r[3] = _mm_loadu_ps( &pSource->_41 ); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A +( + const XMFLOAT4X4A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 
pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 ); + M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 ); + M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 ); + M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps( &pSource->_11 ); + M.r[1] = _mm_load_ps( &pSource->_21 ); + M.r[2] = _mm_load_ps( &pSource->_31 ); + M.r[3] = _mm_load_ps( &pSource->_41 ); + return M; +#endif +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32( pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( pDestination, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + 
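// Only the x and y lanes of V are written; z and w are ignored. In the SSE + // path below, _mm_store_ss always stores lane 0, so lane 1 is first splatted + // into a temporary with XM_PERMUTE_PS before the second scalar store. + 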
pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t v = vget_low_s32(V); + v = vcvt_s32_f32( v ); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write two ints + XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + uint32x2_t iv = vcvt_u32_f32( v ); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), iv ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
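+ // In outline: _mm_cvttps_epi32 performs only a *signed* conversion and yields + // the indefinite value 0x80000000 for out-of-range inputs, so values in the + // range [2^31, 2^32) are first reduced by 2^31 (g_XMUnsignedFix) and the high + // bit is folded back in after the conversion, as follows.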
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned only if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write two uints + XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); + vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); + vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + 
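// XMFLOAT3A is declared with 16-byte alignment; the assert below validates + // that before the alignment-hinted NEON store (vst1_f32_ex) and the 64-bit + // SSE store (_mm_storel_epi64) are issued. + 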
assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( &pDestination->z, T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + int32x2_t vL = vget_low_s32(v); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL ); + vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write 3 ints + XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + uint32x2_t vL = vget_low_u32(v); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL ); + vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
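+ // Worked example for one lane, assuming an input of 3221225472.0f (0xC0000000): + // it is >= 2^31, so 2^31 is subtracted giving 1073741824.0f; _mm_cvttps_epi32 + // then yields 0x40000000, and XOR-ing the sign bit back in restores 0xC0000000.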
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned only if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32( pDestination, V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32_ex( pDestination, V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(pDestination), V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->x, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->x, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV 
XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; + pDestination->w = (int32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; + pDestination->w = (uint32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
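+ // NEON has a native float-to-uint32 conversion (vcvtq_u32_f32, used above); + // plain SSE does not (an unsigned convert only arrives with AVX-512's + // _mm_cvttps_epu32), hence the same clamp-and-fixup sequence once more.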
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned only if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2)); + vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 
0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32( &pDestination->m[2][2], T2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0)); + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32_ex( &pDestination->m[0][0], T2, 128 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32_ex( &pDestination->m[1][1], T2, 128 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32_ex( &pDestination->m[2][2], T2, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0],vTemp1); + _mm_store_ps(&pDestination->m[1][1],vTemp2); + _mm_store_ps(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + 
pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->_11, M.r[0] ); + _mm_storeu_ps( &pDestination->_21, M.r[1] ); + _mm_storeu_ps( &pDestination->_31, M.r[2] ); + _mm_storeu_ps( &pDestination->_41, M.r[3] ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->_11, M.r[0] ); + _mm_store_ps( &pDestination->_21, M.r[1] ); + _mm_store_ps( &pDestination->_31, M.r[2] ); + _mm_store_ps( &pDestination->_41, M.r[3] ); +#endif +} + diff --git a/Inc/DirectXMathMatrix.inl b/Inc/DirectXMathMatrix.inl index 79157f3..5257938 100644 --- a/Inc/DirectXMathMatrix.inl +++ b/Inc/DirectXMathMatrix.inl @@ -1,3306 +1,3306 @@ -//------------------------------------------------------------------------------------- -// DirectXMathMatrix.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Matrix - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -// Return true if any entry in the matrix is NaN -inline bool XM_CALLCONV XMMatrixIsNaN -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - size_t i = 16; - const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); - do { - // Fetch value into integer unit - uint32_t uTest = pWork[0]; - // Remove sign - uTest &= 0x7FFFFFFFU; - // NaN is 0x7F800001 through 0x7FFFFFFF inclusive - uTest -= 0x7F800001U; - if (uTest<0x007FFFFFU) { - break; // NaN found - } - ++pWork; // Next entry - } while (--i); - return (i!=0); // i == 0 if nothing matched -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Load in registers - XMVECTOR vX = M.r[0]; - XMVECTOR vY = M.r[1]; - XMVECTOR vZ = M.r[2]; - XMVECTOR vW = M.r[3]; - // Test themselves to check for NaN - vX = vmvnq_u32(vceqq_f32(vX, vX)); - vY = vmvnq_u32(vceqq_f32(vY, vY)); - vZ = vmvnq_u32(vceqq_f32(vZ, vZ)); - vW = vmvnq_u32(vceqq_f32(vW, vW)); - // Or all the results - vX = vorrq_u32(vX,vZ); - vY = vorrq_u32(vY,vW); - vX = vorrq_u32(vX,vY); - // If any tested true, return true - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - return (r != 0); -#elif defined(_XM_SSE_INTRINSICS_) - // Load in registers - XMVECTOR vX = M.r[0]; - XMVECTOR vY = M.r[1]; - XMVECTOR vZ = M.r[2]; - XMVECTOR vW = M.r[3]; - // Test themselves to check for NaN - vX = _mm_cmpneq_ps(vX,vX); - vY = _mm_cmpneq_ps(vY,vY); - vZ = _mm_cmpneq_ps(vZ,vZ); - vW = _mm_cmpneq_ps(vW,vW); - // Or all the results - vX = _mm_or_ps(vX,vZ); - vY = _mm_or_ps(vY,vW); - vX = _mm_or_ps(vX,vY); - // If any tested true, return true - return (_mm_movemask_ps(vX)!=0); -#else -#endif -} - -//------------------------------------------------------------------------------ - -// Return true if any entry in the matrix is +/-INF -inline bool XM_CALLCONV XMMatrixIsInfinite -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - size_t i = 16; - const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); - do { - // Fetch value into integer unit - uint32_t uTest = pWork[0]; - // Remove sign - uTest &= 0x7FFFFFFFU; - // INF is 0x7F800000 - if (uTest==0x7F800000U) { - break; // INF found - } - ++pWork; // Next entry - } while (--i); - return (i!=0); // i == 0 if nothing matched -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bits - XMVECTOR vTemp1 = vandq_u32(M.r[0],g_XMAbsMask); - XMVECTOR vTemp2 = vandq_u32(M.r[1],g_XMAbsMask); - XMVECTOR vTemp3 = vandq_u32(M.r[2],g_XMAbsMask); - XMVECTOR vTemp4 = vandq_u32(M.r[3],g_XMAbsMask); - // Compare to infinity - vTemp1 = vceqq_f32(vTemp1,g_XMInfinity); - vTemp2 = vceqq_f32(vTemp2,g_XMInfinity); - vTemp3 = vceqq_f32(vTemp3,g_XMInfinity); - vTemp4 = vceqq_f32(vTemp4,g_XMInfinity); - // Or the answers together - vTemp1 = vorrq_u32(vTemp1,vTemp2); - vTemp3 = vorrq_u32(vTemp3,vTemp4); - vTemp1 = vorrq_u32(vTemp1,vTemp3); - // If any are infinity, 
the signs are true. - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - return (r != 0); -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bits - XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); - XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); - XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); - XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); - // Compare to infinity - vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); - vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); - vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); - vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); - // Or the answers together - vTemp1 = _mm_or_ps(vTemp1,vTemp2); - vTemp3 = _mm_or_ps(vTemp3,vTemp4); - vTemp1 = _mm_or_ps(vTemp1,vTemp3); - // If any are infinity, the signs are true. - return (_mm_movemask_ps(vTemp1)!=0); -#endif -} - -//------------------------------------------------------------------------------ - -// Return true if the XMMatrix is equal to identity -inline bool XM_CALLCONV XMMatrixIsIdentity -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - // Use the integer pipeline to reduce branching to a minimum - const uint32_t *pWork = (const uint32_t*)(&M.m[0][0]); - // Convert 1.0f to zero and or them together - uint32_t uOne = pWork[0]^0x3F800000U; - // Or all the 0.0f entries together - uint32_t uZero = pWork[1]; - uZero |= pWork[2]; - uZero |= pWork[3]; - // 2nd row - uZero |= pWork[4]; - uOne |= pWork[5]^0x3F800000U; - uZero |= pWork[6]; - uZero |= pWork[7]; - // 3rd row - uZero |= pWork[8]; - uZero |= pWork[9]; - uOne |= pWork[10]^0x3F800000U; - uZero |= pWork[11]; - // 4th row - uZero |= pWork[12]; - uZero |= pWork[13]; - uZero |= pWork[14]; - uOne |= pWork[15]^0x3F800000U; - // If all zero entries are zero, the uZero==0 - uZero &= 0x7FFFFFFF; // Allow -0.0f - // If all 1.0f entries are 1.0f, then uOne==0 - uOne |= uZero; - return (uOne==0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vTemp1 = vceqq_f32(M.r[0],g_XMIdentityR0); - XMVECTOR vTemp2 = vceqq_f32(M.r[1],g_XMIdentityR1); - XMVECTOR vTemp3 = vceqq_f32(M.r[2],g_XMIdentityR2); - XMVECTOR vTemp4 = vceqq_f32(M.r[3],g_XMIdentityR3); - vTemp1 = vandq_u32(vTemp1,vTemp2); - vTemp3 = vandq_u32(vTemp3,vTemp4); - vTemp1 = vandq_u32(vTemp1,vTemp3); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - return ( r == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); - XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); - XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); - XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - vTemp3 = _mm_and_ps(vTemp3,vTemp4); - vTemp1 = _mm_and_ps(vTemp1,vTemp3); - return (_mm_movemask_ps(vTemp1)==0x0f); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -// Perform a 4x4 matrix multiply by a 4x4 matrix -inline XMMATRIX XM_CALLCONV XMMatrixMultiply -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMMATRIX mResult; - // Cache the invariants in registers - float x = M1.m[0][0]; - float y = M1.m[0][1]; - float z = M1.m[0][2]; 
- float w = M1.m[0][3]; - // Perform the operation on the first row - mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[0][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - // Repeat for all the other rows - x = M1.m[1][0]; - y = M1.m[1][1]; - z = M1.m[1][2]; - w = M1.m[1][3]; - mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - x = M1.m[2][0]; - y = M1.m[2][1]; - z = M1.m[2][2]; - w = M1.m[2][3]; - mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - x = M1.m[3][0]; - y = M1.m[3][1]; - z = M1.m[3][2]; - w = M1.m[3][3]; - mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - return mResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX mResult; - float32x2_t VL = vget_low_f32( M1.r[0] ); - float32x2_t VH = vget_high_f32( M1.r[0] ); - // Perform the operation on the first row - XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); - XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); - XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[0] = vaddq_f32( vZ, vW ); - // Repeat for the other 3 rows - VL = vget_low_f32( M1.r[1] ); - VH = vget_high_f32( M1.r[1] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[1] = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[2] ); - VH = vget_high_f32( M1.r[2] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[2] = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[3] ); - VH = vget_high_f32( M1.r[3] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[3] = vaddq_f32( vZ, vW ); - return mResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX mResult; - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - // Perform a binary add to reduce cumulative errors - vX = _mm_add_ps(vX,vZ); - vY = 
_mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[0] = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[1] = vX; - vW = M1.r[2]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[2] = vX; - vW = M1.r[3]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[3] = vX; - return mResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMMATRIX mResult; - // Cache the invariants in registers - float x = M2.m[0][0]; - float y = M2.m[1][0]; - float z = M2.m[2][0]; - float w = M2.m[3][0]; - // Perform the operation on the first row - mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - // Repeat for all the other rows - x = M2.m[0][1]; - y = M2.m[1][1]; - z = M2.m[2][1]; - w = M2.m[3][1]; - mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - x = M2.m[0][2]; - y = M2.m[1][2]; - z = M2.m[2][2]; - w = M2.m[3][2]; - mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - x = M2.m[0][3]; - y = M2.m[1][3]; - z = M2.m[2][3]; - w = M2.m[3][3]; - mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - return mResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32( M1.r[0] ); - float32x2_t VH = vget_high_f32( M1.r[0] ); - // Perform the operation on the 
first row - XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); - XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); - XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r0 = vaddq_f32( vZ, vW ); - // Repeat for the other 3 rows - VL = vget_low_f32( M1.r[1] ); - VH = vget_high_f32( M1.r[1] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r1 = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[2] ); - VH = vget_high_f32( M1.r[2] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r2 = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[3] ); - VH = vget_high_f32( M1.r[3] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r3 = vaddq_f32( vZ, vW ); - - // Transpose result - float32x4x2_t P0 = vzipq_f32( r0, r2 ); - float32x4x2_t P1 = vzipq_f32( r1, r3 ); - - float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); - float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); - - XMMATRIX mResult; - mResult.r[0] = T0.val[0]; - mResult.r[1] = T0.val[1]; - mResult.r[2] = T1.val[0]; - mResult.r[3] = T1.val[1]; - return mResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - // Perform a binary add to reduce cumulative errors - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r0 = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r1 = vX; - vW = M1.r[2]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r2 = vX; - vW = M1.r[3]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r3 = vX; - - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - 
XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); - - XMMATRIX mResult; - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTranspose -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - // Original matrix: - // - // m00m01m02m03 - // m10m11m12m13 - // m20m21m22m23 - // m30m31m32m33 - - XMMATRIX P; - P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 - P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 - P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 - P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 - - XMMATRIX MT; - MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 - MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 - MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 - MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 - return MT; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); - float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); - - float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); - float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); - - XMMATRIX mResult; - mResult.r[0] = T0.val[0]; - mResult.r[1] = T0.val[1]; - mResult.r[2] = T1.val[0]; - mResult.r[3] = T1.val[1]; - return mResult; -#elif defined(_XM_SSE_INTRINSICS_) - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); - XMMATRIX mResult; - - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -#endif -} - -//------------------------------------------------------------------------------ -// Return the inverse and the determinant of a 4x4 matrix -_Use_decl_annotations_ -inline XMMATRIX XM_CALLCONV XMMatrixInverse -( - XMVECTOR* pDeterminant, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMMATRIX MT = XMMatrixTranspose(M); - - XMVECTOR V0[4], V1[4]; - V0[0] = XMVectorSwizzle(MT.r[2]); - V1[0] = XMVectorSwizzle(MT.r[3]); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorSwizzle(MT.r[1]); - V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); - V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); - - XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); - XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); - XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); - - V0[0] = XMVectorSwizzle(MT.r[2]); - V1[0] = XMVectorSwizzle(MT.r[3]); - 
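// The mirrored products formed below are subtracted from D0..D2 (via XMVectorNegativeMultiplySubtract), leaving the signed 2x2 sub-determinants that the Cramer cofactor expansion reuses for each row of the inverse. -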
V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorSwizzle(MT.r[1]); - V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); - V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); - - D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); - D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); - D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); - - V0[0] = XMVectorSwizzle(MT.r[1]); - V1[0] = XMVectorPermute(D0, D2); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorPermute(D0, D2); - V0[2] = XMVectorSwizzle(MT.r[3]); - V1[2] = XMVectorPermute(D1, D2); - V0[3] = XMVectorSwizzle(MT.r[2]); - V1[3] = XMVectorPermute(D1, D2); - - XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); - XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); - XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); - XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); - - V0[0] = XMVectorSwizzle(MT.r[1]); - V1[0] = XMVectorPermute(D0, D2); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorPermute(D0, D2); - V0[2] = XMVectorSwizzle(MT.r[3]); - V1[2] = XMVectorPermute(D1, D2); - V0[3] = XMVectorSwizzle(MT.r[2]); - V1[3] = XMVectorPermute(D1, D2); - - C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); - C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); - C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); - C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); - - V0[0] = XMVectorSwizzle(MT.r[1]); - V1[0] = XMVectorPermute(D0, D2); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorPermute(D0, D2); - V0[2] = XMVectorSwizzle(MT.r[3]); - V1[2] = XMVectorPermute(D1, D2); - V0[3] = XMVectorSwizzle(MT.r[2]); - V1[3] = XMVectorPermute(D1, D2); - - XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); - C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); - XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); - C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); - XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); - C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); - XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); - C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); - - XMMATRIX R; - R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); - R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); - R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); - R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); - - XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); - - if (pDeterminant != nullptr) - *pDeterminant = Determinant; - - XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); - - XMMATRIX Result; - Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); - Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); - Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); - Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX MT = XMMatrixTranspose(M); - XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0)); - XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2)); - XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0)); - XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2)); - XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); - XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); - - XMVECTOR D0 = _mm_mul_ps(V00,V10); - XMVECTOR D1 = _mm_mul_ps(V01,V11); - XMVECTOR D2 = _mm_mul_ps(V02,V12); - - V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2)); - V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0)); - V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2)); - V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0)); - V02 = 
_mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); - V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); - - V00 = _mm_mul_ps(V00,V10); - V01 = _mm_mul_ps(V01,V11); - V02 = _mm_mul_ps(V02,V12); - D0 = _mm_sub_ps(D0,V00); - D1 = _mm_sub_ps(D1,V01); - D2 = _mm_sub_ps(D2,V02); - // V11 = D0Y,D0W,D2Y,D2Y - V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); - V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1)); - V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); - V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2)); - V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); - // V13 = D1Y,D1W,D2W,D2W - XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); - V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1)); - V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); - XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2)); - V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); - - XMVECTOR C0 = _mm_mul_ps(V00,V10); - XMVECTOR C2 = _mm_mul_ps(V01,V11); - XMVECTOR C4 = _mm_mul_ps(V02,V12); - XMVECTOR C6 = _mm_mul_ps(V03,V13); - - // V11 = D0X,D0Y,D2X,D2X - V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); - V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2)); - V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); - V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3)); - V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); - // V13 = D1X,D1Y,D2Z,D2Z - V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); - V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2)); - V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); - V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3)); - V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); - - V00 = _mm_mul_ps(V00,V10); - V01 = _mm_mul_ps(V01,V11); - V02 = _mm_mul_ps(V02,V12); - V03 = _mm_mul_ps(V03,V13); - C0 = _mm_sub_ps(C0,V00); - C2 = _mm_sub_ps(C2,V01); - C4 = _mm_sub_ps(C4,V02); - C6 = _mm_sub_ps(C6,V03); - - V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3)); - // V10 = D0Z,D0Z,D2X,D2Y - V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); - V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0)); - V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1)); - // V11 = D0X,D0W,D2X,D2Y - V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); - V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3)); - V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3)); - // V12 = D1Z,D1Z,D2Z,D2W - V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); - V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0)); - V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1)); - // V13 = D1X,D1W,D2Z,D2W - V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); - V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3)); - - V00 = _mm_mul_ps(V00,V10); - V01 = _mm_mul_ps(V01,V11); - V02 = _mm_mul_ps(V02,V12); - V03 = _mm_mul_ps(V03,V13); - XMVECTOR C1 = _mm_sub_ps(C0,V00); - C0 = _mm_add_ps(C0,V00); - XMVECTOR C3 = _mm_add_ps(C2,V01); - C2 = _mm_sub_ps(C2,V01); - XMVECTOR C5 = _mm_sub_ps(C4,V02); - C4 = _mm_add_ps(C4,V02); - XMVECTOR C7 = _mm_add_ps(C6,V03); - C6 = _mm_sub_ps(C6,V03); - - C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); - C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); - C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); - C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); - C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0)); - C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0)); - C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0)); - C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0)); - // Get the determinant - XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); - if (pDeterminant != nullptr) - *pDeterminant = vTemp; - vTemp = _mm_div_ps(g_XMOne,vTemp); - XMMATRIX mResult; - mResult.r[0] = 
_mm_mul_ps(C0,vTemp); - mResult.r[1] = _mm_mul_ps(C2,vTemp); - mResult.r[2] = _mm_mul_ps(C4,vTemp); - mResult.r[3] = _mm_mul_ps(C6,vTemp); - return mResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMMatrixDeterminant -( - FXMMATRIX M -) -{ - static const XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f}; - - XMVECTOR V0 = XMVectorSwizzle(M.r[2]); - XMVECTOR V1 = XMVectorSwizzle(M.r[3]); - XMVECTOR V2 = XMVectorSwizzle(M.r[2]); - XMVECTOR V3 = XMVectorSwizzle(M.r[3]); - XMVECTOR V4 = XMVectorSwizzle(M.r[2]); - XMVECTOR V5 = XMVectorSwizzle(M.r[3]); - - XMVECTOR P0 = XMVectorMultiply(V0, V1); - XMVECTOR P1 = XMVectorMultiply(V2, V3); - XMVECTOR P2 = XMVectorMultiply(V4, V5); - - V0 = XMVectorSwizzle(M.r[2]); - V1 = XMVectorSwizzle(M.r[3]); - V2 = XMVectorSwizzle(M.r[2]); - V3 = XMVectorSwizzle(M.r[3]); - V4 = XMVectorSwizzle(M.r[2]); - V5 = XMVectorSwizzle(M.r[3]); - - P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); - P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); - P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); - - V0 = XMVectorSwizzle(M.r[1]); - V1 = XMVectorSwizzle(M.r[1]); - V2 = XMVectorSwizzle(M.r[1]); - - XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); - XMVECTOR R = XMVectorMultiply(V0, P0); - R = XMVectorNegativeMultiplySubtract(V1, P1, R); - R = XMVectorMultiplyAdd(V2, P2, R); - - return XMVector4Dot(S, R); -} - -#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ - if((x) < (y)) \ - { \ - if((y) < (z)) \ - { \ - (a) = 2; \ - (b) = 1; \ - (c) = 0; \ - } \ - else \ - { \ - (a) = 1; \ - \ - if((x) < (z)) \ - { \ - (b) = 2; \ - (c) = 0; \ - } \ - else \ - { \ - (b) = 0; \ - (c) = 2; \ - } \ - } \ - } \ - else \ - { \ - if((x) < (z)) \ - { \ - (a) = 2; \ - (b) = 0; \ - (c) = 1; \ - } \ - else \ - { \ - (a) = 0; \ - \ - if((y) < (z)) \ - { \ - (b) = 2; \ - (c) = 1; \ - } \ - else \ - { \ - (b) = 1; \ - (c) = 2; \ - } \ - } \ - } - -#define XM3_DECOMP_EPSILON 0.0001f - -_Use_decl_annotations_ -inline bool XM_CALLCONV XMMatrixDecompose -( - XMVECTOR *outScale, - XMVECTOR *outRotQuat, - XMVECTOR *outTrans, - FXMMATRIX M -) -{ - static const XMVECTOR *pvCanonicalBasis[3] = { - &g_XMIdentityR0.v, - &g_XMIdentityR1.v, - &g_XMIdentityR2.v - }; - - assert( outScale != nullptr ); - assert( outRotQuat != nullptr ); - assert( outTrans != nullptr ); - - // Get the translation - outTrans[0] = M.r[3]; - - XMVECTOR *ppvBasis[3]; - XMMATRIX matTemp; - ppvBasis[0] = &matTemp.r[0]; - ppvBasis[1] = &matTemp.r[1]; - ppvBasis[2] = &matTemp.r[2]; - - matTemp.r[0] = M.r[0]; - matTemp.r[1] = M.r[1]; - matTemp.r[2] = M.r[2]; - matTemp.r[3] = g_XMIdentityR3.v; - - float *pfScales = (float *)outScale; - - size_t a, b, c; - XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0])); - XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0])); - XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0])); - pfScales[3] = 0.f; - - XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) - - if(pfScales[a] < XM3_DECOMP_EPSILON) - { - ppvBasis[a][0] = pvCanonicalBasis[a][0]; - } - ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); - - if(pfScales[b] < XM3_DECOMP_EPSILON) - { - size_t aa, bb, cc; - float fAbsX, fAbsY, fAbsZ; - - fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); - fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); - fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); - - XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) - - ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]); - } - - ppvBasis[b][0] 
= XMVector3Normalize(ppvBasis[b][0]); - - if(pfScales[c] < XM3_DECOMP_EPSILON) - { - ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]); - } - - ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); - - float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); - - // use Cramer's rule to check for handedness of coordinate system - if(fDet < 0.0f) - { - // switch coordinate system by negating the scale and inverting the basis vector on the x-axis - pfScales[a] = -pfScales[a]; - ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); - - fDet = -fDet; - } - - fDet -= 1.0f; - fDet *= fDet; - - if(XM3_DECOMP_EPSILON < fDet) - { - // Non-SRT matrix encountered - return false; - } - - // generate the quaternion from the matrix - outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); - return true; -} - -#undef XM3_DECOMP_EPSILON -#undef XM3RANKDECOMPOSE - -//------------------------------------------------------------------------------ -// Transformation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixIdentity() -{ - XMMATRIX M; - M.r[0] = g_XMIdentityR0.v; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = g_XMIdentityR3.v; - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixSet -( - float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33 -) -{ - XMMATRIX M; -#if defined(_XM_NO_INTRINSICS_) - M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; - M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; - M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; - M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; -#else - M.r[0] = XMVectorSet(m00, m01, m02, m03); - M.r[1] = XMVectorSet(m10, m11, m12, m13); - M.r[2] = XMVectorSet(m20, m21, m22, m23); - M.r[3] = XMVectorSet(m30, m31, m32, m33); -#endif - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTranslation -( - float OffsetX, - float OffsetY, - float OffsetZ -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = 1.0f; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 1.0f; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = 1.0f; - M.m[2][3] = 0.0f; - - M.m[3][0] = OffsetX; - M.m[3][1] = OffsetY; - M.m[3][2] = OffsetZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX M; - M.r[0] = g_XMIdentityR0.v; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f ); - return M; -#endif -} - - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector -( - FXMVECTOR Offset -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = 1.0f; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 1.0f; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = 1.0f; - M.m[2][3] = 0.0f; - - M.m[3][0] = Offset.vector4_f32[0]; - 
M.m[3][1] = Offset.vector4_f32[1]; - M.m[3][2] = Offset.vector4_f32[2]; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX M; - M.r[0] = g_XMIdentityR0.v; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = XMVectorSelect( g_XMIdentityR3.v, Offset, g_XMSelect1110.v ); - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixScaling -( - float ScaleX, - float ScaleY, - float ScaleZ -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = ScaleX; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = ScaleY; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = ScaleZ; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( ScaleX, Zero, 0 ); - M.r[1] = vsetq_lane_f32( ScaleY, Zero, 1 ); - M.r[2] = vsetq_lane_f32( ScaleZ, Zero, 2 ); - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX ); - M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 ); - M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 ); - M.r[3] = g_XMIdentityR3.v; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector -( - FXMVECTOR Scale -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = Scale.vector4_f32[0]; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = Scale.vector4_f32[1]; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = Scale.vector4_f32[2]; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX M; - M.r[0] = vandq_u32(Scale,g_XMMaskX); - M.r[1] = vandq_u32(Scale,g_XMMaskY); - M.r[2] = vandq_u32(Scale,g_XMMaskZ); - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - M.r[0] = _mm_and_ps(Scale,g_XMMaskX); - M.r[1] = _mm_and_ps(Scale,g_XMMaskY); - M.r[2] = _mm_and_ps(Scale,g_XMMaskZ); - M.r[3] = g_XMIdentityR3.v; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationX -( - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMMATRIX M; - M.m[0][0] = 1.0f; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = fCosAngle; - M.m[1][2] = fSinAngle; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = -fSinAngle; - M.m[2][2] = fCosAngle; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - const XMVECTOR Zero = vdupq_n_f32(0); - - XMVECTOR T1 = vsetq_lane_f32( fCosAngle, Zero, 1 ); - T1 = vsetq_lane_f32( fSinAngle, T1, 2 ); - - XMVECTOR T2 = vsetq_lane_f32( -fSinAngle, Zero, 1 ); - T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); - - XMMATRIX M; - M.r[0] = 
g_XMIdentityR0.v; - M.r[1] = T1; - M.r[2] = T2; - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinAngle; - float CosAngle; - XMScalarSinCos(&SinAngle, &CosAngle, Angle); - - XMVECTOR vSin = _mm_set_ss(SinAngle); - XMVECTOR vCos = _mm_set_ss(CosAngle); - // x = 0,y = cos,z = sin, w = 0 - vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3)); - XMMATRIX M; - M.r[0] = g_XMIdentityR0; - M.r[1] = vCos; - // x = 0,y = sin,z = cos, w = 0 - vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0)); - // x = 0,y = -sin,z = cos, w = 0 - vCos = _mm_mul_ps(vCos,g_XMNegateY); - M.r[2] = vCos; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationY -( - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMMATRIX M; - M.m[0][0] = fCosAngle; - M.m[0][1] = 0.0f; - M.m[0][2] = -fSinAngle; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 1.0f; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = fSinAngle; - M.m[2][1] = 0.0f; - M.m[2][2] = fCosAngle; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - const XMVECTOR Zero = vdupq_n_f32(0); - - XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); - T0 = vsetq_lane_f32( -fSinAngle, T0, 2 ); - - XMVECTOR T2 = vsetq_lane_f32( fSinAngle, Zero, 0 ); - T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); - - XMMATRIX M; - M.r[0] = T0; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = T2; - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinAngle; - float CosAngle; - XMScalarSinCos(&SinAngle, &CosAngle, Angle); - - XMVECTOR vSin = _mm_set_ss(SinAngle); - XMVECTOR vCos = _mm_set_ss(CosAngle); - // x = sin,y = 0,z = cos, w = 0 - vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0)); - XMMATRIX M; - M.r[2] = vSin; - M.r[1] = g_XMIdentityR1; - // x = cos,y = 0,z = sin, w = 0 - vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2)); - // x = cos,y = 0,z = -sin, w = 0 - vSin = _mm_mul_ps(vSin,g_XMNegateZ); - M.r[0] = vSin; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationZ -( - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMMATRIX M; - M.m[0][0] = fCosAngle; - M.m[0][1] = fSinAngle; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = -fSinAngle; - M.m[1][1] = fCosAngle; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = 1.0f; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - const XMVECTOR Zero = vdupq_n_f32(0); - - XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); - T0 = vsetq_lane_f32( fSinAngle, T0, 1 ); - - XMVECTOR T1 = vsetq_lane_f32( -fSinAngle, Zero, 0 ); - T1 = vsetq_lane_f32( fCosAngle, T1, 1 ); - - XMMATRIX M; - M.r[0] = T0; - M.r[1] = T1; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = g_XMIdentityR3.v; - return M; -#elif 
defined(_XM_SSE_INTRINSICS_) - float SinAngle; - float CosAngle; - XMScalarSinCos(&SinAngle, &CosAngle, Angle); - - XMVECTOR vSin = _mm_set_ss(SinAngle); - XMVECTOR vCos = _mm_set_ss(CosAngle); - // x = cos,y = sin,z = 0, w = 0 - vCos = _mm_unpacklo_ps(vCos,vSin); - XMMATRIX M; - M.r[0] = vCos; - // x = sin,y = cos,z = 0, w = 0 - vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1)); - // x = cos,y = -sin,z = 0, w = 0 - vCos = _mm_mul_ps(vCos,g_XMNegateX); - M.r[1] = vCos; - M.r[2] = g_XMIdentityR2; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw -( - float Pitch, - float Yaw, - float Roll -) -{ - XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); - return XMMatrixRotationRollPitchYawFromVector(Angles); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector -( - FXMVECTOR Angles // -) -{ - XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles); - return XMMatrixRotationQuaternion(Q); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal -( - FXMVECTOR NormalAxis, - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); - - XMVECTOR C2 = XMVectorSplatZ(A); - XMVECTOR C1 = XMVectorSplatY(A); - XMVECTOR C0 = XMVectorSplatX(A); - - XMVECTOR N0 = XMVectorSwizzle(NormalAxis); - XMVECTOR N1 = XMVectorSwizzle(NormalAxis); - - XMVECTOR V0 = XMVectorMultiply(C2, N0); - V0 = XMVectorMultiply(V0, N1); - - XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis); - R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); - - XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); - XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); - - V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); - XMVECTOR V1 = XMVectorPermute(R1, R2); - XMVECTOR V2 = XMVectorPermute(R1, R2); - - XMMATRIX M; - M.r[0] = XMVectorPermute(V0, V1); - M.r[1] = XMVectorPermute(V0, V1); - M.r[2] = XMVectorPermute(V0, V2); - M.r[3] = g_XMIdentityR3.v; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle); - XMVECTOR C1 = _mm_set_ps1(fCosAngle); - XMVECTOR C0 = _mm_set_ps1(fSinAngle); - - XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1)); - XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2)); - - XMVECTOR V0 = _mm_mul_ps(C2, N0); - V0 = _mm_mul_ps(V0, N1); - - XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis); - R0 = _mm_mul_ps(R0, NormalAxis); - R0 = _mm_add_ps(R0, C1); - - XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis); - R1 = _mm_add_ps(R1, V0); - XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis); - R2 = _mm_sub_ps(V0,R2); - - V0 = _mm_and_ps(R0,g_XMMask3); - XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0)); - V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1)); - XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1)); - V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0)); - - R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0)); - R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0)); - - XMMATRIX M; - M.r[0] = R2; - - R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1)); - R2 = 
XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2)); - M.r[1] = R2; - - V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0)); - M.r[2] = V2; - M.r[3] = g_XMIdentityR3.v; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis -( - FXMVECTOR Axis, - float Angle -) -{ - assert(!XMVector3Equal(Axis, XMVectorZero())); - assert(!XMVector3IsInfinite(Axis)); - - XMVECTOR Normal = XMVector3Normalize(Axis); - return XMMatrixRotationNormal(Normal, Angle); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion -( - FXMVECTOR Quaternion -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f}; - - XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion); - XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0); - - XMVECTOR V0 = XMVectorPermute(Q1, Constant1110.v); - XMVECTOR V1 = XMVectorPermute(Q1, Constant1110.v); - XMVECTOR R0 = XMVectorSubtract(Constant1110, V0); - R0 = XMVectorSubtract(R0, V1); - - V0 = XMVectorSwizzle(Quaternion); - V1 = XMVectorSwizzle(Q0); - V0 = XMVectorMultiply(V0, V1); - - V1 = XMVectorSplatW(Quaternion); - XMVECTOR V2 = XMVectorSwizzle(Q0); - V1 = XMVectorMultiply(V1, V2); - - XMVECTOR R1 = XMVectorAdd(V0, V1); - XMVECTOR R2 = XMVectorSubtract(V0, V1); - - V0 = XMVectorPermute(R1, R2); - V1 = XMVectorPermute(R1, R2); - - XMMATRIX M; - M.r[0] = XMVectorPermute(R0, V0); - M.r[1] = XMVectorPermute(R0, V0); - M.r[2] = XMVectorPermute(R0, V1); - M.r[3] = g_XMIdentityR3.v; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f}; - - XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion); - XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0); - - XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1)); - V0 = _mm_and_ps(V0,g_XMMask3); - XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2)); - V1 = _mm_and_ps(V1,g_XMMask3); - XMVECTOR R0 = _mm_sub_ps(Constant1110,V0); - R0 = _mm_sub_ps(R0, V1); - - V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0)); - V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2)); - V0 = _mm_mul_ps(V0, V1); - - V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3)); - XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1)); - V1 = _mm_mul_ps(V1, V2); - - XMVECTOR R1 = _mm_add_ps(V0, V1); - XMVECTOR R2 = _mm_sub_ps(V0, V1); - - V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1)); - V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0)); - V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0)); - V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0)); - - Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0)); - Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0)); - - XMMATRIX M; - M.r[0] = Q1; - - Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1)); - Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2)); - M.r[1] = Q1; - - Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0)); - M.r[2] = Q1; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D -( - FXMVECTOR ScalingOrigin, - float ScalingOrientation, - FXMVECTOR Scaling, - FXMVECTOR RotationOrigin, - float Rotation, - GXMVECTOR Translation -) -{ - // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * - // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMVECTOR VScalingOrigin = 
XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); - XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); - - XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); - XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); - XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); - XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); - XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); - XMMATRIX MRotation = XMMatrixRotationZ(Rotation); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); - - XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); - M = XMMatrixMultiply(M, MScaling); - M = XMMatrixMultiply(M, MScalingOrientation); - M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); - M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTransformation -( - FXMVECTOR ScalingOrigin, - FXMVECTOR ScalingOrientationQuaternion, - FXMVECTOR Scaling, - GXMVECTOR RotationOrigin, - HXMVECTOR RotationQuaternion, - HXMVECTOR Translation -) -{ - // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * - // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); - XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); - - XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); - XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); - XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); - XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); - XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); - - XMMATRIX M; - M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); - M = XMMatrixMultiply(M, MScaling); - M = XMMatrixMultiply(M, MScalingOrientation); - M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); - M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D -( - FXMVECTOR Scaling, - FXMVECTOR RotationOrigin, - float Rotation, - FXMVECTOR Translation -) -{ - // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); - XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); - XMMATRIX MRotation = XMMatrixRotationZ(Rotation); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); - - XMMATRIX M; - M = MScaling; - M.r[3] = 
XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation -( - FXMVECTOR Scaling, - FXMVECTOR RotationOrigin, - FXMVECTOR RotationQuaternion, - GXMVECTOR Translation -) -{ - // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v); - XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v); - - XMMATRIX M; - M = MScaling; - M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixReflect -( - FXMVECTOR ReflectionPlane -) -{ - assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); - assert(!XMPlaneIsInfinite(ReflectionPlane)); - - static const XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f}; - - XMVECTOR P = XMPlaneNormalize(ReflectionPlane); - XMVECTOR S = XMVectorMultiply(P, NegativeTwo); - - XMVECTOR A = XMVectorSplatX(P); - XMVECTOR B = XMVectorSplatY(P); - XMVECTOR C = XMVectorSplatZ(P); - XMVECTOR D = XMVectorSplatW(P); - - XMMATRIX M; - M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); - M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); - M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); - M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixShadow -( - FXMVECTOR ShadowPlane, - FXMVECTOR LightPosition -) -{ - static const XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1}; - - assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); - assert(!XMPlaneIsInfinite(ShadowPlane)); - - XMVECTOR P = XMPlaneNormalize(ShadowPlane); - XMVECTOR Dot = XMPlaneDot(P, LightPosition); - P = XMVectorNegate(P); - XMVECTOR D = XMVectorSplatW(P); - XMVECTOR C = XMVectorSplatZ(P); - XMVECTOR B = XMVectorSplatY(P); - XMVECTOR A = XMVectorSplatX(P); - Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); - - XMMATRIX M; - M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); - Dot = XMVectorRotateLeft(Dot, 1); - M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); - Dot = XMVectorRotateLeft(Dot, 1); - M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); - Dot = XMVectorRotateLeft(Dot, 1); - M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); - return M; -} - -//------------------------------------------------------------------------------ -// View and projection initialization operations -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH -( - FXMVECTOR EyePosition, - FXMVECTOR FocusPosition, - FXMVECTOR UpDirection -) -{ - XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); - return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); -} - 
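// Minimal usage sketch (illustrative only; the eye/focus/up values below are hypothetical and not part of the library): - // XMVECTOR Eye = XMVectorSet( 0.0f, 2.0f, -5.0f, 1.0f ); - // XMVECTOR Focus = XMVectorSet( 0.0f, 0.0f, 0.0f, 1.0f ); - // XMVECTOR Up = XMVectorSet( 0.0f, 1.0f, 0.0f, 0.0f ); - // XMMATRIX View = XMMatrixLookAtLH( Eye, Focus, Up ); -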
-//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH -( - FXMVECTOR EyePosition, - FXMVECTOR FocusPosition, - FXMVECTOR UpDirection -) -{ - XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); - return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookToLH -( - FXMVECTOR EyePosition, - FXMVECTOR EyeDirection, - FXMVECTOR UpDirection -) -{ - assert(!XMVector3Equal(EyeDirection, XMVectorZero())); - assert(!XMVector3IsInfinite(EyeDirection)); - assert(!XMVector3Equal(UpDirection, XMVectorZero())); - assert(!XMVector3IsInfinite(UpDirection)); - - XMVECTOR R2 = XMVector3Normalize(EyeDirection); - - XMVECTOR R0 = XMVector3Cross(UpDirection, R2); - R0 = XMVector3Normalize(R0); - - XMVECTOR R1 = XMVector3Cross(R2, R0); - - XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); - - XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); - XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); - XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); - - XMMATRIX M; - M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); - M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); - M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); - M.r[3] = g_XMIdentityR3.v; - - M = XMMatrixTranspose(M); - - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookToRH -( - FXMVECTOR EyePosition, - FXMVECTOR EyeDirection, - FXMVECTOR UpDirection -) -{ - XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); - return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); -} - -//------------------------------------------------------------------------------ - -#pragma prefast(push) -#pragma prefast(disable:28931, "PREfast noise: Esp:1266") - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (FarZ - NearZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ / ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (FarZ - NearZ); - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (FarZ - NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ / ViewWidth, - TwoNearZ / ViewHeight, - fRange, - -fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = 
_mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=-fRange * NearZ,0,1.0f - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,1.0f - vTemp = _mm_setzero_ps(); - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,-fRange * NearZ,0 - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (NearZ - FarZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ / ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = -1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (NearZ - FarZ); - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ / ViewWidth, - TwoNearZ / ViewHeight, - fRange, - fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=fRange * NearZ,0,-1.0f - vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,-1.0f - vTemp = _mm_setzero_ps(); - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,fRange * NearZ,0 - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH -( - float FovAngleY, - float AspectRatio, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - float fRange = 
FarZ / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = Width; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = Height; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float fRange = FarZ / (FarZ-NearZ); - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); - M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float fRange = FarZ / (FarZ-NearZ); - // Note: This is recorded on the stack - float Height = CosFov / SinFov; - XMVECTOR rMem = { - Height / AspectRatio, - Height, - fRange, - -fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // Height / AspectRatio,0,0,0 - XMMATRIX M; - M.r[0] = vTemp; - // 0,Height,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=-fRange * NearZ,0,1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,1.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,-fRange * NearZ,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH -( - float FovAngleY, - float AspectRatio, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - float fRange = FarZ / (NearZ-FarZ); - - XMMATRIX M; - M.m[0][0] = Width; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = Height; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = -1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - float fRange = FarZ / (NearZ-FarZ); - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); - M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinFov; - float CosFov; - 
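// Height = CosFov / SinFov is the cotangent of the half field of view and becomes the vertical projection scale; the horizontal scale below divides Height by AspectRatio. -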
XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - float fRange = FarZ / (NearZ-FarZ); - // Note: This is recorded on the stack - float Height = CosFov / SinFov; - XMVECTOR rMem = { - Height / AspectRatio, - Height, - fRange, - fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // Height / AspectRatio,0,0,0 - XMMATRIX M; - M.r[0] = vTemp; - // 0,Height,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=fRange * NearZ,0,-1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,-1.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,fRange * NearZ,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ * ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ * ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; - M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; - M.m[2][2] = fRange; - M.m[2][3] = 1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (FarZ-NearZ); - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); - M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - fRange, - 1.0f); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ+NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (FarZ-NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ*ReciprocalWidth, - TwoNearZ*ReciprocalHeight, - -fRange * NearZ, - 0 - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ*ReciprocalWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ*ReciprocalHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // -(ViewLeft + ViewRight) * ReciprocalWidth,-(ViewTop + ViewBottom) * ReciprocalHeight,fRange,1.0f - M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop 
+ ViewBottom) * ReciprocalHeight, - fRange, - 1.0f ); - // 0,0,-fRange * NearZ,0.0f - vValues = _mm_and_ps(vValues,g_XMMaskZ); - M.r[3] = vValues; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (NearZ-FarZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ * ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ * ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; - M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; - M.m[2][2] = fRange; - M.m[2][3] = -1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (NearZ-FarZ); - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); - M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, - (ViewTop + ViewBottom) * ReciprocalHeight, - fRange, - -1.0f); - M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ+NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ*ReciprocalWidth, - TwoNearZ*ReciprocalHeight, - fRange * NearZ, - 0 - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ*ReciprocalWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ*ReciprocalHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // (ViewLeft + ViewRight) * ReciprocalWidth,(ViewTop + ViewBottom) * ReciprocalHeight,fRange,-1.0f - M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth, - (ViewTop + ViewBottom) * ReciprocalHeight, - fRange, - -1.0f ); - // 0,0,fRange * NearZ,0.0f - vValues = _mm_and_ps(vValues,g_XMMaskZ); - M.r[3] = vValues; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float fRange = 1.0f / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = 2.0f / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 2.0f / 
ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fRange = 1.0f / (FarZ-NearZ); - - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, g_XMIdentityR3.v, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fRange = 1.0f / (FarZ-NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - 2.0f / ViewWidth, - 2.0f / ViewHeight, - fRange, - -fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // 2.0f / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,2.0f / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=-fRange * NearZ,0,1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); - M.r[2] = vTemp; - // 0,0,-fRange * NearZ,1.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float fRange = 1.0f / (NearZ-FarZ); - - XMMATRIX M; - M.m[0][0] = 2.0f / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 2.0f / ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fRange = 1.0f / (NearZ-FarZ); - - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = vsetq_lane_f32( fRange * NearZ, g_XMIdentityR3.v, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fRange = 1.0f / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - 2.0f / ViewWidth, - 2.0f / ViewHeight, - fRange, - fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // 2.0f / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,2.0f / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=fRange * NearZ,0,1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); - M.r[2] = vTemp; - // 0,0,fRange * NearZ,1.0f - vTemp = 
_mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = ReciprocalWidth + ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = ReciprocalHeight + ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; - M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (FarZ-NearZ); - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - -fRange * NearZ, - 1.0f); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (FarZ-NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - fReciprocalWidth, - fReciprocalHeight, - fRange, - 1.0f - }; - XMVECTOR rMem2 = { - -(ViewLeft + ViewRight), - -(ViewTop + ViewBottom), - -NearZ, - 1.0f - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // fReciprocalWidth*2,0,0,0 - vTemp = _mm_add_ss(vTemp,vTemp); - M.r[0] = vTemp; - // 0,fReciprocalHeight*2,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - vTemp = _mm_add_ps(vTemp,vTemp); - M.r[1] = vTemp; - // 0,0,fRange,0.0f - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskZ); - M.r[2] = vTemp; - // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f - vValues = _mm_mul_ps(vValues,rMem2); - M.r[3] = vValues; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (NearZ-FarZ); - - XMMATRIX M; - 
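// fRange = 1/(NearZ-FarZ): right-handed view depth runs along -z, so z' = z*fRange + fRange*NearZ maps z = -NearZ to 0 and z = -FarZ to 1. -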
M.m[0][0] = ReciprocalWidth + ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = ReciprocalHeight + ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - fRange * NearZ, - 1.0f); - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (NearZ-FarZ); - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - fRange * NearZ, - 1.0f); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - fReciprocalWidth, - fReciprocalHeight, - fRange, - 1.0f - }; - XMVECTOR rMem2 = { - -(ViewLeft + ViewRight), - -(ViewTop + ViewBottom), - NearZ, - 1.0f - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // fReciprocalWidth*2,0,0,0 - vTemp = _mm_add_ss(vTemp,vTemp); - M.r[0] = vTemp; - // 0,fReciprocalHeight*2,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - vTemp = _mm_add_ps(vTemp,vTemp); - M.r[1] = vTemp; - // 0,0,fRange,0.0f - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskZ); - M.r[2] = vTemp; - // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f - vValues = _mm_mul_ps(vValues,rMem2); - M.r[3] = vValues; - return M; -#endif -} - -#pragma prefast(pop) - -/**************************************************************************** - * - * XMMATRIX operators and methods - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline XMMATRIX::XMMATRIX -( - float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33 -) -{ - r[0] = XMVectorSet(m00, m01, m02, m03); - r[1] = XMVectorSet(m10, m11, m12, m13); - r[2] = XMVectorSet(m20, m21, m22, m23); - r[3] = XMVectorSet(m30, m31, m32, m33); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMMATRIX::XMMATRIX -( - const float* pArray -) -{ - assert( pArray != nullptr ); - r[0] = XMLoadFloat4((const XMFLOAT4*)pArray); - r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4)); - r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8)); - r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12)); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XMMATRIX::operator- () const -{ - XMMATRIX R; - R.r[0] = XMVectorNegate( r[0] ); - R.r[1] = XMVectorNegate( r[1] ); - R.r[2] = XMVectorNegate( r[2] ); - R.r[3] = XMVectorNegate( r[3] ); - 
return R; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) -{ - r[0] = XMVectorAdd( r[0], M.r[0] ); - r[1] = XMVectorAdd( r[1], M.r[1] ); - r[2] = XMVectorAdd( r[2], M.r[2] ); - r[3] = XMVectorAdd( r[3], M.r[3] ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) -{ - r[0] = XMVectorSubtract( r[0], M.r[0] ); - r[1] = XMVectorSubtract( r[1], M.r[1] ); - r[2] = XMVectorSubtract( r[2], M.r[2] ); - r[3] = XMVectorSubtract( r[3], M.r[3] ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) -{ - *this = XMMatrixMultiply( *this, M ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XMMATRIX::operator*= (float S) -{ - r[0] = XMVectorScale( r[0], S ); - r[1] = XMVectorScale( r[1], S ); - r[2] = XMVectorScale( r[2], S ); - r[3] = XMVectorScale( r[3], S ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XMMATRIX::operator/= (float S) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vS = XMVectorReplicate( S ); - r[0] = XMVectorDivide( r[0], vS ); - r[1] = XMVectorDivide( r[1], vS ); - r[2] = XMVectorDivide( r[2], vS ); - r[3] = XMVectorDivide( r[3], vS ); - return *this; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x2_t vS = vdup_n_f32( S ); - float32x2_t R0 = vrecpe_f32( vS ); - float32x2_t S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - float32x4_t Reciprocal = vcombine_u32(R0, R0); - r[0] = vmulq_f32( r[0], Reciprocal ); - r[1] = vmulq_f32( r[1], Reciprocal ); - r[2] = vmulq_f32( r[2], Reciprocal ); - r[3] = vmulq_f32( r[3], Reciprocal ); - return *this; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 vS = _mm_set_ps1( S ); - r[0] = _mm_div_ps( r[0], vS ); - r[1] = _mm_div_ps( r[1], vS ); - r[2] = _mm_div_ps( r[2], vS ); - r[3] = _mm_div_ps( r[3], vS ); - return *this; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const -{ - XMMATRIX R; - R.r[0] = XMVectorAdd( r[0], M.r[0] ); - R.r[1] = XMVectorAdd( r[1], M.r[1] ); - R.r[2] = XMVectorAdd( r[2], M.r[2] ); - R.r[3] = XMVectorAdd( r[3], M.r[3] ); - return R; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const -{ - XMMATRIX R; - R.r[0] = XMVectorSubtract( r[0], M.r[0] ); - R.r[1] = XMVectorSubtract( r[1], M.r[1] ); - R.r[2] = XMVectorSubtract( r[2], M.r[2] ); - R.r[3] = XMVectorSubtract( r[3], M.r[3] ); - return R; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const -{ - return XMMatrixMultiply(*this, M); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XMMATRIX::operator* (float S) const -{ - XMMATRIX R; - R.r[0] = XMVectorScale( r[0], S ); - R.r[1] = XMVectorScale( r[1], S ); - R.r[2] = XMVectorScale( r[2], S ); - R.r[3] = XMVectorScale( r[3], S ); - return R; -} - 
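-// Illustrative usage (hypothetical 'world' matrix): the scalar operators act
-// element-wise on all 16 entries, so world * 0.5f and world / 2.0f agree up
-// to rounding (the NEON divide path below uses a refined reciprocal estimate):
-//   XMMATRIX halved     = world * 0.5f;
-//   XMMATRIX sameHalved = world / 2.0f;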
-//------------------------------------------------------------------------------ - -inline XMMATRIX XMMATRIX::operator/ (float S) const -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vS = XMVectorReplicate( S ); - XMMATRIX R; - R.r[0] = XMVectorDivide( r[0], vS ); - R.r[1] = XMVectorDivide( r[1], vS ); - R.r[2] = XMVectorDivide( r[2], vS ); - R.r[3] = XMVectorDivide( r[3], vS ); - return R; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x2_t vS = vdup_n_f32( S ); - float32x2_t R0 = vrecpe_f32( vS ); - float32x2_t S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - float32x4_t Reciprocal = vcombine_u32(R0, R0); - XMMATRIX R; - R.r[0] = vmulq_f32( r[0], Reciprocal ); - R.r[1] = vmulq_f32( r[1], Reciprocal ); - R.r[2] = vmulq_f32( r[2], Reciprocal ); - R.r[3] = vmulq_f32( r[3], Reciprocal ); - return R; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 vS = _mm_set_ps1( S ); - XMMATRIX R; - R.r[0] = _mm_div_ps( r[0], vS ); - R.r[1] = _mm_div_ps( r[1], vS ); - R.r[2] = _mm_div_ps( r[2], vS ); - R.r[3] = _mm_div_ps( r[3], vS ); - return R; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV operator* -( - float S, - FXMMATRIX M -) -{ - XMMATRIX R; - R.r[0] = XMVectorScale( M.r[0], S ); - R.r[1] = XMVectorScale( M.r[1], S ); - R.r[2] = XMVectorScale( M.r[2], S ); - R.r[3] = XMVectorScale( M.r[3], S ); - return R; -} - -/**************************************************************************** - * - * XMFLOAT3X3 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT3X3::XMFLOAT3X3 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - for (size_t Row = 0; Row < 3; Row++) - { - for (size_t Column = 0; Column < 3; Column++) - { - m[Row][Column] = pArray[Row * 3 + Column]; - } - } -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT3X3& XMFLOAT3X3::operator= -( - const XMFLOAT3X3& Float3x3 -) -{ - _11 = Float3x3._11; - _12 = Float3x3._12; - _13 = Float3x3._13; - _21 = Float3x3._21; - _22 = Float3x3._22; - _23 = Float3x3._23; - _31 = Float3x3._31; - _32 = Float3x3._32; - _33 = Float3x3._33; - - return *this; -} - -/**************************************************************************** - * - * XMFLOAT4X3 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT4X3::XMFLOAT4X3 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - - m[0][0] = pArray[0]; - m[0][1] = pArray[1]; - m[0][2] = pArray[2]; - - m[1][0] = pArray[3]; - m[1][1] = pArray[4]; - m[1][2] = pArray[5]; - - m[2][0] = pArray[6]; - m[2][1] = pArray[7]; - m[2][2] = pArray[8]; - - m[3][0] = pArray[9]; - m[3][1] = pArray[10]; - m[3][2] = pArray[11]; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X3& XMFLOAT4X3::operator= -( - const XMFLOAT4X3& Float4x3 -) -{ - XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11); - XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22); - XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33); - - XMStoreFloat4((XMFLOAT4*)&_11, V1); - 
XMStoreFloat4((XMFLOAT4*)&_22, V2); - XMStoreFloat4((XMFLOAT4*)&_33, V3); - - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X3A& XMFLOAT4X3A::operator= -( - const XMFLOAT4X3A& Float4x3 -) -{ - XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11); - XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22); - XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33); - - XMStoreFloat4A((XMFLOAT4A*)&_11, V1); - XMStoreFloat4A((XMFLOAT4A*)&_22, V2); - XMStoreFloat4A((XMFLOAT4A*)&_33, V3); - - return *this; -} - -/**************************************************************************** - * - * XMFLOAT4X4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT4X4::XMFLOAT4X4 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - - m[0][0] = pArray[0]; - m[0][1] = pArray[1]; - m[0][2] = pArray[2]; - m[0][3] = pArray[3]; - - m[1][0] = pArray[4]; - m[1][1] = pArray[5]; - m[1][2] = pArray[6]; - m[1][3] = pArray[7]; - - m[2][0] = pArray[8]; - m[2][1] = pArray[9]; - m[2][2] = pArray[10]; - m[2][3] = pArray[11]; - - m[3][0] = pArray[12]; - m[3][1] = pArray[13]; - m[3][2] = pArray[14]; - m[3][3] = pArray[15]; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X4& XMFLOAT4X4::operator= -( - const XMFLOAT4X4& Float4x4 -) -{ - XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11); - XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21); - XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31); - XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41); - - XMStoreFloat4((XMFLOAT4*)&_11, V1); - XMStoreFloat4((XMFLOAT4*)&_21, V2); - XMStoreFloat4((XMFLOAT4*)&_31, V3); - XMStoreFloat4((XMFLOAT4*)&_41, V4); - - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X4A& XMFLOAT4X4A::operator= -( - const XMFLOAT4X4A& Float4x4 -) -{ - XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11); - XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21); - XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31); - XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41); - - XMStoreFloat4A((XMFLOAT4A*)&_11, V1); - XMStoreFloat4A((XMFLOAT4A*)&_21, V2); - XMStoreFloat4A((XMFLOAT4A*)&_31, V3); - XMStoreFloat4A((XMFLOAT4A*)&_41, V4); - - return *this; -} - +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is NaN +inline bool XM_CALLCONV XMMatrixIsNaN +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest<0x007FFFFFU) { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = vmvnq_u32(vceqq_f32(vX, vX)); + vY = vmvnq_u32(vceqq_f32(vY, vY)); + vZ = vmvnq_u32(vceqq_f32(vZ, vZ)); + vW = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + vX = vorrq_u32(vX,vZ); + vY = vorrq_u32(vY,vW); + vX = vorrq_u32(vX,vY); + // If any tested true, return true + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX,vX); + vY = _mm_cmpneq_ps(vY,vY); + vZ = _mm_cmpneq_ps(vZ,vZ); + vW = _mm_cmpneq_ps(vW,vW); + // Or all the results + vX = _mm_or_ps(vX,vZ); + vY = _mm_or_ps(vY,vW); + vX = _mm_or_ps(vX,vY); + // If any tested true, return true + return (_mm_movemask_ps(vX)!=0); +#else +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XM_CALLCONV XMMatrixIsInfinite +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest==0x7F800000U) { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = vandq_u32(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = vandq_u32(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = vandq_u32(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = vandq_u32(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = vceqq_f32(vTemp1,g_XMInfinity); + vTemp2 = vceqq_f32(vTemp2,g_XMInfinity); + vTemp3 = vceqq_f32(vTemp3,g_XMInfinity); + vTemp4 = vceqq_f32(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = vorrq_u32(vTemp1,vTemp2); + vTemp3 = vorrq_u32(vTemp3,vTemp4); + vTemp1 = vorrq_u32(vTemp1,vTemp3); + // If any are infinity, 
the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1,vTemp2); + vTemp3 = _mm_or_ps(vTemp3,vTemp4); + vTemp1 = _mm_or_ps(vTemp1,vTemp3); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp1)!=0); +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XM_CALLCONV XMMatrixIsIdentity +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + const uint32_t *pWork = (const uint32_t*)(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0]^0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5]^0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10]^0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15]^0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne==0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp1 = vceqq_f32(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = vceqq_f32(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = vceqq_f32(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = vceqq_f32(M.r[3],g_XMIdentityR3); + vTemp1 = vandq_u32(vTemp1,vTemp2); + vTemp3 = vandq_u32(vTemp3,vTemp4); + vTemp1 = vandq_u32(vTemp1,vTemp3); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return ( r == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + vTemp3 = _mm_and_ps(vTemp3,vTemp4); + vTemp1 = _mm_and_ps(vTemp1,vTemp3); + return (_mm_movemask_ps(vTemp1)==0x0f); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; 
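+ // w completes the cached row; each result element below is the dot product of this M1 row with a column of M2.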
+ float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[0][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32( M1.r[0] ); + float32x2_t VH = vget_high_f32( M1.r[0] ); + // Perform the operation on the first row + XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); + XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); + XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[1] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32( vZ, vW ); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = 
_mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( M1.r[0] ); + float32x2_t VH = vget_high_f32( M1.r[0] ); + // Perform the operation on the 
first row + XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); + XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); + XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r3 = vaddq_f32( vZ, vW ); + + // Transpose result + float32x4x2_t P0 = vzipq_f32( r0, r2 ); + float32x4x2_t P1 = vzipq_f32( r1, r3 ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r1 = vX; + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r2 = vX; + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + 
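// (two-level shuffle transpose of the product rows r0..r3, identical to XMMatrixTranspose below) +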
XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranspose +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Original matrix: + // + // m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + XMMATRIX P; + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); + float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMMatrixInverse +( + XMVECTOR* pDeterminant, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + 
V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != nullptr) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX MT = XMMatrixTranspose(M); + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); + + XMVECTOR D0 = _mm_mul_ps(V00,V10); + XMVECTOR D1 = _mm_mul_ps(V01,V11); + XMVECTOR D2 = _mm_mul_ps(V02,V12); + + V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2)); + V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2)); + V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0)); + V02 = 
_mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); + V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + D0 = _mm_sub_ps(D0,V00); + D1 = _mm_sub_ps(D1,V01); + D2 = _mm_sub_ps(D2,V02); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1)); + V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2)); + V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1)); + V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2)); + V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); + + XMVECTOR C0 = _mm_mul_ps(V00,V10); + XMVECTOR C2 = _mm_mul_ps(V01,V11); + XMVECTOR C4 = _mm_mul_ps(V02,V12); + XMVECTOR C6 = _mm_mul_ps(V03,V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2)); + V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3)); + V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2)); + V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3)); + V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + C0 = _mm_sub_ps(C0,V00); + C2 = _mm_sub_ps(C2,V01); + C4 = _mm_sub_ps(C4,V02); + C6 = _mm_sub_ps(C6,V03); + + V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); + V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); + V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3)); + V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); + V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); + V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + XMVECTOR C1 = _mm_sub_ps(C0,V00); + C0 = _mm_add_ps(C0,V00); + XMVECTOR C3 = _mm_add_ps(C2,V01); + C2 = _mm_sub_ps(C2,V01); + XMVECTOR C5 = _mm_sub_ps(C4,V02); + C4 = _mm_add_ps(C4,V02); + XMVECTOR C7 = _mm_add_ps(C6,V03); + C6 = _mm_sub_ps(C6,V03); + + C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); + C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); + C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); + C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); + C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0)); + C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0)); + C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0)); + C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0)); + // Get the determinate + XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); + if (pDeterminant != nullptr) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne,vTemp); + XMMATRIX mResult; + mResult.r[0] = 
_mm_mul_ps(C0,vTemp); + mResult.r[1] = _mm_mul_ps(C2,vTemp); + mResult.r[2] = _mm_mul_ps(C4,vTemp); + mResult.r[3] = _mm_mul_ps(C6,vTemp); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMMatrixDeterminant +( + FXMMATRIX M +) +{ + static const XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f}; + + XMVECTOR V0 = XMVectorSwizzle(M.r[2]); + XMVECTOR V1 = XMVectorSwizzle(M.r[3]); + XMVECTOR V2 = XMVectorSwizzle(M.r[2]); + XMVECTOR V3 = XMVectorSwizzle(M.r[3]); + XMVECTOR V4 = XMVectorSwizzle(M.r[2]); + XMVECTOR V5 = XMVectorSwizzle(M.r[3]); + + XMVECTOR P0 = XMVectorMultiply(V0, V1); + XMVECTOR P1 = XMVectorMultiply(V2, V3); + XMVECTOR P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorSwizzle(M.r[2]); + V1 = XMVectorSwizzle(M.r[3]); + V2 = XMVectorSwizzle(M.r[2]); + V3 = XMVectorSwizzle(M.r[3]); + V4 = XMVectorSwizzle(M.r[2]); + V5 = XMVectorSwizzle(M.r[3]); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorSwizzle(M.r[1]); + V1 = XMVectorSwizzle(M.r[1]); + V2 = XMVectorSwizzle(M.r[1]); + + XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); + XMVECTOR R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + return XMVector4Dot(S, R); +} + +#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM3_DECOMP_EPSILON 0.0001f + +_Use_decl_annotations_ +inline bool XM_CALLCONV XMMatrixDecompose +( + XMVECTOR *outScale, + XMVECTOR *outRotQuat, + XMVECTOR *outTrans, + FXMMATRIX M +) +{ + static const XMVECTOR *pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + assert( outScale != nullptr ); + assert( outRotQuat != nullptr ); + assert( outTrans != nullptr ); + + // Get the translation + outTrans[0] = M.r[3]; + + XMVECTOR *ppvBasis[3]; + XMMATRIX matTemp; + ppvBasis[0] = &matTemp.r[0]; + ppvBasis[1] = &matTemp.r[1]; + ppvBasis[2] = &matTemp.r[2]; + + matTemp.r[0] = M.r[0]; + matTemp.r[1] = M.r[1]; + matTemp.r[2] = M.r[2]; + matTemp.r[3] = g_XMIdentityR3.v; + + float *pfScales = (float *)outScale; + + size_t a, b, c; + XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0])); + XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0])); + XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0])); + pfScales[3] = 0.f; + + XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) + + if(pfScales[a] < XM3_DECOMP_EPSILON) + { + ppvBasis[a][0] = pvCanonicalBasis[a][0]; + } + ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); + + if(pfScales[b] < XM3_DECOMP_EPSILON) + { + size_t aa, bb, cc; + float fAbsX, fAbsY, fAbsZ; + + fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); + fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); + fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); + + XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) + + ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]); + } + + ppvBasis[b][0] 
= XMVector3Normalize(ppvBasis[b][0]); + + if(pfScales[c] < XM3_DECOMP_EPSILON) + { + ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]); + } + + ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); + + float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); + + // use Kramer's rule to check for handedness of coordinate system + if(fDet < 0.0f) + { + // switch coordinate system by negating the scale and inverting the basis vector on the x-axis + pfScales[a] = -pfScales[a]; + ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); + + fDet = -fDet; + } + + fDet -= 1.0f; + fDet *= fDet; + + if(XM3_DECOMP_EPSILON < fDet) + { + // Non-SRT matrix encountered + return false; + } + + // generate the quaternion from the matrix + outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); + return true; +} + +#undef XM3_DECOMP_EPSILON +#undef XM3RANKDECOMPOSE + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixIdentity() +{ + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixSet +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + XMMATRIX M; +#if defined(_XM_NO_INTRINSICS_) + M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; + M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; + M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; + M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; +#else + M.r[0] = XMVectorSet(m00, m01, m02, m03); + M.r[1] = XMVectorSet(m10, m11, m12, m13); + M.r[2] = XMVectorSet(m20, m21, m22, m23); + M.r[3] = XMVectorSet(m30, m31, m32, m33); +#endif + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslation +( + float OffsetX, + float OffsetY, + float OffsetZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f ); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector +( + FXMVECTOR Offset +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + 
M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect( g_XMIdentityR3.v, Offset, g_XMSelect1110.v ); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ScaleX, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ScaleY, Zero, 1 ); + M.r[2] = vsetq_lane_f32( ScaleZ, Zero, 2 ); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX ); + M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 ); + M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector +( + FXMVECTOR Scale +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vandq_u32(Scale,g_XMMaskX); + M.r[1] = vandq_u32(Scale,g_XMMaskY); + M.r[2] = vandq_u32(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale,g_XMMaskX); + M.r[1] = _mm_and_ps(Scale,g_XMMaskY); + M.r[2] = _mm_and_ps(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T1 = vsetq_lane_f32( fCosAngle, Zero, 1 ); + T1 = vsetq_lane_f32( fSinAngle, T1, 2 ); + + XMVECTOR T2 = vsetq_lane_f32( -fSinAngle, Zero, 1 ); + T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); + + XMMATRIX M; + M.r[0] = 
g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos,g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); + T0 = vsetq_lane_f32( -fSinAngle, T0, 2 ); + + XMVECTOR T2 = vsetq_lane_f32( fSinAngle, Zero, 0 ); + T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2)); + // x = cos,y = 0,z = -sin, w = 0 + vSin = _mm_mul_ps(vSin,g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationZ +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); + T0 = vsetq_lane_f32( fSinAngle, T0, 1 ); + + XMVECTOR T1 = vsetq_lane_f32( -fSinAngle, Zero, 0 ); + T1 = vsetq_lane_f32( fCosAngle, T1, 1 ); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = T1; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif 
defined(_XM_SSE_INTRINSICS_)
+    float SinAngle;
+    float CosAngle;
+    XMScalarSinCos(&SinAngle, &CosAngle, Angle);
+
+    XMVECTOR vSin = _mm_set_ss(SinAngle);
+    XMVECTOR vCos = _mm_set_ss(CosAngle);
+    // x = cos,y = sin,z = 0, w = 0
+    vCos = _mm_unpacklo_ps(vCos,vSin);
+    XMMATRIX M;
+    M.r[0] = vCos;
+    // x = sin,y = cos,z = 0, w = 0
+    vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1));
+    // x = -sin,y = cos,z = 0, w = 0
+    vCos = _mm_mul_ps(vCos,g_XMNegateX);
+    M.r[1] = vCos;
+    M.r[2] = g_XMIdentityR2;
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw
+(
+    float Pitch,
+    float Yaw,
+    float Roll
+)
+{
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    return XMMatrixRotationRollPitchYawFromVector(Angles);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
+)
+{
+    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return XMMatrixRotationQuaternion(Q);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);
+
+    XMVECTOR C2 = XMVectorSplatZ(A);
+    XMVECTOR C1 = XMVectorSplatY(A);
+    XMVECTOR C0 = XMVectorSplatX(A);
+
+    XMVECTOR N0 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(NormalAxis);
+    XMVECTOR N1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(NormalAxis);
+
+    XMVECTOR V0 = XMVectorMultiply(C2, N0);
+    V0 = XMVectorMultiply(V0, N1);
+
+    XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis);
+    R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1);
+
+    XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0);
+    XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0);
+
+    V0 = XMVectorSelect(A, R0, g_XMSelect1110.v);
+    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(R1, R2);
+    XMVECTOR V2 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(R1, R2);
+
+    XMMATRIX M;
+    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(V0, V1);
+    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(V0, V1);
+    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(V0, V2);
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle);
+    XMVECTOR C1 = _mm_set_ps1(fCosAngle);
+    XMVECTOR C0 = _mm_set_ps1(fSinAngle);
+
+    XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1));
+    XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2));
+
+    XMVECTOR V0 = _mm_mul_ps(C2, N0);
+    V0 = _mm_mul_ps(V0, N1);
+
+    XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis);
+    R0 = _mm_mul_ps(R0, NormalAxis);
+    R0 = _mm_add_ps(R0, C1);
+
+    XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis);
+    R1 = _mm_add_ps(R1, V0);
+    XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis);
+    R2 = _mm_sub_ps(V0,R2);
+
+    V0 = _mm_and_ps(R0,g_XMMask3);
+    XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0));
+    V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1));
+    XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1));
+    V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0));
+
+    R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0));
+    R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0));
+
+    XMMATRIX M;
+    M.r[0] = R2;
+
+    R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1));
+    R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2));
+    M.r[1] = R2;
+
+    V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0));
+    M.r[2] = V2;
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+#endif
+}
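+
+// XMMatrixRotationNormal is the Rodrigues rotation formula,
+// M = (cos a)I + (1 - cos a) n n^T + (sin a)[n]x, laid out for DirectXMath's
+// row-vector (v' = v*M) convention; the axis must already be unit length,
+// which is what distinguishes it from XMMatrixRotationAxis below.
+// Usage sketch (the axis and angle are hypothetical example values):
+//
+//     XMVECTOR n = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f); // unit +Y axis
+//     XMMATRIX r = XMMatrixRotationNormal(n, XM_PIDIV2); // quarter turn about n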
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+)
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    return XMMatrixRotationNormal(Normal, Angle);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion
+(
+    FXMVECTOR Quaternion
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};
+
+    XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion);
+    XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0);
+
+    XMVECTOR V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W>(Q1, Constant1110.v);
+    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W>(Q1, Constant1110.v);
+    XMVECTOR R0 = XMVectorSubtract(Constant1110, V0);
+    R0 = XMVectorSubtract(R0, V1);
+
+    V0 = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Quaternion);
+    V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Q0);
+    V0 = XMVectorMultiply(V0, V1);
+
+    V1 = XMVectorSplatW(Quaternion);
+    XMVECTOR V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(Q0);
+    V1 = XMVectorMultiply(V1, V2);
+
+    XMVECTOR R1 = XMVectorAdd(V0, V1);
+    XMVECTOR R2 = XMVectorSubtract(V0, V1);
+
+    V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z>(R1, R2);
+    V1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z>(R1, R2);
+
+    XMMATRIX M;
+    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(R0, V0);
+    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(R0, V0);
+    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(R0, V1);
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};
+
+    XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion);
+    XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0);
+
+    XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1));
+    V0 = _mm_and_ps(V0,g_XMMask3);
+    XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2));
+    V1 = _mm_and_ps(V1,g_XMMask3);
+    XMVECTOR R0 = _mm_sub_ps(Constant1110,V0);
+    R0 = _mm_sub_ps(R0, V1);
+
+    V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0));
+    V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2));
+    V0 = _mm_mul_ps(V0, V1);
+
+    V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3));
+    XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1));
+    V1 = _mm_mul_ps(V1, V2);
+
+    XMVECTOR R1 = _mm_add_ps(V0, V1);
+    XMVECTOR R2 = _mm_sub_ps(V0, V1);
+
+    V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1));
+    V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0));
+    V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0));
+    V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0));
+
+    Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0));
+    Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0));
+
+    XMMATRIX M;
+    M.r[0] = Q1;
+
+    Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1));
+    Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2));
+    M.r[1] = Q1;
+
+    Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0));
+    M.r[2] = Q1;
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D
+(
+    FXMVECTOR ScalingOrigin,
+    float ScalingOrientation,
+    FXMVECTOR Scaling,
+    FXMVECTOR RotationOrigin,
+    float Rotation,
+    GXMVECTOR Translation
+)
+{
+    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
+    //         MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+    XMVECTOR VScalingOrigin = 
XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = 
XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect +( + FXMVECTOR ReflectionPlane +) +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f}; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + + XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) +{ + static const XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1}; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + 
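+// The look-at helpers only differ in how the view direction is formed; both
+// reduce to XMMatrixLookToLH. Usage sketch (camera values below are
+// hypothetical examples):
+//
+//     XMVECTOR eye   = XMVectorSet(0.0f, 2.0f, -5.0f, 1.0f);
+//     XMVECTOR focus = XMVectorSet(0.0f, 0.0f,  0.0f, 1.0f);
+//     XMVECTOR up    = XMVectorSet(0.0f, 1.0f,  0.0f, 0.0f);
+//     XMMATRIX view  = XMMatrixLookAtLH(eye, focus, up);
+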
+//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = 
_mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = 
FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + 
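+    // Note: fRange = FarZ / (NearZ - FarZ) is negative in this right-handed
+    // form; combined with w' = -z it maps view-space z = -NearZ to depth 0 and
+    // z = -FarZ to depth 1, the usual [0,1] clip range.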
XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop 
+ ViewBottom) * ReciprocalHeight, + fRange, + 1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / 
ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ-NearZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ-FarZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = 
_mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + 
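+    // Note: the off-center orthographic form maps x in [ViewLeft, ViewRight]
+    // to [-1, 1], y in [ViewBottom, ViewTop] to [-1, 1], and (in this
+    // right-handed variant) z = -NearZ..-FarZ to depth 0..1.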
M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#endif +} + +#pragma prefast(pop) + +/**************************************************************************** + * + * XMMATRIX operators and methods + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMMATRIX::XMMATRIX +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + r[0] = XMVectorSet(m00, m01, m02, m03); + r[1] = XMVectorSet(m10, m11, m12, m13); + r[2] = XMVectorSet(m20, m21, m22, m23); + r[3] = XMVectorSet(m30, m31, m32, m33); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX::XMMATRIX +( + const float* pArray +) +{ + assert( pArray != nullptr ); + r[0] = XMLoadFloat4((const XMFLOAT4*)pArray); + r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4)); + r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8)); + r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12)); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator- () const +{ + XMMATRIX R; + R.r[0] = XMVectorNegate( r[0] ); + R.r[1] = XMVectorNegate( r[1] ); + R.r[2] = XMVectorNegate( r[2] ); + R.r[3] = XMVectorNegate( r[3] ); + 
return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) +{ + r[0] = XMVectorAdd( r[0], M.r[0] ); + r[1] = XMVectorAdd( r[1], M.r[1] ); + r[2] = XMVectorAdd( r[2], M.r[2] ); + r[3] = XMVectorAdd( r[3], M.r[3] ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) +{ + r[0] = XMVectorSubtract( r[0], M.r[0] ); + r[1] = XMVectorSubtract( r[1], M.r[1] ); + r[2] = XMVectorSubtract( r[2], M.r[2] ); + r[3] = XMVectorSubtract( r[3], M.r[3] ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) +{ + *this = XMMatrixMultiply( *this, M ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator*= (float S) +{ + r[0] = XMVectorScale( r[0], S ); + r[1] = XMVectorScale( r[1], S ); + r[2] = XMVectorScale( r[2], S ); + r[3] = XMVectorScale( r[3], S ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator/= (float S) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate( S ); + r[0] = XMVectorDivide( r[0], vS ); + r[1] = XMVectorDivide( r[1], vS ); + r[2] = XMVectorDivide( r[2], vS ); + r[3] = XMVectorDivide( r[3], vS ); + return *this; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32( S ); + float32x2_t R0 = vrecpe_f32( vS ); + float32x2_t S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + float32x4_t Reciprocal = vcombine_u32(R0, R0); + r[0] = vmulq_f32( r[0], Reciprocal ); + r[1] = vmulq_f32( r[1], Reciprocal ); + r[2] = vmulq_f32( r[2], Reciprocal ); + r[3] = vmulq_f32( r[3], Reciprocal ); + return *this; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1( S ); + r[0] = _mm_div_ps( r[0], vS ); + r[1] = _mm_div_ps( r[1], vS ); + r[2] = _mm_div_ps( r[2], vS ); + r[3] = _mm_div_ps( r[3], vS ); + return *this; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const +{ + XMMATRIX R; + R.r[0] = XMVectorAdd( r[0], M.r[0] ); + R.r[1] = XMVectorAdd( r[1], M.r[1] ); + R.r[2] = XMVectorAdd( r[2], M.r[2] ); + R.r[3] = XMVectorAdd( r[3], M.r[3] ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const +{ + XMMATRIX R; + R.r[0] = XMVectorSubtract( r[0], M.r[0] ); + R.r[1] = XMVectorSubtract( r[1], M.r[1] ); + R.r[2] = XMVectorSubtract( r[2], M.r[2] ); + R.r[3] = XMVectorSubtract( r[3], M.r[3] ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const +{ + XMMATRIX R; + R.r[0] = XMVectorScale( r[0], S ); + R.r[1] = XMVectorScale( r[1], S ); + R.r[2] = XMVectorScale( r[2], S ); + R.r[3] = XMVectorScale( r[3], S ); + return R; +} + 
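+// The scalar operators apply S to every row with XMVectorScale, so all sixteen
+// elements are affected, including row 3. Usage sketch (hypothetical values):
+//
+//     XMMATRIX a = XMMatrixScaling(2.0f, 2.0f, 2.0f);
+//     XMMATRIX b = a * 0.5f;  // every element halves, including the 1.0f at m33
+//     XMMATRIX c = 0.5f * a;  // the free operator*(float, FXMMATRIX) below matches
+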
+//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate( S ); + XMMATRIX R; + R.r[0] = XMVectorDivide( r[0], vS ); + R.r[1] = XMVectorDivide( r[1], vS ); + R.r[2] = XMVectorDivide( r[2], vS ); + R.r[3] = XMVectorDivide( r[3], vS ); + return R; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32( S ); + float32x2_t R0 = vrecpe_f32( vS ); + float32x2_t S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + float32x4_t Reciprocal = vcombine_u32(R0, R0); + XMMATRIX R; + R.r[0] = vmulq_f32( r[0], Reciprocal ); + R.r[1] = vmulq_f32( r[1], Reciprocal ); + R.r[2] = vmulq_f32( r[2], Reciprocal ); + R.r[3] = vmulq_f32( r[3], Reciprocal ); + return R; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1( S ); + XMMATRIX R; + R.r[0] = _mm_div_ps( r[0], vS ); + R.r[1] = _mm_div_ps( r[1], vS ); + R.r[2] = _mm_div_ps( r[2], vS ); + R.r[3] = _mm_div_ps( r[3], vS ); + return R; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV operator* +( + float S, + FXMMATRIX M +) +{ + XMMATRIX R; + R.r[0] = XMVectorScale( M.r[0], S ); + R.r[1] = XMVectorScale( M.r[1], S ); + R.r[2] = XMVectorScale( M.r[2], S ); + R.r[3] = XMVectorScale( M.r[3], S ); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT3X3& XMFLOAT3X3::operator= +( + const XMFLOAT3X3& Float3x3 +) +{ + _11 = Float3x3._11; + _12 = Float3x3._12; + _13 = Float3x3._13; + _21 = Float3x3._21; + _22 = Float3x3._22; + _23 = Float3x3._23; + _31 = Float3x3._31; + _32 = Float3x3._32; + _33 = Float3x3._33; + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3& XMFLOAT4X3::operator= +( + const XMFLOAT4X3& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + 
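+    // Note: the 12 floats of the 4x3 are copied as three overlapping XMFLOAT4
+    // loads/stores anchored at _11, _22, and _33; each 4-float block spans the
+    // tail of one row and the head of the next, covering the struct exactly.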
XMStoreFloat4((XMFLOAT4*)&_22, V2); + XMStoreFloat4((XMFLOAT4*)&_33, V3); + + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3A& XMFLOAT4X3A::operator= +( + const XMFLOAT4X3A& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_22, V2); + XMStoreFloat4A((XMFLOAT4A*)&_33, V3); + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4& XMFLOAT4X4::operator= +( + const XMFLOAT4X4& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + XMStoreFloat4((XMFLOAT4*)&_21, V2); + XMStoreFloat4((XMFLOAT4*)&_31, V3); + XMStoreFloat4((XMFLOAT4*)&_41, V4); + + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4A& XMFLOAT4X4A::operator= +( + const XMFLOAT4X4A& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_21, V2); + XMStoreFloat4A((XMFLOAT4A*)&_31, V3); + XMStoreFloat4A((XMFLOAT4A*)&_41, V4); + + return *this; +} + diff --git a/Inc/DirectXMathMisc.inl b/Inc/DirectXMathMisc.inl index ed6d423..69acff3 100644 --- a/Inc/DirectXMathMisc.inl +++ b/Inc/DirectXMathMisc.inl @@ -1,2512 +1,2512 @@ -//------------------------------------------------------------------------------------- -// DirectXMathMisc.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Quaternion - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionEqual -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - return XMVector4Equal(Q1, Q2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionNotEqual -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - return XMVector4NotEqual(Q1, Q2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionIsNaN -( - FXMVECTOR Q -) -{ - return XMVector4IsNaN(Q); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionIsInfinite -( - FXMVECTOR Q -) -{ - return XMVector4IsInfinite(Q); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionIsIdentity -( - FXMVECTOR Q -) -{ - return XMVector4Equal(Q, g_XMIdentityR3.v); -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionDot -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - return XMVector4Dot(Q1, Q2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionMultiply -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) - - // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), - // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), - // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), - // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result = { - (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), - (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), - (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), - (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) }; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; - static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; - static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; - - float32x2_t Q2L = vget_low_f32(Q2); - float32x2_t Q2H = 
vget_high_f32(Q2); - - float32x4_t Q2X = vdupq_lane_f32( Q2L, 0 ); - float32x4_t Q2Y = vdupq_lane_f32( Q2L, 1 ); - float32x4_t Q2Z = vdupq_lane_f32( Q2H, 0 ); - XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); - - // Mul by Q1WZYX - float32x4_t vTemp = vrev64q_f32(Q1); - vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) ); - Q2X = vmulq_f32(Q2X,vTemp); - vResult = vmlaq_f32( vResult, Q2X, ControlWZYX ); - - // Mul by Q1ZWXY - vTemp = vrev64q_u32(vTemp); - Q2Y = vmulq_f32(Q2Y,vTemp); - vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); - - // Mul by Q1YXWZ - vTemp = vrev64q_u32(vTemp); - vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); - Q2Z = vmulq_f32(Q2Z,vTemp); - vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; - static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; - static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; - // Copy to SSE registers and use as few as possible for x86 - XMVECTOR Q2X = Q2; - XMVECTOR Q2Y = Q2; - XMVECTOR Q2Z = Q2; - XMVECTOR vResult = Q2; - // Splat with one instruction - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3)); - Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0)); - Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1)); - Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2)); - // Retire Q1 and perform Q1*Q2W - vResult = _mm_mul_ps(vResult,Q1); - XMVECTOR Q1Shuffle = Q1; - // Shuffle the copies of Q1 - Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); - // Mul by Q1WZYX - Q2X = _mm_mul_ps(Q2X,Q1Shuffle); - Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1)); - // Flip the signs on y and z - Q2X = _mm_mul_ps(Q2X,ControlWZYX); - // Mul by Q1ZWXY - Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle); - Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); - // Flip the signs on z and w - Q2Y = _mm_mul_ps(Q2Y,ControlZWXY); - // Mul by Q1YXWZ - Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle); - vResult = _mm_add_ps(vResult,Q2X); - // Flip the signs on x and w - Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ); - Q2Y = _mm_add_ps(Q2Y,Q2Z); - vResult = _mm_add_ps(vResult,Q2Y); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq -( - FXMVECTOR Q -) -{ - return XMVector4LengthSq(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength -( - FXMVECTOR Q -) -{ - return XMVector4ReciprocalLength(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionLength -( - FXMVECTOR Q -) -{ - return XMVector4Length(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst -( - FXMVECTOR Q -) -{ - return XMVector4NormalizeEst(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionNormalize -( - FXMVECTOR Q -) -{ - return XMVector4Normalize(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionConjugate -( - FXMVECTOR Q -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result = { - -Q.vector4_f32[0], - -Q.vector4_f32[1], - -Q.vector4_f32[2], - Q.vector4_f32[3] - }; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - 
static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; - return vmulq_f32(Q, NegativeOne3.v ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; - return _mm_mul_ps(Q,NegativeOne3); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionInverse -( - FXMVECTOR Q -) -{ - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR L = XMVector4LengthSq(Q); - XMVECTOR Conjugate = XMQuaternionConjugate(Q); - - XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); - - XMVECTOR Result = XMVectorDivide(Conjugate, L); - - Result = XMVectorSelect(Result, Zero, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionLn -( - FXMVECTOR Q -) -{ - static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; - - XMVECTOR QW = XMVectorSplatW(Q); - XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); - - XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); - - XMVECTOR Theta = XMVectorACos(QW); - XMVECTOR SinTheta = XMVectorSin(Theta); - - XMVECTOR S = XMVectorDivide(Theta,SinTheta); - - XMVECTOR Result = XMVectorMultiply(Q0, S); - Result = XMVectorSelect(Q0, Result, ControlW); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionExp -( - FXMVECTOR Q -) -{ - XMVECTOR Theta = XMVector3Length(Q); - - XMVECTOR SinTheta, CosTheta; - XMVectorSinCos(&SinTheta, &CosTheta, Theta); - - XMVECTOR S = XMVectorDivide(SinTheta, Theta); - - XMVECTOR Result = XMVectorMultiply(Q, S); - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); - Result = XMVectorSelect(Result, Q, Control); - - Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSlerp -( - FXMVECTOR Q0, - FXMVECTOR Q1, - float t -) -{ - XMVECTOR T = XMVectorReplicate(t); - return XMQuaternionSlerpV(Q0, Q1, T); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV -( - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR T -) -{ - assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); - - // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) - -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; - - XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR Control = XMVectorLess(CosOmega, Zero); - XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); - - CosOmega = XMVectorMultiply(CosOmega, Sign); - - Control = XMVectorLess(CosOmega, OneMinusEpsilon); - - XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); - SinOmega = XMVectorSqrt(SinOmega); - - XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); - - XMVECTOR SignMask = XMVectorSplatSignMask(); - XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); - SignMask = XMVectorShiftLeft(SignMask, Zero, 3); - V01 = 
XMVectorXorInt(V01, SignMask); - V01 = XMVectorAdd(g_XMIdentityR0.v, V01); - - XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); - - XMVECTOR S0 = XMVectorMultiply(V01, Omega); - S0 = XMVectorSin(S0); - S0 = XMVectorMultiply(S0, InvSinOmega); - - S0 = XMVectorSelect(V01, S0, Control); - - XMVECTOR S1 = XMVectorSplatY(S0); - S0 = XMVectorSplatX(S0); - - S1 = XMVectorMultiply(S1, Sign); - - XMVECTOR Result = XMVectorMultiply(Q0, S0); - Result = XMVectorMultiplyAdd(Q1, S1, Result); - - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; - static const XMVECTORU32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000}; - - XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR Control = XMVectorLess(CosOmega, Zero); - XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); - - CosOmega = _mm_mul_ps(CosOmega, Sign); - - Control = XMVectorLess(CosOmega, OneMinusEpsilon); - - XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega); - SinOmega = _mm_sub_ps(g_XMOne,SinOmega); - SinOmega = _mm_sqrt_ps(SinOmega); - - XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); - - XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1)); - V01 = _mm_and_ps(V01,g_XMMaskXY); - V01 = _mm_xor_ps(V01,SignMask2); - V01 = _mm_add_ps(g_XMIdentityR0, V01); - - XMVECTOR S0 = _mm_mul_ps(V01, Omega); - S0 = XMVectorSin(S0); - S0 = _mm_div_ps(S0, SinOmega); - - S0 = XMVectorSelect(V01, S0, Control); - - XMVECTOR S1 = XMVectorSplatY(S0); - S0 = XMVectorSplatX(S0); - - S1 = _mm_mul_ps(S1, Sign); - XMVECTOR Result = _mm_mul_ps(Q0, S0); - S1 = _mm_mul_ps(S1, Q1); - Result = _mm_add_ps(Result,S1); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSquad -( - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR Q2, - GXMVECTOR Q3, - float t -) -{ - XMVECTOR T = XMVectorReplicate(t); - return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSquadV -( - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR Q2, - GXMVECTOR Q3, - HXMVECTOR T -) -{ - assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) ); - - XMVECTOR TP = T; - const XMVECTOR Two = XMVectorSplatConstant(2, 0); - - XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); - XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); - - TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); - TP = XMVectorMultiply(TP, Two); - - XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); - - return Result; -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV XMQuaternionSquadSetup -( - XMVECTOR* pA, - XMVECTOR* pB, - XMVECTOR* pC, - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR Q2, - GXMVECTOR Q3 -) -{ - assert(pA); - assert(pB); - assert(pC); - - XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); - XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); - XMVECTOR SQ2 = XMVectorNegate(Q2); - - XMVECTOR Control1 = XMVectorLess(LS12, LD12); - SQ2 = XMVectorSelect(Q2, SQ2, Control1); - - XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); - XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); - XMVECTOR SQ0 = XMVectorNegate(Q0); - - XMVECTOR LS23 = 
XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
- XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
- XMVECTOR SQ3 = XMVectorNegate(Q3);
-
- XMVECTOR Control0 = XMVectorLess(LS01, LD01);
- XMVECTOR Control2 = XMVectorLess(LS23, LD23);
-
- SQ0 = XMVectorSelect(Q0, SQ0, Control0);
- SQ3 = XMVectorSelect(Q3, SQ3, Control2);
-
- XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
- XMVECTOR InvQ2 = XMQuaternionInverse(SQ2);
-
- XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
- XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
- XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
- XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));
-
- const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2);
-
- XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
- XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
- ExpQ02 = XMQuaternionExp(ExpQ02);
- ExpQ13 = XMQuaternionExp(ExpQ13);
-
- *pA = XMQuaternionMultiply(Q1, ExpQ02);
- *pB = XMQuaternionMultiply(SQ2, ExpQ13);
- *pC = SQ2;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric
-(
- FXMVECTOR Q0,
- FXMVECTOR Q1,
- FXMVECTOR Q2,
- float f,
- float g
-)
-{
- float s = f + g;
-
- XMVECTOR Result;
- if ((s < 0.00001f) && (s > -0.00001f))
- {
- Result = Q0;
- }
- else
- {
- XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s);
- XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s);
-
- Result = XMQuaternionSlerp(Q01, Q02, g / s);
- }
-
- return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV
-(
- FXMVECTOR Q0,
- FXMVECTOR Q1,
- FXMVECTOR Q2,
- GXMVECTOR F,
- HXMVECTOR G
-)
-{
- assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
- assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );
-
- const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16);
-
- XMVECTOR S = XMVectorAdd(F, G);
-
- XMVECTOR Result;
- if (XMVector4InBounds(S, Epsilon))
- {
- Result = Q0;
- }
- else
- {
- XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S);
- XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S);
- XMVECTOR GS = XMVectorReciprocal(S);
- GS = XMVectorMultiply(G, GS);
-
- Result = XMQuaternionSlerpV(Q01, Q02, GS);
- }
-
- return Result;
-}
-
-//------------------------------------------------------------------------------
-// Transformation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionIdentity()
-{
- return g_XMIdentityR3.v;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw
-(
- float Pitch,
- float Yaw,
- float Roll
-)
-{
- XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
- XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
- return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
-(
- FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
-)
-{
- static const XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};
-
- XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
-
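- // The six permutes below gather the half-angle terms so that each lane is a
- // product of three sin/cos factors (one per axis); Sign then applies the
- // +/- pattern of the roll/pitch/yaw quaternion composition.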
- XMVECTOR SinAngles, CosAngles;
- XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
-
- XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
- XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
- XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
- XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
- XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
- XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
-
- XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
- XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
- Q1 = XMVectorMultiply(Q1, Y1);
- Q0 = XMVectorMultiply(Q0, R0);
- XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
-
- return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
-(
- FXMVECTOR NormalAxis,
- float Angle
-)
-{
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
- XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
-
- float SinV, CosV;
- XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
-
- XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
- return XMVectorMultiply(N, Scale);
-#elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
- N = _mm_or_ps(N,g_XMIdentityR3);
- XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
- XMVECTOR vSine;
- XMVECTOR vCosine;
- XMVectorSinCos(&vSine,&vCosine,Scale);
- Scale = _mm_and_ps(vSine,g_XMMask3);
- vCosine = _mm_and_ps(vCosine,g_XMMaskW);
- Scale = _mm_or_ps(Scale,vCosine);
- N = _mm_mul_ps(N,Scale);
- return N;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
-(
- FXMVECTOR Axis,
- float Angle
-)
-{
- assert(!XMVector3Equal(Axis, XMVectorZero()));
- assert(!XMVector3IsInfinite(Axis));
-
- XMVECTOR Normal = XMVector3Normalize(Axis);
- XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
- return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix
-(
- FXMMATRIX M
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
- XMVECTORF32 q;
- float r22 = M.m[2][2];
- if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2
- {
- float dif10 = M.m[1][1] - M.m[0][0];
- float omr22 = 1.f - r22;
- if (dif10 <= 0.f) // x^2 >= y^2
- {
- float fourXSqr = omr22 - dif10;
- float inv4x = 0.5f / sqrtf(fourXSqr);
- q.f[0] = fourXSqr*inv4x;
- q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x;
- q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x;
- q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x;
- }
- else // y^2 >= x^2
- {
- float fourYSqr = omr22 + dif10;
- float inv4y = 0.5f / sqrtf(fourYSqr);
- q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y;
- q.f[1] = fourYSqr*inv4y;
- q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y;
- q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y;
- }
- }
- else // z^2 + w^2 >= x^2 + y^2
- {
- float sum10 = M.m[1][1] + M.m[0][0];
- float opr22 = 1.f + r22;
- if (sum10 <= 0.f) // z^2 >= w^2
- {
- float fourZSqr = opr22 - sum10;
- float inv4z = 0.5f / sqrtf(fourZSqr);
- q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z;
- q.f[1] = (M.m[1][2] + M.m[2][1])*inv4z;
- q.f[2] = fourZSqr*inv4z;
- q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z;
- }
- else // w^2 >= z^2
- {
- float fourWSqr = opr22 + sum10;
- float inv4w = 0.5f / sqrtf(fourWSqr);
- q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w;
- q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w;
- q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w;
- q.f[3] = fourWSqr*inv4w;
- }
- }
- return q.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
- static const XMVECTORF32 XMPMMP =
{+1.0f, -1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; - static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 }; - static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; - - XMVECTOR r0 = M.r[0]; - XMVECTOR r1 = M.r[1]; - XMVECTOR r2 = M.r[2]; - - XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0); - XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1); - XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0); - - // x^2 >= y^2 equivalent to r11 - r00 <= 0 - XMVECTOR r11mr00 = vsubq_f32(r11, r00); - XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero); - - // z^2 >= w^2 equivalent to r11 + r00 <= 0 - XMVECTOR r11pr00 = vaddq_f32(r11, r00); - XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero); - - // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 - XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); - - // (4*x^2, 4*y^2, 4*z^2, 4*w^2) - XMVECTOR t0 = vmulq_f32( XMPMMP, r00 ); - XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 ); - x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 ); - x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne ); - - // (r01, r02, r12, r11) - t0 = vextq_f32(r0, r0, 1); - XMVECTOR t1 = vextq_f32(r1, r1, 1); - t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) ); - - // (r10, r20, r21, r10) - t1 = vextq_f32(r2, r2, 3); - XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 ); - t1 = vbslq_f32( Select0110, t1, r10 ); - - // (4*x*y, 4*x*z, 4*y*z, unused) - XMVECTOR xyxzyz = vaddq_f32(t0, t1); - - // (r21, r20, r10, r10) - t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) ); - - // (r12, r02, r01, r12) - XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) ); - XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 ); - t1 = vbslq_f32( Select0110, t2, t3 ); - - // (4*x*w, 4*y*w, 4*z*w, unused) - XMVECTOR xwywzw = vsubq_f32(t0, t1); - xwywzw = vmulq_f32(XMMPMP, xwywzw); - - // (4*x*x, 4*x*y, 4*x*z, 4*x*w) - t0 = vextq_f32( xyxzyz, xyxzyz, 3 ); - t1 = vbslq_f32( Select0110, t0, x2y2z2w2 ); - t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 ); - XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 ); - - // (4*y*x, 4*y*y, 4*y*z, 4*y*w) - t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 ); - t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 ); - XMVECTOR tensor1 = vbslq_f32( g_XMSelect1110, t0, t1 ); - - // (4*z*x, 4*z*y, 4*z*z, 4*z*w) - t0 = vextq_f32(xyxzyz, xyxzyz, 1); - t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) ); - XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 ); - - // (4*w*x, 4*w*y, 4*w*z, 4*w*w) - XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 ); - - // Select the row of the tensor-product matrix that has the largest - // magnitude. - t0 = vbslq_f32( x2gey2, tensor0, tensor1 ); - t1 = vbslq_f32( z2gew2, tensor2, tensor3 ); - t2 = vbslq_f32( x2py2gez2pw2, t0, t1 ); - - // Normalize the row. No division by zero is possible because the - // quaternion is unit-length (and the row is a nonzero multiple of - // the quaternion). 
- t0 = XMVector4Length(t2); - return XMVectorDivide(t2, t0); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; - - XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) - XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) - XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) - - // (r00, r00, r00, r00) - XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0)); - // (r11, r11, r11, r11) - XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1)); - // (r22, r22, r22, r22) - XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2)); - - // x^2 >= y^2 equivalent to r11 - r00 <= 0 - // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) - XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); - XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); - - // z^2 >= w^2 equivalent to r11 + r00 <= 0 - // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) - XMVECTOR r11pr00 = _mm_add_ps(r11, r00); - XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); - - // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 - XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); - - // (+r00, -r00, -r00, +r00) - XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00); - - // (-r11, +r11, -r11, +r11) - XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); - - // (-r22, -r22, +r22, +r22) - XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22); - - // (4*x^2, 4*y^2, 4*z^2, 4*w^2) - XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1); - x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2); - x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne); - - // (r01, r02, r12, r11) - t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1)); - // (r10, r10, r20, r21) - t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0)); - // (r10, r20, r21, r10) - t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); - // (4*x*y, 4*x*z, 4*y*z, unused) - XMVECTOR xyxzyz = _mm_add_ps(t0, t1); - - // (r21, r20, r10, r10) - t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1)); - // (r12, r12, r02, r01) - t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2)); - // (r12, r02, r01, r12) - t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); - // (4*x*w, 4*y*w, 4*z*w, unused) - XMVECTOR xwywzw = _mm_sub_ps(t0, t1); - xwywzw = _mm_mul_ps(XMMPMP, xwywzw); - - // (4*x^2, 4*y^2, 4*x*y, unused) - t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0)); - // (4*z^2, 4*w^2, 4*z*w, unused) - t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2)); - // (4*x*z, 4*y*z, 4*x*w, 4*y*w) - t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1)); - - // (4*x*x, 4*x*y, 4*x*z, 4*x*w) - XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)); - // (4*y*x, 4*y*y, 4*y*z, 4*y*w) - XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2)); - // (4*z*x, 4*z*y, 4*z*z, 4*z*w) - XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0)); - // (4*w*x, 4*w*y, 4*w*z, 4*w*w) - XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2)); - - // Select the row of the tensor-product matrix that has the largest - // magnitude. - t0 = _mm_and_ps(x2gey2, tensor0); - t1 = _mm_andnot_ps(x2gey2, tensor1); - t0 = _mm_or_ps(t0, t1); - t1 = _mm_and_ps(z2gew2, tensor2); - t2 = _mm_andnot_ps(z2gew2, tensor3); - t1 = _mm_or_ps(t1, t2); - t0 = _mm_and_ps(x2py2gez2pw2, t0); - t1 = _mm_andnot_ps(x2py2gez2pw2, t1); - t2 = _mm_or_ps(t0, t1); - - // Normalize the row. No division by zero is possible because the - // quaternion is unit-length (and the row is a nonzero multiple of - // the quaternion). 
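- // (Each tensor row equals 4*q[i]*q, so the division returns q up to an
- // overall sign, which represents the same rotation.)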
- t0 = XMVector4Length(t2); - return _mm_div_ps(t2, t0); -#endif -} - -//------------------------------------------------------------------------------ -// Conversion operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV XMQuaternionToAxisAngle -( - XMVECTOR* pAxis, - float* pAngle, - FXMVECTOR Q -) -{ - assert(pAxis); - assert(pAngle); - - *pAxis = Q; - - *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); -} - -/**************************************************************************** - * - * Plane - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneEqual -( - FXMVECTOR P1, - FXMVECTOR P2 -) -{ - return XMVector4Equal(P1, P2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneNearEqual -( - FXMVECTOR P1, - FXMVECTOR P2, - FXMVECTOR Epsilon -) -{ - XMVECTOR NP1 = XMPlaneNormalize(P1); - XMVECTOR NP2 = XMPlaneNormalize(P2); - return XMVector4NearEqual(NP1, NP2, Epsilon); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneNotEqual -( - FXMVECTOR P1, - FXMVECTOR P2 -) -{ - return XMVector4NotEqual(P1, P2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneIsNaN -( - FXMVECTOR P -) -{ - return XMVector4IsNaN(P); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneIsInfinite -( - FXMVECTOR P -) -{ - return XMVector4IsInfinite(P); -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneDot -( - FXMVECTOR P, - FXMVECTOR V -) -{ - return XMVector4Dot(P, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneDotCoord -( - FXMVECTOR P, - FXMVECTOR V -) -{ - // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] - - XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); - XMVECTOR Result = XMVector4Dot(P, V3); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneDotNormal -( - FXMVECTOR P, - FXMVECTOR V -) -{ - return XMVector3Dot(P, V); -} - -//------------------------------------------------------------------------------ -// XMPlaneNormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. 
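-// Example (illustrative): P = (0,0,2,4) normalizes to approximately
-// (0,0,1,2) -- the same plane, now with a unit-length normal.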
- -inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst -( - FXMVECTOR P -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMVECTOR Result = XMVector3ReciprocalLengthEst(P); - return XMVectorMultiply(P, Result); - -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, P); -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(P,P); - // x=Dot.y, y=Dot.z - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.x = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.z - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.x = (x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vDot = _mm_rsqrt_ps(vDot); - // Get the reciprocal - vDot = _mm_mul_ps(vDot,P); - return vDot; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneNormalize -( - FXMVECTOR P -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2])); - // Prevent divide by zero - if (fLengthSq) { - fLengthSq = 1.0f/fLengthSq; - } - { - XMVECTOR vResult = { - P.vector4_f32[0]*fLengthSq, - P.vector4_f32[1]*fLengthSq, - P.vector4_f32[2]*fLengthSq, - P.vector4_f32[3]*fLengthSq - }; - return vResult; - } -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vLength = XMVector3ReciprocalLength(P); - return XMVectorMultiply( P, vLength ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(P,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vLengthSq); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z only - XMVECTOR vLengthSq = _mm_mul_ps(P,P); - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(P,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vLengthSq); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine -( - FXMVECTOR P, - FXMVECTOR LinePoint1, - FXMVECTOR LinePoint2 -) -{ - XMVECTOR V1 = XMVector3Dot(P, LinePoint1); - XMVECTOR V2 = XMVector3Dot(P, LinePoint2); - XMVECTOR D = XMVectorSubtract(V1, V2); - - XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); - VT = XMVectorDivide(VT, D); - - XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); - Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); - - const XMVECTOR Zero = 
XMVectorZero(); - XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); - - return XMVectorSelect(Point, g_XMQNaN.v, Control); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV XMPlaneIntersectPlane -( - XMVECTOR* pLinePoint1, - XMVECTOR* pLinePoint2, - FXMVECTOR P1, - FXMVECTOR P2 -) -{ - assert(pLinePoint1); - assert(pLinePoint2); - - XMVECTOR V1 = XMVector3Cross(P2, P1); - - XMVECTOR LengthSq = XMVector3LengthSq(V1); - - XMVECTOR V2 = XMVector3Cross(P2, V1); - - XMVECTOR P1W = XMVectorSplatW(P1); - XMVECTOR Point = XMVectorMultiply(V2, P1W); - - XMVECTOR V3 = XMVector3Cross(V1, P1); - - XMVECTOR P2W = XMVectorSplatW(P2); - Point = XMVectorMultiplyAdd(V3, P2W, Point); - - XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); - - XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); - - XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); - *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control); - *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneTransform -( - FXMVECTOR P, - FXMMATRIX M -) -{ - XMVECTOR W = XMVectorSplatW(P); - XMVECTOR Z = XMVectorSplatZ(P); - XMVECTOR Y = XMVectorSplatY(P); - XMVECTOR X = XMVectorSplatX(P); - - XMVECTOR Result = XMVectorMultiply(W, M.r[3]); - Result = XMVectorMultiplyAdd(Z, M.r[2], Result); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - return Result; -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream -( - XMFLOAT4* pOutputStream, - size_t OutputStride, - const XMFLOAT4* pInputStream, - size_t InputStride, - size_t PlaneCount, - FXMMATRIX M -) -{ - return XMVector4TransformStream(pOutputStream, - OutputStride, - pInputStream, - InputStride, - PlaneCount, - M); -} - -//------------------------------------------------------------------------------ -// Conversion operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal -( - FXMVECTOR Point, - FXMVECTOR Normal -) -{ - XMVECTOR W = XMVector3Dot(Point, Normal); - W = XMVectorNegate(W); - return XMVectorSelect(W, Normal, g_XMSelect1110.v); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneFromPoints -( - FXMVECTOR Point1, - FXMVECTOR Point2, - FXMVECTOR Point3 -) -{ - XMVECTOR V21 = XMVectorSubtract(Point1, Point2); - XMVECTOR V31 = XMVectorSubtract(Point1, Point3); - - XMVECTOR N = XMVector3Cross(V21, V31); - N = XMVector3Normalize(N); - - XMVECTOR D = XMPlaneDotNormal(N, Point1); - D = XMVectorNegate(D); - - XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); - - return Result; -} - -/**************************************************************************** - * - * Color - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool 
XM_CALLCONV XMColorEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4Equal(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorNotEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4NotEqual(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorGreater -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4Greater(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorGreaterOrEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4GreaterOrEqual(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorLess -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4Less(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorLessOrEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4LessOrEqual(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorIsNaN -( - FXMVECTOR C -) -{ - return XMVector4IsNaN(C); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorIsInfinite -( - FXMVECTOR C -) -{ - return XMVector4IsInfinite(C); -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorNegative -( - FXMVECTOR vColor -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - 1.0f - vColor.vector4_f32[0], - 1.0f - vColor.vector4_f32[1], - 1.0f - vColor.vector4_f32[2], - vColor.vector4_f32[3] - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3); - return vaddq_f32(vTemp,g_XMOne3); -#elif defined(_XM_SSE_INTRINSICS_) - // Negate only x,y and z. 
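- // (XOR flips the sign bits of x,y,z giving (-r,-g,-b,a); adding (1,1,1,0)
- // then yields (1-r,1-g,1-b,a), the complement with alpha preserved.)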
- XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3); - // Add 1,1,1,0 to -x,-y,-z,w - return _mm_add_ps(vTemp,g_XMOne3); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorModulate -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVectorMultiply(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation -( - FXMVECTOR vColor, - float fSaturation -) -{ - // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; - // Result = (C - Luminance) * Saturation + Luminance; - -#if defined(_XM_NO_INTRINSICS_) - const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; - - float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]); - XMVECTOR vResult; - vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance; - vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance; - vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance; - vResult.vector4_f32[3] = vColor.vector4_f32[3]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; - XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); - XMVECTOR vResult = vsubq_f32(vColor, vLuminance); - vResult = vmlaq_n_f32( vLuminance, vResult, fSaturation ); - return vbslq_f32( g_XMSelect1110, vResult, vColor ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; - XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); -// Splat fSaturation - XMVECTOR vSaturation = _mm_set_ps1(fSaturation); -// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; - XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance); - vResult = _mm_mul_ps(vResult,vSaturation); - vResult = _mm_add_ps(vResult,vLuminance); -// Retain w from the source color - vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w - vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorAdjustContrast -( - FXMVECTOR vColor, - float fContrast -) -{ - // Result = (vColor - 0.5f) * fContrast + 0.5f; - -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f, - ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f, - ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f, - vColor.vector4_f32[3] // Leave W untouched - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); - vResult = vmlaq_n_f32( g_XMOneHalf.v, vResult, fContrast ); - return vbslq_f32( g_XMSelect1110, vResult, vColor ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale - XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source) - vResult = _mm_mul_ps(vResult,vScale); // Mul by scale - vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f -// Retain w from the source color - vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = 
vColor.z,w=vColor.w - vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ) -{ - XMVECTOR r = XMVectorSplatX( rgb ); - XMVECTOR g = XMVectorSplatY( rgb ); - XMVECTOR b = XMVectorSplatZ( rgb ); - - XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) ); - XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) ); - - XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf ); - - XMVECTOR d = XMVectorSubtract( max, min ); - - XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 ); - - if ( XMVector3Less( d, g_XMEpsilon ) ) - { - // Achromatic, assume H and S of 0 - return XMVectorSelect( la, g_XMZero, g_XMSelect1100 ); - } - else - { - XMVECTOR s, h; - - XMVECTOR d2 = XMVectorAdd( min, max ); - - if ( XMVector3Greater( l, g_XMOneHalf ) ) - { - // d / (2-max-min) - s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) ); - } - else - { - // d / (max+min) - s = XMVectorDivide( d, d2 ); - } - - if ( XMVector3Equal( r, max ) ) - { - // Red is max - h = XMVectorDivide( XMVectorSubtract( g, b ), d ); - } - else if ( XMVector3Equal( g, max ) ) - { - // Green is max - h = XMVectorDivide( XMVectorSubtract( b, r ), d ); - h = XMVectorAdd( h, g_XMTwo ); - } - else - { - // Blue is max - h = XMVectorDivide( XMVectorSubtract( r, g ), d ); - h = XMVectorAdd( h, g_XMFour ); - } - - h = XMVectorDivide( h, g_XMSix ); - - if ( XMVector3Less( h, g_XMZero ) ) - h = XMVectorAdd( h, g_XMOne ); - - XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 ); - return XMVectorSelect( s, lha, g_XMSelect1011 ); - } -} - -//------------------------------------------------------------------------------ - -namespace Internal -{ - -inline XMVECTOR XM_CALLCONV XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h ) -{ - static const XMVECTORF32 oneSixth = { 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f }; - static const XMVECTORF32 twoThirds = { 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f }; - - XMVECTOR t = h; - - if ( XMVector3Less( t, g_XMZero ) ) - t = XMVectorAdd( t, g_XMOne ); - - if ( XMVector3Greater( t, g_XMOne ) ) - t = XMVectorSubtract( t, g_XMOne ); - - if ( XMVector3Less( t, oneSixth ) ) - { - // p + (q - p) * 6 * t - XMVECTOR t1 = XMVectorSubtract( q, p ); - XMVECTOR t2 = XMVectorMultiply( g_XMSix, t ); - return XMVectorMultiplyAdd( t1, t2, p ); - } - - if ( XMVector3Less( t, g_XMOneHalf ) ) - return q; - - if ( XMVector3Less( t, twoThirds ) ) - { - // p + (q - p) * 6 * (2/3 - t) - XMVECTOR t1 = XMVectorSubtract( q, p ); - XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) ); - return XMVectorMultiplyAdd( t1, t2, p ); - } - - return p; -} - -}; // namespace Internal - -inline XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ) -{ - static const XMVECTORF32 oneThird = { 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f }; - - XMVECTOR s = XMVectorSplatY( hsl ); - XMVECTOR l = XMVectorSplatZ( hsl ); - - if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) ) - { - // Achromatic - return XMVectorSelect( hsl, l, g_XMSelect1110 ); - } - else - { - XMVECTOR h = XMVectorSplatX( hsl ); - - XMVECTOR q; - if ( XMVector3Less( l, g_XMOneHalf ) ) - { - q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) ); - } - else - { - q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) ); - } - - XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q ); - - 
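// One channel per call below: the hue is shifted by +1/3 for red, used as-is
- // for green, and shifted by -1/3 for blue; XMColorHue2Clr wraps the shifted
- // hue back into [0,1].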
- XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) );
- XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h );
- XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) );
-
- XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 );
- XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 );
-
- return XMVectorSelect( ba, rg, g_XMSelect1100 );
- }
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb )
-{
- XMVECTOR r = XMVectorSplatX( rgb );
- XMVECTOR g = XMVectorSplatY( rgb );
- XMVECTOR b = XMVectorSplatZ( rgb );
-
- XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
- XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) );
-
- XMVECTOR d = XMVectorSubtract( v, min );
-
- XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? g_XMZero : XMVectorDivide( d, v );
-
- if ( XMVector3Less( d, g_XMEpsilon ) )
- {
- // Achromatic, assume H of 0
- XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 );
- XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
- return XMVectorSelect( s, hva, g_XMSelect1011 );
- }
- else
- {
- XMVECTOR h;
-
- if ( XMVector3Equal( r, v ) )
- {
- // Red is max
- h = XMVectorDivide( XMVectorSubtract( g, b ), d );
-
- if ( XMVector3Less( g, b ) )
- h = XMVectorAdd( h, g_XMSix );
- }
- else if ( XMVector3Equal( g, v ) )
- {
- // Green is max
- h = XMVectorDivide( XMVectorSubtract( b, r ), d );
- h = XMVectorAdd( h, g_XMTwo );
- }
- else
- {
- // Blue is max
- h = XMVectorDivide( XMVectorSubtract( r, g ), d );
- h = XMVectorAdd( h, g_XMFour );
- }
-
- h = XMVectorDivide( h, g_XMSix );
-
- XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 );
- XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
- return XMVectorSelect( s, hva, g_XMSelect1011 );
- }
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv )
-{
- XMVECTOR h = XMVectorSplatX( hsv );
- XMVECTOR s = XMVectorSplatY( hsv );
- XMVECTOR v = XMVectorSplatZ( hsv );
-
- XMVECTOR h6 = XMVectorMultiply( h, g_XMSix );
-
- XMVECTOR i = XMVectorFloor( h6 );
- XMVECTOR f = XMVectorSubtract( h6, i );
-
- // p = v* (1-s)
- XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) );
-
- // q = v*(1-f*s)
- XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) );
-
- // t = v*(1 - (1-f)*s)
- XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) );
-
- int ii = static_cast<int>( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) );
-
- XMVECTOR _rgb;
-
- switch (ii)
- {
- case 0: // rgb = vtp
- {
- XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 );
- _rgb = XMVectorSelect( p, vt, g_XMSelect1100 );
- }
- break;
- case 1: // rgb = qvp
- {
- XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 );
- _rgb = XMVectorSelect( p, qv, g_XMSelect1100 );
- }
- break;
- case 2: // rgb = pvt
- {
- XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 );
- _rgb = XMVectorSelect( t, pv, g_XMSelect1100 );
- }
- break;
- case 3: // rgb = pqv
- {
- XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 );
- _rgb = XMVectorSelect( v, pq, g_XMSelect1100 );
- }
- break;
- case 4: // rgb = tpv
- {
- XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 );
- _rgb = XMVectorSelect( v, tp, g_XMSelect1100 );
- }
- break;
- default: // rgb = vpq
- {
- XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 );
-
_rgb = XMVectorSelect( q, vp, g_XMSelect1100 ); - } - break; - } - - return XMVectorSelect( hsv, _rgb, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Scale0 = { 0.299f, -0.147f, 0.615f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.587f, -0.289f, -0.515f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.114f, 0.436f, -0.100f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( rgb, M ); - - return XMVectorSelect( rgb, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ) -{ - static const XMVECTORF32 Scale1 = { 0.0f, -0.395f, 2.032f, 0.0f }; - static const XMVECTORF32 Scale2 = { 1.140f, -0.581f, 0.0f, 0.0f }; - - XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( yuv, M ); - - return XMVectorSelect( yuv, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f, 0.6150f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.0722f, 0.4351f, -0.0564f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( rgb, M ); - - return XMVectorSelect( rgb, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ) -{ - static const XMVECTORF32 Scale1 = { 0.0f, -0.2153f, 2.1324f, 0.0f }; - static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f, 0.0f, 0.0f }; - - XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( yuv, M ); - - return XMVectorSelect( yuv, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f }; - static const XMVECTORF32 Scale = { 1.f/0.17697f, 1.f/0.17697f, 1.f/0.17697f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale ); - - return XMVectorSelect( rgb, clr, g_XMSelect1110 ); -} - -inline XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ) -{ - static const XMVECTORF32 Scale0 = { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f }; - static const XMVECTORF32 Scale1 = { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f }; - static const XMVECTORF32 Scale2 = { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f }; - static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M ); - - return XMVectorSelect( xyz, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ) -{ - static const XMVECTORF32 Scale0 = { 3.2406f, -0.9689f, 0.0557f, 0.0f }; - 
static const XMVECTORF32 Scale1 = { -1.5372f, 1.8758f, -0.2040f, 0.0f }; - static const XMVECTORF32 Scale2 = { -0.4986f, 0.0415f, 1.0570f, 0.0f }; - static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f }; - static const XMVECTORF32 Exp = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR lclr = XMVector3Transform( xyz, M ); - - XMVECTOR sel = XMVectorGreater( lclr, Cutoff ); - - // clr = 12.92 * lclr for lclr <= 0.0031308f - XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale ); - - // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) - XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA ); - - XMVECTOR clr = XMVectorSelect( smallC, largeC, sel ); - - return XMVectorSelect( xyz, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ) -{ - static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f }; - static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f }; - static const XMVECTORF32 Exp = { 2.4f, 2.4f, 2.4f, 1.0f }; - - XMVECTOR sel = XMVectorGreater( srgb, Cutoff ); - - // lclr = clr / 12.92 - XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale ); - - // lclr = pow( (clr + a) / (1+a), 2.4 ) - XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp ); - - XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel ); - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( lclr, M ); - - return XMVectorSelect( srgb, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 1.f }; - static const XMVECTORF32 Linear = { 12.92f, 12.92f, 12.92f, 1.f }; - static const XMVECTORF32 Scale = { 1.055f, 1.055f, 1.055f, 1.f }; - static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; - static const XMVECTORF32 InvGamma = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.f }; - - XMVECTOR V = XMVectorSaturate(rgb); - XMVECTOR V0 = XMVectorMultiply( V, Linear ); - XMVECTOR V1 = Scale * XMVectorPow( V, InvGamma ) - Bias; - XMVECTOR select = XMVectorLess( V, Cutoff ); - V = XMVectorSelect( V1, V0, select ); - return XMVectorSelect( rgb, V, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ) -{ - static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 1.f }; - static const XMVECTORF32 ILinear = { 1.f/12.92f, 1.f/12.92f, 1.f/12.92f, 1.f }; - static const XMVECTORF32 Scale = { 1.f/1.055f, 1.f/1.055f, 1.f/1.055f, 1.f }; - static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; - static const XMVECTORF32 Gamma = { 2.4f, 2.4f, 2.4f, 1.f }; - - XMVECTOR V = XMVectorSaturate(srgb); - XMVECTOR V0 = XMVectorMultiply( V, ILinear ); - XMVECTOR V1 = XMVectorPow( (V + Bias) * Scale, Gamma ); - XMVECTOR select = XMVectorGreater( V, Cutoff ); - V = XMVectorSelect( V0, V1, select ); - return XMVectorSelect( srgb, V, g_XMSelect1110 ); -} - 
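-// Illustrative round trip for the two helpers above (a sketch; the variable
-// names and input values are arbitrary):
-//
-//   XMVECTOR linear  = XMVectorSet(0.5f, 0.25f, 0.125f, 1.f);
-//   XMVECTOR encoded = XMColorRGBToSRGB(linear);   // gamma-encodes r,g,b; w kept
-//   XMVECTOR decoded = XMColorSRGBToRGB(encoded);  // decoded ~= linear
-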
-/**************************************************************************** - * - * Miscellaneous - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline bool XMVerifyCPUSupport() -{ -#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - int CPUInfo[4] = { -1 }; - __cpuid(CPUInfo, 0); - -#ifdef __AVX2__ - if (CPUInfo[0] < 7) - return false; -#else - if (CPUInfo[0] < 1) - return false; -#endif - - __cpuid(CPUInfo, 1); - -#ifdef __AVX2__ - // The compiler can emit FMA3 instructions even without explicit intrinsics use - if ((CPUInfo[2] & 0x38081001) != 0x38081001) - return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support -#elif defined(_XM_F16C_INTRINSICS_) - if ((CPUInfo[2] & 0x38080001) != 0x38080001) - return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support -#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) - if ((CPUInfo[2] & 0x18080001) != 0x18080001) - return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support -#elif defined(_XM_SSE4_INTRINSICS_) - if ((CPUInfo[2] & 0x80001) != 0x80001) - return false; // No SSE3/SSE4.1 support -#elif defined(_XM_SSE3_INTRINSICS_) - if (!(CPUInfo[2] & 0x1)) - return false; // No SSE3 support -#endif - - // The x64 processor model requires SSE2 support, but no harm in checking - if ((CPUInfo[3] & 0x6000000) != 0x6000000) - return false; // No SSE2/SSE support - -#ifdef __AVX2__ - __cpuidex(CPUInfo, 7, 0); - if (!(CPUInfo[1] & 0x20)) - return false; // No AVX2 support -#endif - - return true; -#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - // ARM-NEON support is required for the Windows on ARM platform - return true; -#else - // No intrinsics path always supported - return true; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMFresnelTerm -( - FXMVECTOR CosIncidentAngle, - FXMVECTOR RefractionIndex -) -{ - assert(!XMVector4IsInfinite(CosIncidentAngle)); - - // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where - // c = CosIncidentAngle - // g = sqrt(c^2 + RefractionIndex^2 - 1) - -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); - G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); - G = XMVectorAbs(G); - G = XMVectorSqrt(G); - - XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); - XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); - - XMVECTOR V0 = XMVectorMultiply(D, D); - XMVECTOR V1 = XMVectorMultiply(S, S); - V1 = XMVectorReciprocal(V1); - V0 = XMVectorMultiply(g_XMOneHalf.v, V0); - V0 = XMVectorMultiply(V0, V1); - - XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); - XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); - V2 = XMVectorMultiply(V2, V2); - V3 = XMVectorMultiply(V3, V3); - V3 = XMVectorReciprocal(V3); - V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); - - XMVECTOR Result = XMVectorMultiply(V0, V2); - - Result = XMVectorSaturate(Result); - - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) - XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex); - XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle); - G = _mm_sub_ps(G,g_XMOne); - vTemp = _mm_add_ps(vTemp,G); - // max((0-vTemp),vTemp) == abs(vTemp) - // The abs is 
needed to deal with refraction and cosine being zero - G = _mm_setzero_ps(); - G = _mm_sub_ps(G,vTemp); - G = _mm_max_ps(G,vTemp); - // Last operation, the sqrt() - G = _mm_sqrt_ps(G); - - // Calc G-C and G+C - XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle); - XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle); - // Perform the term (0.5f *(g - c)^2) / (g + c)^2 - XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC); - vTemp = _mm_mul_ps(GAddC,GAddC); - vResult = _mm_mul_ps(vResult,g_XMOneHalf); - vResult = _mm_div_ps(vResult,vTemp); - // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) - GAddC = _mm_mul_ps(GAddC,CosIncidentAngle); - GSubC = _mm_mul_ps(GSubC,CosIncidentAngle); - GAddC = _mm_sub_ps(GAddC,g_XMOne); - GSubC = _mm_add_ps(GSubC,g_XMOne); - GAddC = _mm_mul_ps(GAddC,GAddC); - GSubC = _mm_mul_ps(GSubC,GSubC); - GAddC = _mm_div_ps(GAddC,GSubC); - GAddC = _mm_add_ps(GAddC,g_XMOne); - // Multiply the two term parts - vResult = _mm_mul_ps(vResult,GAddC); - // Clamp to 0.0 - 1.0f - vResult = _mm_max_ps(vResult,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XMScalarNearEqual -( - float S1, - float S2, - float Epsilon -) -{ - float Delta = S1 - S2; - return (fabsf(Delta) <= Epsilon); -} - -//------------------------------------------------------------------------------ -// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI -inline float XMScalarModAngle -( - float Angle -) -{ - // Note: The modulo is performed with unsigned math only to work - // around a precision error on numbers that are close to PI - - // Normalize the range from 0.0f to XM_2PI - Angle = Angle + XM_PI; - // Perform the modulo, unsigned - float fTemp = fabsf(Angle); - fTemp = fTemp - (XM_2PI * (float)((int32_t)(fTemp/XM_2PI))); - // Restore the number to the range of -XM_PI to XM_PI-epsilon - fTemp = fTemp - XM_PI; - // If the modulo'd value was negative, restore negation - if (Angle<0.0f) { - fTemp = -fTemp; - } - return fTemp; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarSin -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). - if (y > XM_PIDIV2) - { - y = XM_PI - y; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - } - - // 11-degree minimax approximation - float y2 = y * y; - return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarSinEst -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). 
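- // e.g. Value = 4: the quotient rounds to 1, so y = 4 - 2*pi ~= -2.283; since
- // y < -pi/2, it maps to y = -pi - y ~= -0.858, and sin(-0.858) ~= sin(4).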
- if (y > XM_PIDIV2) - { - y = XM_PI - y; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - } - - // 7-degree minimax approximation - float y2 = y * y; - return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarCos -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - // 10-degree minimax approximation - float y2 = y*y; - float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; - return sign*p; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarCosEst -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - // 6-degree minimax approximation - float y2 = y * y; - float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; - return sign*p; -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XMScalarSinCos -( - float* pSin, - float* pCos, - float Value -) -{ - assert(pSin); - assert(pCos); - - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - float y2 = y * y; - - // 11-degree minimax approximation - *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; - - // 10-degree minimax approximation - float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; - *pCos = sign*p; -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XMScalarSinCosEst -( - float* pSin, - float* pCos, - float Value -) -{ - assert(pSin); - assert(pCos); - - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. 
- float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - float y2 = y * y; - - // 7-degree minimax approximation - *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; - - // 6-degree minimax approximation - float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; - *pCos = sign*p; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarASin -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 7-degree minimax approximation - float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; - result *= root; // acos(|x|) - - // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) - return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); -} - -//------------------------------------------------------------------------------ - -inline float XMScalarASinEst -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 3-degree minimax approximation - float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f; - result *= root; // acos(|x|) - - // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) - return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); -} - -//------------------------------------------------------------------------------ - -inline float XMScalarACos -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 7-degree minimax approximation - float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; - result *= root; - - // acos(x) = pi - acos(-x) when x < 0 - return (nonnegative ? result : XM_PI - result); -} - -//------------------------------------------------------------------------------ - -inline float XMScalarACosEst -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 3-degree minimax approximation - float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f; - result *= root; - - // acos(x) = pi - acos(-x) when x < 0 - return (nonnegative ? 
result : XM_PI - result); -} - +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsNaN +( + FXMVECTOR Q +) +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsInfinite +( + FXMVECTOR Q +) +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsIdentity +( + FXMVECTOR Q +) +{ + return XMVector4Equal(Q, g_XMIdentityR3.v); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) + + // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), + // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), + // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), + // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result = { + (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), + (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), + (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), + (Q2.vector4_f32[3] * 
Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; + static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; + static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; + + float32x2_t Q2L = vget_low_f32(Q2); + float32x2_t Q2H = vget_high_f32(Q2); + + float32x4_t Q2X = vdupq_lane_f32( Q2L, 0 ); + float32x4_t Q2Y = vdupq_lane_f32( Q2L, 1 ); + float32x4_t Q2Z = vdupq_lane_f32( Q2H, 0 ); + XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); + + // Mul by Q1WZYX + float32x4_t vTemp = vrev64q_f32(Q1); + vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) ); + Q2X = vmulq_f32(Q2X,vTemp); + vResult = vmlaq_f32( vResult, Q2X, ControlWZYX ); + + // Mul by Q1ZWXY + vTemp = vrev64q_u32(vTemp); + Q2Y = vmulq_f32(Q2Y,vTemp); + vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); + + // Mul by Q1YXWZ + vTemp = vrev64q_u32(vTemp); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2Z = vmulq_f32(Q2Z,vTemp); + vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; + static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; + static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3)); + Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0)); + Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1)); + Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult,Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X,Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1)); + // Flip the signs on y and z + Q2X = _mm_mul_ps(Q2X,ControlWZYX); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y,ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle); + vResult = _mm_add_ps(vResult,Q2X); + // Flip the signs on x and w + Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ); + Q2Y = _mm_add_ps(Q2Y,Q2Z); + vResult = _mm_add_ps(vResult,Q2Y); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq +( + FXMVECTOR Q +) +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength +( + FXMVECTOR Q +) +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLength +( + FXMVECTOR Q +) +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst +( + FXMVECTOR Q +) +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + 
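+// Usage sketch (illustrative): the Est form trades accuracy for speed, so a
+// quaternion that feeds further math is better renormalized with the exact
+// XMQuaternionNormalize below. For two arbitrary unit quaternions q1 and q2:
+//
+//     XMVECTOR q = XMQuaternionMultiply(q1, q2);  // rotate by q1, then q2
+//     q = XMQuaternionNormalize(q);               // re-unitize before reuse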
+inline XMVECTOR XM_CALLCONV XMQuaternionNormalize +( + FXMVECTOR Q +) +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionConjugate +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result = { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; + return vmulq_f32(Q, NegativeOne3.v ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; + return _mm_mul_ps(Q,NegativeOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionInverse +( + FXMVECTOR Q +) +{ + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR L = XMVector4LengthSq(Q); + XMVECTOR Conjugate = XMQuaternionConjugate(Q); + + XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + XMVECTOR Result = XMVectorDivide(Conjugate, L); + + Result = XMVectorSelect(Result, Zero, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLn +( + FXMVECTOR Q +) +{ + static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + + XMVECTOR QW = XMVectorSplatW(Q); + XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); + + XMVECTOR Theta = XMVectorACos(QW); + XMVECTOR SinTheta = XMVectorSin(Theta); + + XMVECTOR S = XMVectorDivide(Theta,SinTheta); + + XMVECTOR Result = XMVectorMultiply(Q0, S); + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionExp +( + FXMVECTOR Q +) +{ + XMVECTOR Theta = XMVector3Length(Q); + + XMVECTOR SinTheta, CosTheta; + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + XMVECTOR S = XMVectorDivide(SinTheta, Theta); + + XMVECTOR Result = XMVectorMultiply(Q, S); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + float t +) +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne.v, 
g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + SinOmega = XMVectorSqrt(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR SignMask = XMVectorSplatSignMask(); + XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(g_XMIdentityR0.v, V01); + + XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); + + XMVECTOR S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + XMVECTOR Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + static const XMVECTORU32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000}; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega); + SinOmega = _mm_sub_ps(g_XMOne,SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1)); + V01 = _mm_and_ps(V01,g_XMMaskXY); + V01 = _mm_xor_ps(V01,SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + XMVECTOR S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + XMVECTOR Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result,S1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + float t +) +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + HXMVECTOR T +) +{ + assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) ); + + XMVECTOR TP = T; + const XMVECTOR Two = XMVectorSplatConstant(2, 0); + + XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); + XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3 +) +{ + assert(pA); + assert(pB); + 
assert(pC); + + XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + XMVECTOR SQ2 = XMVectorNegate(Q2); + + XMVECTOR Control1 = XMVectorLess(LS12, LD12); + SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + XMVECTOR SQ0 = XMVectorNegate(Q0); + + XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + XMVECTOR SQ3 = XMVectorNegate(Q3); + + XMVECTOR Control0 = XMVectorLess(LS01, LD01); + XMVECTOR Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + XMVECTOR InvQ1 = XMQuaternionInverse(Q1); + XMVECTOR InvQ2 = XMQuaternionInverse(SQ2); + + XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + float f, + float g +) +{ + float s = f + g; + + XMVECTOR Result; + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s); + XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR F, + HXMVECTOR G +) +{ + assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) ); + assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) ); + + const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16); + + XMVECTOR S = XMVectorAdd(F, G); + + XMVECTOR Result; + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S); + XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S); + XMVECTOR GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() +{ + return g_XMIdentityR3.v; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) +{ + 
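// Pitch is rotation about the X axis, Yaw about the Y axis, and Roll about
+    // the Z axis, all in radians; the rotations are applied roll-first, then
+    // pitch, then yaw.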
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+)
+{
+    static const XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
+    N = _mm_or_ps(N,g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine,&vCosine,Scale);
+    Scale = _mm_and_ps(vSine,g_XMMask3);
+    vCosine = _mm_and_ps(vCosine,g_XMMaskW);
+    Scale = _mm_or_ps(Scale,vCosine);
+    N = _mm_mul_ps(N,Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+)
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix
+(
+    FXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 q;
+    float r22 = M.m[2][2];
+    if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2
+    {
+        float dif10 = M.m[1][1] - M.m[0][0];
+        float omr22 = 1.f - r22;
+        if (dif10 <= 0.f) // x^2 >= y^2
+        {
+            float fourXSqr = omr22 - dif10;
+            float inv4x = 0.5f / sqrtf(fourXSqr);
+            q.f[0] = fourXSqr*inv4x;
+            q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x;
+            q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x;
+            q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x;
+        }
+        else // y^2 >= x^2
+        {
+            float fourYSqr = omr22 + dif10;
+            float inv4y = 0.5f / sqrtf(fourYSqr);
+            q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y;
+            q.f[1] = fourYSqr*inv4y;
+            q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y;
+            q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y;
+        }
+    }
+    else // z^2 + w^2 >= x^2 + y^2
+    {
+        float sum10 = M.m[1][1] + M.m[0][0];
+        float opr22 = 1.f + r22;
+        if (sum10 <= 0.f) // z^2 >= w^2
+        {
+            float fourZSqr = opr22 - sum10;
+            float inv4z = 0.5f / sqrtf(fourZSqr);
+            q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z;
+            q.f[1] = 
(M.m[1][2] + M.m[2][1])*inv4z; + q.f[2] = fourZSqr*inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w; + q.f[3] = fourWSqr*inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; + static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 }; + static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; + + XMVECTOR r0 = M.r[0]; + XMVECTOR r1 = M.r[1]; + XMVECTOR r2 = M.r[2]; + + XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0); + XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + XMVECTOR r11mr00 = vsubq_f32(r11, r00); + XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + XMVECTOR r11pr00 = vaddq_f32(r11, r00); + XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = vmulq_f32( XMPMMP, r00 ); + XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 ); + x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 ); + x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne ); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + XMVECTOR t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) ); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 ); + t1 = vbslq_f32( Select0110, t1, r10 ); + + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) ); + + // (r12, r02, r01, r12) + XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) ); + XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 ); + t1 = vbslq_f32( Select0110, t2, t3 ); + + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = vsubq_f32(t0, t1); + xwywzw = vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32( xyxzyz, xyxzyz, 3 ); + t1 = vbslq_f32( Select0110, t0, x2y2z2w2 ); + t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 ); + XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 ); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 ); + t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 ); + XMVECTOR tensor1 = vbslq_f32( g_XMSelect1110, t0, t1 ); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) ); + XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 ); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 ); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32( x2gey2, tensor0, tensor1 ); + t1 = vbslq_f32( z2gew2, tensor2, tensor3 ); + t2 = vbslq_f32( x2py2gez2pw2, t0, t1 ); + + // Normalize the row. 
No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (+r00, -r00, -r00, +r00) + XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00); + + // (-r11, +r11, -r11, +r11) + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + + // (-r22, -r22, +r22, +r22) + XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1); + x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2); + x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1)); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. 
No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN +( + FXMVECTOR P +) +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite +( + FXMVECTOR P +) +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) +{ + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + + XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMVector4Dot(P, V3); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
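+// The estimate scales all four components (the w distance term included) by
+// an estimated reciprocal length of the xyz normal; prefer XMPlaneNormalize
+// below when the plane equation must stay accurate.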
+ +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3ReciprocalLengthEst(P); + return XMVectorMultiply(P, Result); + +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P,P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot,P); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq) { + fLengthSq = 1.0f/fLengthSq; + } + { + XMVECTOR vResult = { + P.vector4_f32[0]*fLengthSq, + P.vector4_f32[1]*fLengthSq, + P.vector4_f32[2]*fLengthSq, + P.vector4_f32[3]*fLengthSq + }; + return vResult; + } +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLength = XMVector3ReciprocalLength(P); + return XMVectorMultiply( P, vLength ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P,P); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = 
XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + assert(pLinePoint1); + assert(pLinePoint2); + + XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX M +) +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, M.r[3]); + Result = XMVectorMultiplyAdd(Z, M.r[2], Result); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX M +) +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + M); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) +{ + XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) +{ + XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool 
XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN +( + FXMVECTOR C +) +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite +( + FXMVECTOR C +) +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative +( + FXMVECTOR vColor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3); + return vaddq_f32(vTemp,g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
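+    // g_XMNegate3 flips the sign bits of x, y, and z, producing (-x,-y,-z,w);
+    // adding g_XMOne3 = (1,1,1,0) below then yields (1-x, 1-y, 1-z, w).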
+ XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp,g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + +#if defined(_XM_NO_INTRINSICS_) + const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + + float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32( vLuminance, vResult, fSaturation ); + return vbslq_f32( g_XMSelect1110, vResult, vColor ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); +// Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); +// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance); + vResult = _mm_mul_ps(vResult,vSaturation); + vResult = _mm_add_ps(vResult,vLuminance); +// Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32( g_XMOneHalf.v, vResult, fContrast ); + return vbslq_f32( g_XMSelect1110, vResult, vColor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = _mm_mul_ps(vResult,vScale); // Mul by scale + vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = 
vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ) +{ + XMVECTOR r = XMVectorSplatX( rgb ); + XMVECTOR g = XMVectorSplatY( rgb ); + XMVECTOR b = XMVectorSplatZ( rgb ); + + XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) ); + XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) ); + + XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf ); + + XMVECTOR d = XMVectorSubtract( max, min ); + + XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 ); + + if ( XMVector3Less( d, g_XMEpsilon ) ) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect( la, g_XMZero, g_XMSelect1100 ); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd( min, max ); + + if ( XMVector3Greater( l, g_XMOneHalf ) ) + { + // d / (2-max-min) + s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) ); + } + else + { + // d / (max+min) + s = XMVectorDivide( d, d2 ); + } + + if ( XMVector3Equal( r, max ) ) + { + // Red is max + h = XMVectorDivide( XMVectorSubtract( g, b ), d ); + } + else if ( XMVector3Equal( g, max ) ) + { + // Green is max + h = XMVectorDivide( XMVectorSubtract( b, r ), d ); + h = XMVectorAdd( h, g_XMTwo ); + } + else + { + // Blue is max + h = XMVectorDivide( XMVectorSubtract( r, g ), d ); + h = XMVectorAdd( h, g_XMFour ); + } + + h = XMVectorDivide( h, g_XMSix ); + + if ( XMVector3Less( h, g_XMZero ) ) + h = XMVectorAdd( h, g_XMOne ); + + XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 ); + return XMVectorSelect( s, lha, g_XMSelect1011 ); + } +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + +inline XMVECTOR XM_CALLCONV XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h ) +{ + static const XMVECTORF32 oneSixth = { 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f }; + static const XMVECTORF32 twoThirds = { 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f }; + + XMVECTOR t = h; + + if ( XMVector3Less( t, g_XMZero ) ) + t = XMVectorAdd( t, g_XMOne ); + + if ( XMVector3Greater( t, g_XMOne ) ) + t = XMVectorSubtract( t, g_XMOne ); + + if ( XMVector3Less( t, oneSixth ) ) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract( q, p ); + XMVECTOR t2 = XMVectorMultiply( g_XMSix, t ); + return XMVectorMultiplyAdd( t1, t2, p ); + } + + if ( XMVector3Less( t, g_XMOneHalf ) ) + return q; + + if ( XMVector3Less( t, twoThirds ) ) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract( q, p ); + XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) ); + return XMVectorMultiplyAdd( t1, t2, p ); + } + + return p; +} + +}; // namespace Internal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ) +{ + static const XMVECTORF32 oneThird = { 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f }; + + XMVECTOR s = XMVectorSplatY( hsl ); + XMVECTOR l = XMVectorSplatZ( hsl ); + + if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) ) + { + // Achromatic + return XMVectorSelect( hsl, l, g_XMSelect1110 ); + } + else + { + XMVECTOR h = XMVectorSplatX( hsl ); + + XMVECTOR q; + if ( XMVector3Less( l, g_XMOneHalf ) ) + { + q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) ); + } + else + { + q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) ); + } + + XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q ); + + 
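// q and p bracket each channel's value: XMColorHue2Clr returns p, q, or a
+        // linear ramp between them depending on the channel's hue offset, with
+        // r, g, and b sampled at h + 1/3, h, and h - 1/3 respectively.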
+        XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) );
+        XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h );
+        XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) );
+
+        XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 );
+        XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 );
+
+        return XMVectorSelect( ba, rg, g_XMSelect1100 );
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb )
+{
+    XMVECTOR r = XMVectorSplatX( rgb );
+    XMVECTOR g = XMVectorSplatY( rgb );
+    XMVECTOR b = XMVectorSplatZ( rgb );
+
+    XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
+    XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) );
+
+    XMVECTOR d = XMVectorSubtract( v, min );
+
+    XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? g_XMZero : XMVectorDivide( d, v );
+
+    if ( XMVector3Less( d, g_XMEpsilon ) )
+    {
+        // Achromatic, assume H of 0
+        XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 );
+        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
+        return XMVectorSelect( s, hva, g_XMSelect1011 );
+    }
+    else
+    {
+        XMVECTOR h;
+
+        if ( XMVector3Equal( r, v ) )
+        {
+            // Red is max
+            h = XMVectorDivide( XMVectorSubtract( g, b ), d );
+
+            if ( XMVector3Less( g, b ) )
+                h = XMVectorAdd( h, g_XMSix );
+        }
+        else if ( XMVector3Equal( g, v ) )
+        {
+            // Green is max
+            h = XMVectorDivide( XMVectorSubtract( b, r ), d );
+            h = XMVectorAdd( h, g_XMTwo );
+        }
+        else
+        {
+            // Blue is max
+            h = XMVectorDivide( XMVectorSubtract( r, g ), d );
+            h = XMVectorAdd( h, g_XMFour );
+        }
+
+        h = XMVectorDivide( h, g_XMSix );
+
+        XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 );
+        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
+        return XMVectorSelect( s, hva, g_XMSelect1011 );
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv )
+{
+    XMVECTOR h = XMVectorSplatX( hsv );
+    XMVECTOR s = XMVectorSplatY( hsv );
+    XMVECTOR v = XMVectorSplatZ( hsv );
+
+    XMVECTOR h6 = XMVectorMultiply( h, g_XMSix );
+
+    XMVECTOR i = XMVectorFloor( h6 );
+    XMVECTOR f = XMVectorSubtract( h6, i );
+
+    // p = v* (1-s)
+    XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) );
+
+    // q = v*(1-f*s)
+    XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) );
+
+    // t = v*(1 - (1-f)*s)
+    XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) );
+
+    int ii = static_cast<int>( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) );
+
+    XMVECTOR _rgb;
+
+    switch (ii)
+    {
+    case 0: // rgb = vtp
+        {
+            XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 );
+            _rgb = XMVectorSelect( p, vt, g_XMSelect1100 );
+        }
+        break;
+    case 1: // rgb = qvp
+        {
+            XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 );
+            _rgb = XMVectorSelect( p, qv, g_XMSelect1100 );
+        }
+        break;
+    case 2: // rgb = pvt
+        {
+            XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 );
+            _rgb = XMVectorSelect( t, pv, g_XMSelect1100 );
+        }
+        break;
+    case 3: // rgb = pqv
+        {
+            XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 );
+            _rgb = XMVectorSelect( v, pq, g_XMSelect1100 );
+        }
+        break;
+    case 4: // rgb = tpv
+        {
+            XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 );
+            _rgb = XMVectorSelect( v, tp, g_XMSelect1100 );
+        }
+        break;
+    default: // rgb = vpq
+        {
+            XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 );
+            
_rgb = XMVectorSelect( q, vp, g_XMSelect1100 ); + } + break; + } + + return XMVectorSelect( hsv, _rgb, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.299f, -0.147f, 0.615f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.587f, -0.289f, -0.515f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.114f, 0.436f, -0.100f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( rgb, M ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ) +{ + static const XMVECTORF32 Scale1 = { 0.0f, -0.395f, 2.032f, 0.0f }; + static const XMVECTORF32 Scale2 = { 1.140f, -0.581f, 0.0f, 0.0f }; + + XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( yuv, M ); + + return XMVectorSelect( yuv, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f, 0.6150f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.0722f, 0.4351f, -0.0564f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( rgb, M ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ) +{ + static const XMVECTORF32 Scale1 = { 0.0f, -0.2153f, 2.1324f, 0.0f }; + static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f, 0.0f, 0.0f }; + + XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( yuv, M ); + + return XMVectorSelect( yuv, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f }; + static const XMVECTORF32 Scale = { 1.f/0.17697f, 1.f/0.17697f, 1.f/0.17697f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +inline XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ) +{ + static const XMVECTORF32 Scale0 = { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f }; + static const XMVECTORF32 Scale1 = { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f }; + static const XMVECTORF32 Scale2 = { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f }; + static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M ); + + return XMVectorSelect( xyz, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ) +{ + static const XMVECTORF32 Scale0 = { 3.2406f, -0.9689f, 0.0557f, 0.0f }; + 
static const XMVECTORF32 Scale1 = { -1.5372f, 1.8758f, -0.2040f, 0.0f }; + static const XMVECTORF32 Scale2 = { -0.4986f, 0.0415f, 1.0570f, 0.0f }; + static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f }; + static const XMVECTORF32 Exp = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR lclr = XMVector3Transform( xyz, M ); + + XMVECTOR sel = XMVectorGreater( lclr, Cutoff ); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale ); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA ); + + XMVECTOR clr = XMVectorSelect( smallC, largeC, sel ); + + return XMVectorSelect( xyz, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ) +{ + static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f }; + static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f }; + static const XMVECTORF32 Exp = { 2.4f, 2.4f, 2.4f, 1.0f }; + + XMVECTOR sel = XMVectorGreater( srgb, Cutoff ); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale ); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp ); + + XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel ); + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( lclr, M ); + + return XMVectorSelect( srgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 1.f }; + static const XMVECTORF32 Linear = { 12.92f, 12.92f, 12.92f, 1.f }; + static const XMVECTORF32 Scale = { 1.055f, 1.055f, 1.055f, 1.f }; + static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; + static const XMVECTORF32 InvGamma = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.f }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply( V, Linear ); + XMVECTOR V1 = Scale * XMVectorPow( V, InvGamma ) - Bias; + XMVECTOR select = XMVectorLess( V, Cutoff ); + V = XMVectorSelect( V1, V0, select ); + return XMVectorSelect( rgb, V, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ) +{ + static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 1.f }; + static const XMVECTORF32 ILinear = { 1.f/12.92f, 1.f/12.92f, 1.f/12.92f, 1.f }; + static const XMVECTORF32 Scale = { 1.f/1.055f, 1.f/1.055f, 1.f/1.055f, 1.f }; + static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; + static const XMVECTORF32 Gamma = { 2.4f, 2.4f, 2.4f, 1.f }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply( V, ILinear ); + XMVECTOR V1 = XMVectorPow( (V + Bias) * Scale, Gamma ); + XMVECTOR select = XMVectorGreater( V, Cutoff ); + V = XMVectorSelect( V0, V1, select ); + return XMVectorSelect( srgb, V, g_XMSelect1110 ); +} + 
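+//------------------------------------------------------------------------------
+// Editor's note: a minimal usage sketch, not part of the library. It assumes
+// only the conversions defined above; the function name and the 1e-5f
+// tolerance are illustrative choices. The two gamma mappings should invert
+// each other to within float precision.
+inline bool SRGBRoundTripExample()
+{
+    // Arbitrary linear-space color; w carries alpha and passes through untouched.
+    XMVECTOR lin = XMVectorSet( 0.5f, 0.25f, 0.75f, 1.0f );
+
+    XMVECTOR enc = XMColorRGBToSRGB( lin );    // linear -> gamma-encoded
+    XMVECTOR dec = XMColorSRGBToRGB( enc );    // gamma-encoded -> linear
+
+    return XMVector3NearEqual( dec, lin, XMVectorReplicate( 1e-5f ) );
+}
+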
+/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; + __cpuid(CPUInfo, 0); + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + + __cpuid(CPUInfo, 1); + +#ifdef __AVX2__ + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#ifdef __AVX2__ + __cpuidex(CPUInfo, 7, 0); + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on ARM platform + return true; +#else + // No intrinsics path always supported + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle); + G = _mm_sub_ps(G,g_XMOne); + vTemp = _mm_add_ps(vTemp,G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is 
needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G,vTemp); + G = _mm_max_ps(G,vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC); + vTemp = _mm_mul_ps(GAddC,GAddC); + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + vResult = _mm_div_ps(vResult,vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC,CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC,CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC,g_XMOne); + GSubC = _mm_add_ps(GSubC,g_XMOne); + GAddC = _mm_mul_ps(GAddC,GAddC); + GSubC = _mm_mul_ps(GSubC,GSubC); + GAddC = _mm_div_ps(GAddC,GSubC); + GAddC = _mm_add_ps(GAddC,g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult,GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle +( + float Angle +) +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + + // Normalize the range from 0.0f to XM_2PI + Angle = Angle + XM_PI; + // Perform the modulo, unsigned + float fTemp = fabsf(Angle); + fTemp = fTemp - (XM_2PI * (float)((int32_t)(fTemp/XM_2PI))); + // Restore the number to the range of -XM_PI to XM_PI-epsilon + fTemp = fTemp - XM_PI; + // If the modulo'd value was negative, restore negation + if (Angle<0.0f) { + fTemp = -fTemp; + } + return fTemp; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarSin +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + if (y > XM_PIDIV2) + { + y = XM_PI - y; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + } + + // 11-degree minimax approximation + float y2 = y * y; + return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarSinEst +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). 
+ if (y > XM_PIDIV2) + { + y = XM_PI - y; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + } + + // 7-degree minimax approximation + float y2 = y * y; + return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarCos +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 10-degree minimax approximation + float y2 = y*y; + float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; + return sign*p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarCosEst +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 6-degree minimax approximation + float y2 = y * y; + float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; + return sign*p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCos +( + float* pSin, + float* pCos, + float Value +) +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 11-degree minimax approximation + *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; + + // 10-degree minimax approximation + float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; + *pCos = sign*p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCosEst +( + float* pSin, + float* pCos, + float Value +) +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. 
+ float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 7-degree minimax approximation + *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; + + // 6-degree minimax approximation + float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; + *pCos = sign*p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASin +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASinEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? 
result : XM_PI - result); +} + diff --git a/Inc/DirectXMathVector.inl b/Inc/DirectXMathVector.inl index 53a7c4e..d417c7c 100644 --- a/Inc/DirectXMathVector.inl +++ b/Inc/DirectXMathVector.inl @@ -1,14453 +1,14453 @@ -//------------------------------------------------------------------------------------- -// DirectXMathVector.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#if defined(_XM_NO_INTRINSICS_) -#define XMISNAN(x) ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0) -#define XMISINF(x) ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000) -#endif - -#if defined(_XM_SSE_INTRINSICS_) - -#define XM3UNPACK3INTO4(l1,l2,l3) \ - XMVECTOR V3 = _mm_shuffle_ps(l2,l3,_MM_SHUFFLE(0,0,3,2));\ - XMVECTOR V2 = _mm_shuffle_ps(l2,l1,_MM_SHUFFLE(3,3,1,0));\ - V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,1,0,2));\ - XMVECTOR V4 = _mm_castsi128_ps( _mm_srli_si128(_mm_castps_si128(L3),32/8) ); - -#define XM3PACK4INTO3(v2x) \ - v2x = _mm_shuffle_ps(V2,V3,_MM_SHUFFLE(1,0,2,1));\ - V2 = _mm_shuffle_ps(V2,V1,_MM_SHUFFLE(2,2,0,0));\ - V1 = _mm_shuffle_ps(V1,V2,_MM_SHUFFLE(0,2,1,0));\ - V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(0,0,2,2));\ - V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(2,1,2,0));\ - -#endif - -/**************************************************************************** - * - * General Vector - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Assignment operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -// Return a vector with all elements equaling zero -inline XMVECTOR XM_CALLCONV XMVectorZero() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f}; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_f32(0); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_setzero_ps(); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with four floating point values -inline XMVECTOR XM_CALLCONV XMVectorSet -( - float x, - float y, - float z, - float w -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = {x,y,z,w}; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32)); - float32x2_t V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32)); - return vcombine_f32(V0, V1); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_set_ps( w, z, y, x ); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with four integer values -inline XMVECTOR XM_CALLCONV XMVectorSetInt -( - uint32_t x, - uint32_t y, - uint32_t z, - uint32_t w -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORU32 vResult = {x,y,z,w}; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t V0 = 
vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32)); - uint32x2_t V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32)); - return vcombine_u32(V0, V1); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_set_epi32( w, z, y, x ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated floating point value -inline XMVECTOR XM_CALLCONV XMVectorReplicate -( - float Value -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = Value; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_f32( Value ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_set_ps1( Value ); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr -( - const float *pValue -) -{ -#if defined(_XM_NO_INTRINSICS_) - float Value = pValue[0]; - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = Value; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_dup_f32( pValue ); -#elif defined(_XM_AVX_INTRINSICS_) - return _mm_broadcast_ss( pValue ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ps1( pValue ); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated integer value -inline XMVECTOR XM_CALLCONV XMVectorReplicateInt -( - uint32_t Value -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORU32 vResult; - vResult.u[0] = - vResult.u[1] = - vResult.u[2] = - vResult.u[3] = Value; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32( Value ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_set1_epi32( Value ); - return _mm_castsi128_ps(vTemp); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr -( - const uint32_t *pValue -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t Value = pValue[0]; - XMVECTORU32 vResult; - vResult.u[0] = - vResult.u[1] = - vResult.u[2] = - vResult.u[3] = Value; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_dup_u32(pValue); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ps1(reinterpret_cast(pValue)); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with all bits set (true mask) -inline XMVECTOR XM_CALLCONV XMVectorTrueInt() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU}; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_s32(-1); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_set1_epi32(-1); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with all bits clear (false mask) -inline XMVECTOR XM_CALLCONV XMVectorFalseInt() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f}; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return 
vdupq_n_u32(0); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_setzero_ps(); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the x component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatX -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[0]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_low_f32( V ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the y component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatY -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[1]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_low_f32( V ), 1 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the z component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatZ -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[2]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_high_f32( V ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the w component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatW -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[3]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_high_f32( V ), 1 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of 1.0f,1.0f,1.0f,1.0f -inline XMVECTOR XM_CALLCONV XMVectorSplatOne() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = 1.0f; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_f32(1.0f); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMOne; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of INF,INF,INF,INF -inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x7F800000; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x7F800000); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMInfinity; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of 
Q_NAN,Q_NAN,Q_NAN,Q_NAN -inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x7FC00000; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x7FC00000); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMQNaN; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f -inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x34000000; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x34000000); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMEpsilon; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f -inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x80000000U; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x80000000U); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_set1_epi32( 0x80000000 ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Return a floating point value via an index. This is not a recommended -// function to use due to performance loss. -inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) -{ - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[i]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return V.n128_f32[i]; -#elif defined(_XM_SSE_INTRINSICS_) - return V.m128_f32[i]; -#endif -} - -//------------------------------------------------------------------------------ -// Return the X component in an FPU register. -inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[0]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 0); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cvtss_f32(V); -#endif -} - -// Return the Y component in an FPU register. -inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[1]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 1); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - return _mm_cvtss_f32(vTemp); -#endif -} - -// Return the Z component in an FPU register. -inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[2]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 2); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - return _mm_cvtss_f32(vTemp); -#endif -} - -// Return the W component in an FPU register. 
-inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[3]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 3); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); - return _mm_cvtss_f32(vTemp); -#endif -} - -//------------------------------------------------------------------------------ - -// Store a component indexed by i into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i) -{ - assert( f != nullptr ); - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - *f = V.vector4_f32[i]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - *f = V.n128_f32[i]; -#elif defined(_XM_SSE_INTRINSICS_) - *f = V.m128_f32[i]; -#endif -} - -//------------------------------------------------------------------------------ - -// Store the X component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetXPtr(float *x, FXMVECTOR V) -{ - assert( x != nullptr); -#if defined(_XM_NO_INTRINSICS_) - *x = V.vector4_f32[0]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(x,V,0); -#elif defined(_XM_SSE_INTRINSICS_) - _mm_store_ss(x,V); -#endif -} - -// Store the Y component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V) -{ - assert( y != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - *y = V.vector4_f32[1]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(y,V,1); -#elif defined(_XM_SSE4_INTRINSICS_) - *((int*)y) = _mm_extract_ps( V, 1 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - _mm_store_ss(y,vResult); -#endif -} - -// Store the Z component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V) -{ - assert( z != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - *z = V.vector4_f32[2]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(z,V,2); -#elif defined(_XM_SSE4_INTRINSICS_) - *((int*)z) = _mm_extract_ps( V, 2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - _mm_store_ss(z,vResult); -#endif -} - -// Store the W component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V) -{ - assert( w != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - *w = V.vector4_f32[3]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(w,V,3); -#elif defined(_XM_SSE4_INTRINSICS_) - *((int*)w) = _mm_extract_ps( V, 3 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); - _mm_store_ss(w,vResult); -#endif -} - -//------------------------------------------------------------------------------ - -// Return an integer value via an index. This is not a recommended -// function to use due to performance loss. -inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) -{ - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_u32[i]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return V.n128_u32[i]; -#elif defined(_XM_SSE_INTRINSICS_) - return V.m128_u32[i]; -#endif -} - -//------------------------------------------------------------------------------ - -// Return the X component in an integer register. 
-inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
-#endif
-}
-
-// Return the Y component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-// Return the Z component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-// Return the W component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store a component indexed by i into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
-{
-    assert( x != nullptr );
-    assert( i < 4 );
-    _Analysis_assume_( i < 4 );
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_u32[i];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    *x = V.n128_u32[i];
-#elif defined(_XM_SSE_INTRINSICS_)
-    *x = V.m128_u32[i];
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store the X component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
-{
-    assert( x != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_u32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss(reinterpret_cast<float*>(x),V);
-#endif
-}
-
-// Store the Y component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
-{
-    assert( y != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *y = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    _mm_store_ss(reinterpret_cast<float*>(y),vResult);
-#endif
-}
-
-// Store the Z component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
-{
-    assert( z != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *z = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss(reinterpret_cast<float*>(z),vResult);
-#endif
-}
-
-// Store the W component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
-{
-    assert( w != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *w = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(reinterpret_cast<float*>(w),vResult);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Set a single indexed floating point component
-inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
-{
-    assert( i < 4 );
-    _Analysis_assume_( i < 4 );
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR U;
-    U = V;
-    U.vector4_f32[i] = f;
-    return U;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR U = V;
-    U.n128_f32[i] = f;
-    return U;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR U = V;
-    U.m128_f32[i] = f;
-    return U;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Sets the X component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR U;
-    U.vector4_f32[0] = x;
-    U.vector4_f32[1] = V.vector4_f32[1];
-    U.vector4_f32[2] = V.vector4_f32[2];
-    U.vector4_f32[3] = V.vector4_f32[3];
-    return U;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(x,V,0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(x);
-    vResult = _mm_move_ss(V,vResult);
-    return vResult;
-#endif
-}
-
-// Sets the Y component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR U;
-    U.vector4_f32[0] = V.vector4_f32[0];
-    U.vector4_f32[1] = y;
-    U.vector4_f32[2] = V.vector4_f32[2];
-    U.vector4_f32[3] = V.vector4_f32[3];
-    return U;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(y,V,1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(y);
-    vResult = _mm_insert_ps( V, vResult, 0x10 );
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap
y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - XMVECTOR vTemp = _mm_set_ss(y); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} -// Sets the Z component of a vector to a passed floating point value -inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = z; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_f32(z,V,2); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vResult = _mm_set_ss(z); - vResult = _mm_insert_ps( V, vResult, 0x20 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - XMVECTOR vTemp = _mm_set_ss(z); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to a passed floating point value -inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_f32(w,V,3); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vResult = _mm_set_ss(w); - vResult = _mm_insert_ps( V, vResult, 0x30 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - XMVECTOR vTemp = _mm_set_ss(w); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap w and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets a component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i) -{ - assert( f != nullptr ); - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U = V; - U.vector4_f32[i] = *f; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR U = V; - U.n128_f32[i] = *f; - return U; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR U = V; - U.m128_f32[i] = *f; - return U; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets the X component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float *x) -{ - assert( x != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = *x; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(x,V,0); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_load_ss(x); - vResult = _mm_move_ss(V,vResult); - return vResult; -#endif -} - -// Sets the Y component of a vector to a floating point value passed by pointer 
-_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float *y) -{ - assert( y != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = *y; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(y,V,1); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(y); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} - -// Sets the Z component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float *z) -{ - assert( z != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = *z; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(z,V,2); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(z); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float *w) -{ - assert( w != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = *w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(w,V,3); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(w); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap w and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets a component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) -{ - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U = V; - U.vector4_u32[i] = x; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = x; - return tmp; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = x; - return tmp; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets the X component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = x; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(x,V,0); -#elif 
defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cvtsi32_si128(x); - XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp)); - return vResult; -#endif -} - -// Sets the Y component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = y; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(y,V,1); -#elif defined(_XM_SSE4_INTRINSICS_) - __m128i vResult = _mm_castps_si128( V ); - vResult = _mm_insert_epi32( vResult, static_cast(y), 1 ); - return _mm_castsi128_ps( vResult ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - __m128i vTemp = _mm_cvtsi32_si128(y); - // Replace the x component - vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} - -// Sets the Z component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = z; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(z,V,2); -#elif defined(_XM_SSE4_INTRINSICS_) - __m128i vResult = _mm_castps_si128( V ); - vResult = _mm_insert_epi32( vResult, static_cast(z), 2 ); - return _mm_castsi128_ps( vResult ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - __m128i vTemp = _mm_cvtsi32_si128(z); - // Replace the x component - vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(w,V,3); -#elif defined(_XM_SSE4_INTRINSICS_) - __m128i vResult = _mm_castps_si128( V ); - vResult = _mm_insert_epi32( vResult, static_cast(w), 3 ); - return _mm_castsi128_ps( vResult ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - __m128i vTemp = _mm_cvtsi32_si128(w); - // Replace the x component - vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); - // Swap w and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets a component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i) -{ - assert( x != nullptr ); - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U = V; - U.vector4_u32[i] = *x; - return U; -#elif 
defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = *x; - return tmp; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = *x; - return tmp; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets the X component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x) -{ - assert( x != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = *x; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(x,*reinterpret_cast(&V),0); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(x)); - XMVECTOR vResult = _mm_move_ss(V,vTemp); - return vResult; -#endif -} - -// Sets the Y component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y) -{ - assert( y != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = *y; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(y,*reinterpret_cast(&V),1); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(y)); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} - -// Sets the Z component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z) -{ - assert( z != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = *z; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(z,*reinterpret_cast(&V),2); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(z)); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w) -{ - assert( w != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = *w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(w,*reinterpret_cast(&V),3); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(w)); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap w and x again - vResult = 
XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSwizzle -( - FXMVECTOR V, - uint32_t E0, - uint32_t E1, - uint32_t E2, - uint32_t E3 -) -{ - assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); - _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result = { V.vector4_f32[E0], - V.vector4_f32[E1], - V.vector4_f32[E2], - V.vector4_f32[E3] }; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const uint32_t ControlElement[ 4 ] = - { - 0x03020100, // XM_SWIZZLE_X - 0x07060504, // XM_SWIZZLE_Y - 0x0B0A0908, // XM_SWIZZLE_Z - 0x0F0E0D0C, // XM_SWIZZLE_W - }; - - int8x8x2_t tbl; - tbl.val[0] = vget_low_f32(V); - tbl.val[1] = vget_high_f32(V); - - uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) ); - const uint8x8_t rL = vtbl2_u8( tbl, idx ); - - idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) ); - const uint8x8_t rH = vtbl2_u8( tbl, idx ); - - return vcombine_f32( rL, rH ); -#elif defined(_XM_AVX_INTRINSICS_) - unsigned int elem[4] = { E0, E1, E2, E3 }; - __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) ); - return _mm_permutevar_ps( V, vControl ); -#else - const uint32_t *aPtr = (const uint32_t* )(&V); - - XMVECTOR Result; - uint32_t *pWork = (uint32_t*)(&Result); - - pWork[0] = aPtr[E0]; - pWork[1] = aPtr[E1]; - pWork[2] = aPtr[E2]; - pWork[3] = aPtr[E3]; - - return Result; -#endif -} - -//------------------------------------------------------------------------------ -inline XMVECTOR XM_CALLCONV XMVectorPermute -( - FXMVECTOR V1, - FXMVECTOR V2, - uint32_t PermuteX, - uint32_t PermuteY, - uint32_t PermuteZ, - uint32_t PermuteW -) -{ - assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); - _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); - -#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - static const uint32_t ControlElement[ 8 ] = - { - 0x03020100, // XM_PERMUTE_0X - 0x07060504, // XM_PERMUTE_0Y - 0x0B0A0908, // XM_PERMUTE_0Z - 0x0F0E0D0C, // XM_PERMUTE_0W - 0x13121110, // XM_PERMUTE_1X - 0x17161514, // XM_PERMUTE_1Y - 0x1B1A1918, // XM_PERMUTE_1Z - 0x1F1E1D1C, // XM_PERMUTE_1W - }; - - int8x8x4_t tbl; - tbl.val[0] = vget_low_f32(V1); - tbl.val[1] = vget_high_f32(V1); - tbl.val[2] = vget_low_f32(V2); - tbl.val[3] = vget_high_f32(V2); - - uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) ); - const uint8x8_t rL = vtbl4_u8( tbl, idx ); - - idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) ); - const uint8x8_t rH = vtbl4_u8( tbl, idx ); - - return vcombine_f32( rL, rH ); -#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - static const XMVECTORU32 three = { 3, 3, 3, 3 }; - - __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; - __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) ); - - __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); - vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) ); - - __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); - __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); - - __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), 
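/*
 [Editor's note - illustrative addition, not part of the original source.]
 In this AVX path each permute index 0-7 encodes a source and a lane: indices
 0-3 read V1 and 4-7 read V2, with the lane given by (index & 3). Both inputs
 are permuted by the masked lane index, and the (index > 3) mask decides which
 shuffled result survives the merge. For example, a hypothetical request
 { 0, 5, 2, 7 } produces { V1.x, V2.y, V1.z, V2.w }.
*/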
shuffled1 ); - __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); - - return _mm_or_ps( masked1, masked2 ); -#else - - const uint32_t *aPtr[2]; - aPtr[0] = (const uint32_t* )(&V1); - aPtr[1] = (const uint32_t* )(&V2); - - XMVECTOR Result; - uint32_t *pWork = (uint32_t*)(&Result); - - const uint32_t i0 = PermuteX & 3; - const uint32_t vi0 = PermuteX >> 2; - pWork[0] = aPtr[vi0][i0]; - - const uint32_t i1 = PermuteY & 3; - const uint32_t vi1 = PermuteY >> 2; - pWork[1] = aPtr[vi1][i1]; - - const uint32_t i2 = PermuteZ & 3; - const uint32_t vi2 = PermuteZ >> 2; - pWork[2] = aPtr[vi2][i2]; - - const uint32_t i3 = PermuteW & 3; - const uint32_t vi3 = PermuteW >> 2; - pWork[3] = aPtr[vi3][i3]; - - return Result; -#endif -} - -//------------------------------------------------------------------------------ -// Define a control vector to be used in XMVectorSelect -// operations. The four integers specified in XMVectorSelectControl -// serve as indices to select between components in two vectors. -// The first index controls selection for the first component of -// the vectors involved in a select operation, the second index -// controls selection for the second component etc. A value of -// zero for an index causes the corresponding component from the first -// vector to be selected whereas a one causes the component from the -// second vector to be selected instead. - -inline XMVECTOR XM_CALLCONV XMVectorSelectControl -( - uint32_t VectorIndex0, - uint32_t VectorIndex1, - uint32_t VectorIndex2, - uint32_t VectorIndex3 -) -{ -#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - // x=Index0,y=Index1,z=Index2,w=Index3 - __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); - // Any non-zero entries become 0xFFFFFFFF else 0 - vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); - return _mm_castsi128_ps(vTemp); -#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - int32x2_t V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); - int32x2_t V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); - int32x4_t vTemp = vcombine_s32(V0, V1); - // Any non-zero entries become 0xFFFFFFFF else 0 - return vcgtq_s32(vTemp,g_XMZero); -#else - XMVECTOR ControlVector; - const uint32_t ControlElement[] = - { - XM_SELECT_0, - XM_SELECT_1 - }; - - assert(VectorIndex0 < 2); - assert(VectorIndex1 < 2); - assert(VectorIndex2 < 2); - assert(VectorIndex3 < 2); - _Analysis_assume_(VectorIndex0 < 2); - _Analysis_assume_(VectorIndex1 < 2); - _Analysis_assume_(VectorIndex2 < 2); - _Analysis_assume_(VectorIndex3 < 2); - - ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; - ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; - ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; - ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; - - return ControlVector; - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSelect -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Control -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); - Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); - Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); - 
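/*
 [Editor's note - illustrative addition, not part of the original source.]
 A sketch of the XMVectorSelectControl contract documented above, using only
 public DirectXMath calls and hypothetical inputs v1, v2:

     XMVECTOR c = XMVectorSelectControl(0, 1, 0, 1);
     XMVECTOR r = XMVectorSelect(v1, v2, c);
     // r = { v1.x, v2.y, v1.z, v2.w }
*/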
Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vbslq_f32( Control, V2, V1 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); - XMVECTOR vTemp2 = _mm_and_ps(V2,Control); - return _mm_or_ps(vTemp1,vTemp2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMergeXY -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0]; - Result.vector4_u32[1] = V2.vector4_u32[0]; - Result.vector4_u32[2] = V1.vector4_u32[1]; - Result.vector4_u32[3] = V2.vector4_u32[1]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vzipq_f32( V1, V2 ).val[0]; -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_unpacklo_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMergeZW -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[2]; - Result.vector4_u32[1] = V2.vector4_u32[2]; - Result.vector4_u32[2] = V1.vector4_u32[3]; - Result.vector4_u32[3] = V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vzipq_f32( V1, V2 ).val[1]; -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_unpackhi_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) -{ - assert( Elements < 4 ); - _Analysis_assume_( Elements < 4 ); - return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) -{ - assert( Elements < 4 ); - _Analysis_assume_( Elements < 4 ); - return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) -{ - assert( Elements < 4 ); - _Analysis_assume_( Elements < 4 ); - return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, - uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) -{ - XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); - return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); -} - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 
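/*
 [Editor's note - illustrative addition, not part of the original source.]
 The comparison functions in this section return per-lane masks (all ones or
 all zeros) rather than bools, so their results feed directly into bitwise
 selection; e.g. a branchless per-lane maximum for hypothetical a, b:

     XMVECTOR mask = XMVectorGreater(a, b);
     XMVECTOR m    = XMVectorSelect(b, a, mask); // a where a > b, else b
*/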
0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vceqq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpeq_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorEqualR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are equal - CR = XM_CRMASK_CR6TRUE; - } - else if (!(ux|uy|uz|uw)) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are equal - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -// Treat the components of the vectors as unsigned integers and -// compare individual bits between the two. This is useful for -// comparing control vectors and result vectors returned from -// other comparison operations. - -inline XMVECTOR XM_CALLCONV XMVectorEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 
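/*
 [Editor's note - illustrative addition, not part of the original source.]
 The ...R variants additionally report a CR6-style summary through pCR, which
 DirectXMath's XMComparisonAllTrue / XMComparisonAnyFalse helpers decode:

     uint32_t cr;
     XMVECTOR mask = XMVectorEqualR(&cr, a, b); // hypothetical a, b
     if (XMComparisonAllTrue(cr))
     {
         // all four lanes compared equal
     }
*/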
0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vceqq_u32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorEqualIntR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control = XMVectorEqualInt(V1, V2); - - *pCR = 0; - if (XMVector4EqualInt(Control, XMVectorTrueInt())) - { - // All elements are equal - *pCR |= XM_CRMASK_CR6TRUE; - } - else if (XMVector4EqualInt(Control, XMVectorFalseInt())) - { - // All elements are not equal - *pCR |= XM_CRMASK_CR6FALSE; - } - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are equal - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); - int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); - uint32_t CR = 0; - if (iTemp==0x0F) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTemp) - { - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNearEqual -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Epsilon -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; - float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; - float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; - float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; - - fDeltax = fabsf(fDeltax); - fDeltay = fabsf(fDeltay); - fDeltaz = fabsf(fDeltaz); - fDeltaw = fabsf(fDeltaw); - - XMVECTOR Control; - Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vDelta = vsubq_f32(V1,V2); - return vacleq_f32( vDelta, Epsilon ); -#elif defined(_XM_SSE_INTRINSICS_) - // Get the difference - XMVECTOR vDelta = _mm_sub_ps(V1,V2); - // Get the absolute value of the difference - XMVECTOR vTemp = _mm_setzero_ps(); - vTemp = _mm_sub_ps(vTemp,vDelta); - vTemp = _mm_max_ps(vTemp,vDelta); - vTemp = _mm_cmple_ps(vTemp,Epsilon); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNotEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 
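/*
 [Editor's note - illustrative addition, not part of the original source.]
 XMVectorNearEqual above forms |V1 - V2| without an absolute-value
 instruction by taking max(0 - delta, delta). A typical per-lane tolerance
 test, with hypothetical inputs a, b:

     XMVECTOR close = XMVectorNearEqual(a, b, XMVectorReplicate(1e-4f));
*/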
0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmvnq_u32(vceqq_f32(V1, V2)); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpneq_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmvnq_u32(vceqq_u32(V1, V2)); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); - return _mm_xor_ps(_mm_castsi128_ps(V),g_XMNegOneMask); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorGreater -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcgtq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpgt_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorGreaterR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
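/*
 [Editor's note - illustrative addition, not part of the original source.]
 SSE2 has no integer not-equal compare, so XMVectorNotEqualInt above
 synthesizes one: compare for equality, then invert by XORing with the
 all-ones constant g_XMNegOneMask - the vector equivalent of ~x:

     __m128i eq  = _mm_cmpeq_epi32(_mm_castps_si128(a), _mm_castps_si128(b));
     XMVECTOR ne = _mm_xor_ps(_mm_castsi128_ps(eq), g_XMNegOneMask);
*/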
0xFFFFFFFFU : 0; - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are greater - CR = XM_CRMASK_CR6TRUE; - } - else if (!(ux|uy|uz|uw)) - { - // All elements are not greater - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are greater - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not greater - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - // All elements are not greater - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcgeq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpge_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are greater or equal - CR = XM_CRMASK_CR6TRUE; - } - else if (!(ux|uy|uz|uw)) - { - // All elements are not greater or equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are greater or equal - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not greater or equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - // All elements are not greater or equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLess -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcltq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmplt_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcleq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmple_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorInBounds -( - FXMVECTOR V, - FXMVECTOR Bounds -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
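/*
 [Editor's note - illustrative addition, not part of the original source.]
 Worked example of the bounds test computed here: with V = (1, -3, 2, 0) and
 Bounds = (2, 2, 2, 2), only lane 1 fails (-3 < -2), so the result mask is
 (0xFFFFFFFF, 0, 0xFFFFFFFF, 0xFFFFFFFF).
*/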
0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - vTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - vTemp1 = vandq_u32(vTemp1,vTemp2); - return vTemp1; -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - return vTemp1; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorInBoundsR -( - uint32_t* pCR, - FXMVECTOR V, - FXMVECTOR Bounds -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are in bounds - CR = XM_CRMASK_CR6BOUNDS; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - vTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - vTemp1 = vandq_u32(vTemp1,vTemp2); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are in bounds - CR = XM_CRMASK_CR6BOUNDS; - } - *pCR = CR; - return vTemp1; -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - - uint32_t CR = 0; - if (_mm_movemask_ps(vTemp1)==0xf) { - // All elements are in bounds - CR = XM_CRMASK_CR6BOUNDS; - } - *pCR = CR; - return vTemp1; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorIsNaN -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test against itself. 
NaN is always not equal - uint32x4_t vTempNan = vceqq_f32( V, V ); - // Flip results - return vmvnq_u32( vTempNan ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test against itself. NaN is always not equal - return _mm_cmpneq_ps(V,V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorIsInfinite -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bit - uint32x4_t vTemp = vandq_u32(V,g_XMAbsMask); - // Compare to infinity - vTemp = vceqq_f32(vTemp,g_XMInfinity); - // If any are infinity, the signs are true. - return vTemp; -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bit - __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); - // Compare to infinity - vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); - // If any are infinity, the signs are true. - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -// Rounding and clamping operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMin -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; - Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; - Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; - Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vminq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_min_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMax -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; - Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; - Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; - Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
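/*
 [Editor's note - illustrative addition, not part of the original source.]
 The NaN test above exploits IEEE-754: NaN compares unequal to everything,
 itself included, so cmpneq(V, V) is all ones exactly in NaN lanes - e.g.
 XMVectorIsNaN(XMVectorSplatQNaN()) yields a fully set mask. The infinity
 test clears the sign bit first so that +INF and -INF both match
 g_XMInfinity.
*/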
V1.vector4_f32[3] : V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmaxq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_max_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -#ifdef _XM_NO_ROUNDF_ - -namespace Internal -{ - inline float round_to_nearest( float x ) - { - float i = floorf(x); - x -= i; - if(x < 0.5f) - return i; - if(x > 0.5f) - return i + 1.f; - - float int_part; - modff( i / 2.f, &int_part ); - if ( (2.f*int_part) == i ) - { - return i; - } - - return i + 1.f; - } -}; - -#endif - -#if !defined(_XM_NO_INTRINSICS_) -#pragma float_control(push) -#pragma float_control(precise, on) -#endif - -inline XMVECTOR XM_CALLCONV XMVectorRound -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - -#ifdef _XM_NO_ROUNDF_ - XMVECTOR Result; - Result.vector4_f32[0] = Internal::round_to_nearest( V.vector4_f32[0] ); - Result.vector4_f32[1] = Internal::round_to_nearest( V.vector4_f32[1] ); - Result.vector4_f32[2] = Internal::round_to_nearest( V.vector4_f32[2] ); - Result.vector4_f32[3] = Internal::round_to_nearest( V.vector4_f32[3] ); - return Result; -#else - XMVECTOR Result; - Result.vector4_f32[0] = roundf( V.vector4_f32[0] ); - Result.vector4_f32[1] = roundf( V.vector4_f32[1] ); - Result.vector4_f32[2] = roundf( V.vector4_f32[2] ); - Result.vector4_f32[3] = roundf( V.vector4_f32[3] ); - return Result; -#endif - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t sign = vandq_u32( V, g_XMNegativeZero ); - uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign ); - float32x4_t R1 = vaddq_f32( V, sMagic ); - R1 = vsubq_f32( R1, sMagic ); - float32x4_t R2 = vabsq_f32( V ); - uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction ); - XMVECTOR vResult = vbslq_f32( mask, R1, V ); - return vResult; -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 sign = _mm_and_ps( V, g_XMNegativeZero ); - __m128 sMagic = _mm_or_ps( g_XMNoFraction, sign ); - __m128 R1 = _mm_add_ps( V, sMagic ); - R1 = _mm_sub_ps( R1, sMagic ); - __m128 R2 = _mm_and_ps( V, g_XMAbsMask ); - __m128 mask = _mm_cmple_ps( R2, g_XMNoFraction ); - R2 = _mm_andnot_ps(mask,V); - R1 = _mm_and_ps(R1,mask); - XMVECTOR vResult = _mm_xor_ps(R1, R2); - return vResult; -#endif -} - -#if !defined(_XM_NO_INTRINSICS_) -#pragma float_control(pop) -#endif - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTruncate -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - uint32_t i; - - // Avoid C4701 - Result.vector4_f32[0] = 0.0f; - - for (i = 0; i < 4; i++) - { - if (XMISNAN(V.vector4_f32[i])) - { - Result.vector4_u32[i] = 0x7FC00000; - } - else if (fabsf(V.vector4_f32[i]) < 8388608.0f) - { - Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]); - } - else - { - Result.vector4_f32[i] = V.vector4_f32[i]; - } - } - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTest = vabsq_f32( V ); - vTest = vcltq_f32( vTest, g_XMNoFraction ); - - int32x4_t vInt = vcvtq_s32_f32( V ); - XMVECTOR vResult = vcvtq_f32_s32( vInt ); - - // All numbers less than 8388608 will use the round to int - // All others, use the ORIGINAL value - return vbslq_f32( vTest, vResult, V ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); -#elif defined(_XM_SSE_INTRINSICS_) - // To handle NAN, 
INF and numbers greater than 8388608, use masking - // Get the abs value - __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); - // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF - vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); - // Convert to int and back to float for rounding with truncation - __m128i vInt = _mm_cvttps_epi32(V); - // Convert back to floats - XMVECTOR vResult = _mm_cvtepi32_ps(vInt); - // All numbers less than 8388608 will use the round to int - vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); - // All others, use the ORIGINAL value - vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); - vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorFloor -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = floorf( V.vector4_f32[0] ); - Result.vector4_f32[1] = floorf( V.vector4_f32[1] ); - Result.vector4_f32[2] = floorf( V.vector4_f32[2] ); - Result.vector4_f32[3] = floorf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTest = vabsq_f32( V ); - vTest = vcltq_f32( vTest, g_XMNoFraction ); - // Truncate - int32x4_t vInt = vcvtq_s32_f32( V ); - XMVECTOR vResult = vcvtq_f32_s32( vInt ); - XMVECTOR vLarger = vcgtq_f32( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vLarger = vcvtq_f32_s32( vLarger ); - vResult = vaddq_f32( vResult, vLarger ); - // All numbers less than 8388608 will use the round to int - // All others, use the ORIGINAL value - return vbslq_f32( vTest, vResult, V ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_floor_ps( V ); -#elif defined(_XM_SSE_INTRINSICS_) - // To handle NAN, INF and numbers greater than 8388608, use masking - __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); - vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); - // Truncate - __m128i vInt = _mm_cvttps_epi32(V); - XMVECTOR vResult = _mm_cvtepi32_ps(vInt); - __m128 vLarger = _mm_cmpgt_ps( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vLarger = _mm_cvtepi32_ps( _mm_castps_si128( vLarger ) ); - vResult = _mm_add_ps( vResult, vLarger ); - // All numbers less than 8388608 will use the round to int - vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); - // All others, use the ORIGINAL value - vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); - vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCeiling -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = ceilf( V.vector4_f32[0] ); - Result.vector4_f32[1] = ceilf( V.vector4_f32[1] ); - Result.vector4_f32[2] = ceilf( V.vector4_f32[2] ); - Result.vector4_f32[3] = ceilf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTest = vabsq_f32( V ); - vTest = vcltq_f32( vTest, g_XMNoFraction ); - // Truncate - int32x4_t vInt = vcvtq_s32_f32( V ); - XMVECTOR vResult = vcvtq_f32_s32( vInt ); - XMVECTOR vSmaller = vcltq_f32( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vSmaller = vcvtq_f32_s32( vSmaller ); - vResult = vsubq_f32( vResult, vSmaller ); - // All numbers less than 8388608 will use the round to int - // All others, use the ORIGINAL value - return vbslq_f32( vTest, vResult, V ); -#elif 
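/*
 [Editor's note - illustrative addition, not part of the original source.]
 g_XMNoFraction is 8388608.0f = 2^23, the smallest positive float magnitude
 from which every representable value is already an integer. That is why
 Truncate, Floor, and Ceiling above round small lanes through an int
 conversion but keep the original bits for lanes at or beyond 2^23
 (including INF and NaN, per the source comments). XMVectorRound's non-SSE4
 path uses the same fact: adding then subtracting a sign-adjusted 2^23
 forces rounding at integer granularity, e.g. 2.5f + 8388608.0f evaluates
 to 8388610.0f under the default round-to-nearest-even mode, and
 subtracting gives 2.0f.
*/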
defined(_XM_SSE4_INTRINSICS_) - return _mm_ceil_ps( V ); -#elif defined(_XM_SSE_INTRINSICS_) - // To handle NAN, INF and numbers greater than 8388608, use masking - __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); - vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); - // Truncate - __m128i vInt = _mm_cvttps_epi32(V); - XMVECTOR vResult = _mm_cvtepi32_ps(vInt); - __m128 vSmaller = _mm_cmplt_ps( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vSmaller = _mm_cvtepi32_ps( _mm_castps_si128( vSmaller ) ); - vResult = _mm_sub_ps( vResult, vSmaller ); - // All numbers less than 8388608 will use the round to int - vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); - // All others, use the ORIGINAL value - vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); - vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorClamp -( - FXMVECTOR V, - FXMVECTOR Min, - FXMVECTOR Max -) -{ - assert(XMVector4LessOrEqual(Min, Max)); - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVectorMax(Min, V); - Result = XMVectorMin(Max, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vResult; - vResult = vmaxq_f32(Min,V); - vResult = vminq_f32(vResult,Max); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult; - vResult = _mm_max_ps(Min,V); - vResult = _mm_min_ps(vResult,Max); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSaturate -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - return XMVectorClamp(V, Zero, g_XMOne.v); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Set <0 to 0 - XMVECTOR vResult = vmaxq_f32(V, vdupq_n_f32(0) ); - // Set>1 to 1 - return vminq_f32(vResult, vdupq_n_f32(1.0f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Set <0 to 0 - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - // Set>1 to 1 - return _mm_min_ps(vResult,g_XMOne); -#endif -} - -//------------------------------------------------------------------------------ -// Bitwise logical operations -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAndInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vandq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_and_ps(V1,V2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAndCInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vbicq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_andnot_si128( 
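/*
 [Editor's note - illustrative addition, not part of the original source.]
 Note the argument order here: _mm_andnot_si128(a, b) computes (~a) & b, so
 V1 & ~V2 is spelled andnot(V2, V1) - the complement applies to the first
 operand only.
*/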
_mm_castps_si128(V2), _mm_castps_si128(V1) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorOrInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vorrq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNorInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]); - Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]); - Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]); - Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t Result = vorrq_u32(V1,V2); - return vbicq_u32(g_XMNegOneMask, Result); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i Result; - Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); - Result = _mm_andnot_si128( Result,g_XMNegOneMask); - return _mm_castsi128_ps(Result); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorXorInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return veorq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNegate -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = -V.vector4_f32[0]; - Result.vector4_f32[1] = -V.vector4_f32[1]; - Result.vector4_f32[2] = -V.vector4_f32[2]; - Result.vector4_f32[3] = -V.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vnegq_f32(V); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR Z; - - Z = _mm_setzero_ps(); - - return _mm_sub_ps( Z, V ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAdd -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1]; - Result.vector4_f32[2] = 
V1.vector4_f32[2] + V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vaddq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_add_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSum -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = - Result.vector4_f32[1] = - Result.vector4_f32[2] = - Result.vector4_f32[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t v1 = vget_low_f32(V); - float32x2_t v2 = vget_high_f32(V); - v1 = vadd_f32(v1, v2); - v1 = vpadd_f32(v1, v1); - return vcombine_f32(v1, v1); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vTemp = _mm_hadd_ps(V, V); - return _mm_hadd_ps(vTemp,vTemp); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); - XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); - vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); - return _mm_add_ps(vTemp, vTemp2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAddAngles -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - // Add the given angles together. If the range of V1 is such - // that -Pi <= V1 < Pi and the range of V2 is such that - // -2Pi <= V2 <= 2Pi, then the range of the resulting angle - // will be -Pi <= Result < Pi. - XMVECTOR Result = XMVectorAdd(V1, V2); - - XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); - XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); - - Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); - Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); - - Result = XMVectorAdd(Result, Offset); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = vaddq_f32(V1,V2); - // Less than Pi? - uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = vaddq_f32(vResult,vOffset); - // Greater than or equal to Pi? - vOffset = vcgeq_f32(vResult,g_XMPi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = vsubq_f32(vResult,vOffset); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = _mm_add_ps(V1,V2); - // Less than Pi? - XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = _mm_add_ps(vResult,vOffset); - // Greater than or equal to Pi? 
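/*
 [Editor's note - illustrative addition, not part of the original source.]
 Worked example of the wrap performed here: lanes V1 = 3.0f and V2 = 1.0f
 sum to 4.0f, which is >= XM_PI, so 2*pi is subtracted and the lane becomes
 about -2.2832f, back inside the documented [-pi, pi) range.
*/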
- vOffset = _mm_cmpge_ps(vResult,g_XMPi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = _mm_sub_ps(vResult,vOffset); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSubtract -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsubq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_sub_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - // Subtract the given angles. If the range of V1 is such - // that -Pi <= V1 < Pi and the range of V2 is such that - // -2Pi <= V2 <= 2Pi, then the range of the resulting angle - // will be -Pi <= Result < Pi. - XMVECTOR Result = XMVectorSubtract(V1, V2); - - XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); - XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); - - Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); - Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); - - Result = XMVectorAdd(Result, Offset); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = vsubq_f32(V1,V2); - // Less than Pi? - uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = vaddq_f32(vResult,vOffset); - // Greater than or equal to Pi? - vOffset = vcgeq_f32(vResult,g_XMPi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = vsubq_f32(vResult,vOffset); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = _mm_sub_ps(V1,V2); - // Less than Pi? - XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = _mm_add_ps(vResult,vOffset); - // Greater than or equal to Pi? 
- vOffset = _mm_cmpge_ps(vResult,g_XMPi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = _mm_sub_ps(vResult,vOffset); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMultiply -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmulq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_mul_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR V3 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmlaq_f32( V3, V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_mul_ps( V1, V2 ); - return _mm_add_ps(vResult, V3 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorDivide -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(V2); - float32x4_t S = vrecpsq_f32( Reciprocal, V2 ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, V2 ); - Reciprocal = vmulq_f32( S, Reciprocal ); - return vmulq_f32( V1, Reciprocal ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_div_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR V3 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]); - Result.vector4_f32[1] = V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]); - Result.vector4_f32[2] = V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]); - Result.vector4_f32[3] = V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmlsq_f32( V3, V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR R = _mm_mul_ps( V1, V2 ); - return _mm_sub_ps( V3, R ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorScale -( - FXMVECTOR V, - float 
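/*
 [Editor's note - illustrative addition, not part of the original source.]
 In XMVectorDivide's NEON path above, vrecpsq_f32(x, d) returns (2 - x*d),
 so Reciprocal = vrecps(Reciprocal, V2) * Reciprocal is one Newton-Raphson
 step for 1/V2. Each step roughly doubles the correct bits of the ~8-bit
 vrecpe estimate, hence the two iterations before the final multiply by V1.
*/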
ScaleFactor -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V.vector4_f32[0] * ScaleFactor; - Result.vector4_f32[1] = V.vector4_f32[1] * ScaleFactor; - Result.vector4_f32[2] = V.vector4_f32[2] * ScaleFactor; - Result.vector4_f32[3] = V.vector4_f32[3] * ScaleFactor; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmulq_n_f32( V, ScaleFactor ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_set_ps1(ScaleFactor); - return _mm_mul_ps(vResult,V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; - Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; - Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; - Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vrecpeq_f32(V); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_rcp_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocal -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; - Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; - Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; - Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement - float32x4_t Reciprocal = vrecpeq_f32(V); - float32x4_t S = vrecpsq_f32( Reciprocal, V ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, V ); - return vmulq_f32( S, Reciprocal ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_div_ps(g_XMOne,V); -#endif -} - -//------------------------------------------------------------------------------ -// Return an estimated square root -inline XMVECTOR XM_CALLCONV XMVectorSqrtEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 1 iteration of Newton-Raphson refinement of sqrt - float32x4_t S0 = vrsqrteq_f32(V); - float32x4_t P0 = vmulq_f32( V, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - float32x4_t S1 = vmulq_f32( S0, R0 ); - - XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); - XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); - XMVECTOR Result = vmulq_f32( V, S1 ); - XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); - return XMVectorSelect(V, Result, Select); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_sqrt_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSqrt -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 3 iterations of Newton-Raphson refinement of sqrt - float32x4_t S0 = vrsqrteq_f32(V); - float32x4_t P0 = 
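/*
 [Editor's note - illustrative addition, not part of the original source.]
 The refinement loops here rely on vrsqrtsq_f32(p, s) = (3 - p*s) / 2 with
 p = V*s, giving s' = s * (3 - V*s*s) / 2: the Newton-Raphson step for
 1/sqrt(V). The final vmulq_f32(V, Sn) converts the reciprocal square root
 into sqrt(V), and the explicit zero/infinity selects restore sqrt(0) = 0
 and sqrt(INF) = INF, which the multiply alone would turn into NaN (0 * INF).
*/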
vmulq_f32( V, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - float32x4_t S1 = vmulq_f32( S0, R0 ); - float32x4_t P1 = vmulq_f32( V, S1 ); - float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); - float32x4_t S2 = vmulq_f32( S1, R1 ); - float32x4_t P2 = vmulq_f32( V, S2 ); - float32x4_t R2 = vrsqrtsq_f32( P2, S2 ); - float32x4_t S3 = vmulq_f32( S2, R2 ); - - XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); - XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); - XMVECTOR Result = vmulq_f32( V, S3 ); - XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); - return XMVectorSelect(V, Result, Select); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_sqrt_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vrsqrteq_f32(V); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_rsqrt_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t S0 = vrsqrteq_f32(V); - - float32x4_t P0 = vmulq_f32( V, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - - float32x4_t S1 = vmulq_f32( S0, R0 ); - float32x4_t P1 = vmulq_f32( V, S1 ); - float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); - - return vmulq_f32( S1, R1 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_sqrt_ps(V); - vResult = _mm_div_ps(g_XMOne,vResult); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorExp2 -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); - Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); - Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); - Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t itrunc = vcvtq_s32_f32(V); - float32x4_t ftrunc = vcvtq_f32_s32(itrunc); - float32x4_t y = vsubq_f32(V, ftrunc); - - float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); - poly = vmlaq_f32( g_XMExpEst5, poly, y ); - poly = vmlaq_f32( g_XMExpEst4, poly, y ); - poly = vmlaq_f32( g_XMExpEst3, poly, y ); - poly = vmlaq_f32( g_XMExpEst2, poly, y ); - poly = vmlaq_f32( g_XMExpEst1, poly, y ); - poly = vmlaq_f32( g_XMOne, poly, y ); - - int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); - biased = vshlq_n_s32(biased, 23); - float32x4_t result0 = XMVectorDivide(biased, poly); - - biased = vaddq_s32(itrunc, g_XM253); - biased = vshlq_n_s32(biased, 23); - float32x4_t result1 = XMVectorDivide(biased, poly); - result1 = vmulq_f32(g_XMMinNormal.v, 
result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - int32x4_t comp = vcltq_s32( V, g_XMBin128); - float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); - - comp = vcltq_s32(itrunc, g_XMSubnormalExponent); - float32x4_t result3 = vbslq_f32( comp, result1, result0 ); - - comp = vcltq_s32(V, g_XMBinNeg150); - float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); - - int32x4_t sign = vandq_s32(V, g_XMNegativeZero); - comp = vceqq_s32(sign, g_XMNegativeZero); - float32x4_t result5 = vbslq_f32( comp, result4, result2 ); - - int32x4_t t0 = vandq_s32(V, g_XMQNaNTest); - int32x4_t t1 = vandq_s32(V, g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i itrunc = _mm_cvttps_epi32(V); - __m128 ftrunc = _mm_cvtepi32_ps(itrunc); - __m128 y = _mm_sub_ps(V, ftrunc); - __m128 poly = _mm_mul_ps(g_XMExpEst7, y); - poly = _mm_add_ps(g_XMExpEst6, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst5, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst4, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst3, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst2, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst1, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMOne, poly); - - __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); - biased = _mm_slli_epi32(biased, 23); - __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - - biased = _mm_add_epi32(itrunc, g_XM253); - biased = _mm_slli_epi32(biased, 23); - __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - result1 = _mm_mul_ps(g_XMMinNormal.v, result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(V), g_XMBin128); - __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); - __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); - __m128i result2 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); - select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); - select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); - __m128i result3 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); - select0 = _mm_and_si128(comp, result3); - select1 = _mm_andnot_si128(comp, g_XMZero); - __m128i result4 = _mm_or_si128(select0, select1); - - __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); - comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); - select0 = _mm_and_si128(comp, result4); - select1 = _mm_andnot_si128(comp, result2); - __m128i result5 = _mm_or_si128(select0, select1); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - 
select1 = _mm_andnot_si128(isNaN, result5); - __m128i vResult = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(vResult); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorExpE -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = expf(V.vector4_f32[0]); - Result.vector4_f32[1] = expf(V.vector4_f32[1]); - Result.vector4_f32[2] = expf(V.vector4_f32[2]); - Result.vector4_f32[3] = expf(V.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // expE(V) = exp2(vin*log2(e)) - float32x4_t Ve = vmulq_f32(g_XMLgE, V); - - int32x4_t itrunc = vcvtq_s32_f32(Ve); - float32x4_t ftrunc = vcvtq_f32_s32(itrunc); - float32x4_t y = vsubq_f32(Ve, ftrunc); - - - float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); - poly = vmlaq_f32( g_XMExpEst5, poly, y ); - poly = vmlaq_f32( g_XMExpEst4, poly, y ); - poly = vmlaq_f32( g_XMExpEst3, poly, y ); - poly = vmlaq_f32( g_XMExpEst2, poly, y ); - poly = vmlaq_f32( g_XMExpEst1, poly, y ); - poly = vmlaq_f32( g_XMOne, poly, y ); - - int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); - biased = vshlq_n_s32(biased, 23); - float32x4_t result0 = XMVectorDivide(biased, poly); - - biased = vaddq_s32(itrunc, g_XM253); - biased = vshlq_n_s32(biased, 23); - float32x4_t result1 = XMVectorDivide(biased, poly); - result1 = vmulq_f32(g_XMMinNormal.v, result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - int32x4_t comp = vcltq_s32( Ve, g_XMBin128); - float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); - - comp = vcltq_s32(itrunc, g_XMSubnormalExponent); - float32x4_t result3 = vbslq_f32( comp, result1, result0 ); - - comp = vcltq_s32(Ve, g_XMBinNeg150); - float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); - - int32x4_t sign = vandq_s32(Ve, g_XMNegativeZero); - comp = vceqq_s32(sign, g_XMNegativeZero); - float32x4_t result5 = vbslq_f32( comp, result4, result2 ); - - int32x4_t t0 = vandq_s32(Ve, g_XMQNaNTest); - int32x4_t t1 = vandq_s32(Ve, g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // expE(V) = exp2(vin*log2(e)) - __m128 Ve = _mm_mul_ps(g_XMLgE, V); - - __m128i itrunc = _mm_cvttps_epi32(Ve); - __m128 ftrunc = _mm_cvtepi32_ps(itrunc); - __m128 y = _mm_sub_ps(Ve, ftrunc); - __m128 poly = _mm_mul_ps(g_XMExpEst7, y); - poly = _mm_add_ps(g_XMExpEst6, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst5, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst4, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst3, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst2, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst1, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMOne, poly); - - __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); - biased = _mm_slli_epi32(biased, 23); - __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - - biased = _mm_add_epi32(itrunc, g_XM253); - biased = _mm_slli_epi32(biased, 23); - __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - result1 = 
_mm_mul_ps(g_XMMinNormal.v, result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(Ve), g_XMBin128); - __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); - __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); - __m128i result2 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); - select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); - select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); - __m128i result3 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(_mm_castps_si128(Ve), g_XMBinNeg150); - select0 = _mm_and_si128(comp, result3); - select1 = _mm_andnot_si128(comp, g_XMZero); - __m128i result4 = _mm_or_si128(select0, select1); - - __m128i sign = _mm_and_si128(_mm_castps_si128(Ve), g_XMNegativeZero); - comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); - select0 = _mm_and_si128(comp, result4); - select1 = _mm_andnot_si128(comp, result2); - __m128i result5 = _mm_or_si128(select0, select1); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(Ve), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(Ve), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - select1 = _mm_andnot_si128(isNaN, result5); - __m128i vResult = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(vResult); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorExp -( - FXMVECTOR V -) -{ - return XMVectorExp2(V); -} - -//------------------------------------------------------------------------------ - -#if defined(_XM_SSE_INTRINSICS_) - -namespace Internal -{ - inline __m128i multi_sll_epi32(__m128i value, __m128i count) - { - __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); - __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r0 = _mm_sll_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r1 = _mm_sll_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r2 = _mm_sll_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r3 = _mm_sll_epi32(v, c); - - // (r0,r0,r1,r1) - __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); - // (r2,r2,r3,r3) - __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); - // (r0,r1,r2,r3) - __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); - return _mm_castps_si128(result); - } - - inline __m128i multi_srl_epi32(__m128i value, __m128i count) - { - __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); - __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r0 = _mm_srl_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); - 
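-        // Note (clarifying comment): _mm_srl_epi32 takes its shift count from
-        // the low 64 bits of the count register rather than per lane, so each
-        // lane is broadcast with _mm_shuffle_epi32 and masked down to the X
-        // component before the whole-vector shift, then the four scalar
-        // results are reassembled below.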
c = _mm_and_si128(c, g_XMMaskX); - __m128i r1 = _mm_srl_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r2 = _mm_srl_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r3 = _mm_srl_epi32(v, c); - - // (r0,r0,r1,r1) - __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); - // (r2,r2,r3,r3) - __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); - // (r0,r1,r2,r3) - __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); - return _mm_castps_si128(result); - } - - inline __m128i GetLeadingBit(const __m128i value) - { - static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; - static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; - static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; - static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; - - __m128i v = value, r, c, b, s; - - c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - r = _mm_slli_epi32(b, 4); // r = (b << 4) - v = multi_srl_epi32(v, r); // v = (v >> r) - - c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - s = _mm_slli_epi32(b, 3); // s = (b << 3) - v = multi_srl_epi32(v, s); // v = (v >> s) - r = _mm_or_si128(r, s); // r = (r | s) - - c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - s = _mm_slli_epi32(b, 2); // s = (b << 2) - v = multi_srl_epi32(v, s); // v = (v >> s) - r = _mm_or_si128(r, s); // r = (r | s) - - c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - s = _mm_slli_epi32(b, 1); // s = (b << 1) - v = multi_srl_epi32(v, s); // v = (v >> s) - r = _mm_or_si128(r, s); // r = (r | s) - - s = _mm_srli_epi32(v, 1); - r = _mm_or_si128(r, s); - return r; - } -} // namespace Internal - -#endif // _XM_SSE_INTRINSICS_ - -#if defined(_XM_ARM_NEON_INTRINSICS_) - -namespace Internal -{ - inline int32x4_t GetLeadingBit(const int32x4_t value) - { - static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; - static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; - static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; - static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; - - int32x4_t v = value, r, c, b, s; - - c = vcgtq_s32(v, g_XM0000FFFF); // c = (v > 0xFFFF) - b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) - r = vshlq_n_s32(b, 4); // r = (b << 4) - r = vnegq_s32( r ); - v = vshlq_u32( v, r ); // v = (v >> r) - - c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) - b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) - s = vshlq_n_s32(b, 3); // s = (b << 3) - s = vnegq_s32( s ); - v = vshlq_u32(v, s); // v = (v >> s) - r = vorrq_s32(r, s); // r = (r | s) - - c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) - b = vshrq_n_u32(c, 31); // b = (c ? 
1 : 0) - s = vshlq_n_s32(b, 2); // s = (b << 2) - s = vnegq_s32( s ); - v = vshlq_u32(v, s); // v = (v >> s) - r = vorrq_s32(r, s); // r = (r | s) - - c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) - b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) - s = vshlq_n_s32(b, 1); // s = (b << 1) - s = vnegq_s32( s ); - v = vshlq_u32(v, s); // v = (v >> s) - r = vorrq_s32(r, s); // r = (r | s) - - s = vshrq_n_u32(v, 1); - r = vorrq_s32(r, s); - return r; - } - -} // namespace Internal - -#endif - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLog2 -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); - - XMVECTOR Result; - Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; - Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; - Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; - Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); - int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); - int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - int32x4_t biased = vshrq_n_u32(rawBiased, 23); - int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); - int32x4_t trailingNor = trailing; - - // Compute exponent and significand for subnormals. - int32x4_t leading = Internal::GetLeadingBit(trailing); - int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); - int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); - int32x4_t trailingSub = vshlq_u32(trailing, shift); - trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); - int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); - int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); - - // Compute the approximation. 
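-    // ORing the exponent bits of 1.0f onto the trailing significand bits t
-    // builds the float 1.m in [1,2); subtracting 1.0f leaves y = m in [0,1).
-    // log2(V) then decomposes as e + log2(1+y), where log2(1+y) is
-    // approximated below as y*P(y), evaluated by Horner's rule over the
-    // g_XMLogEst7..g_XMLogEst0 coefficients.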
- int32x4_t tmp = vorrq_s32(g_XMOne, t); - float32x4_t y = vsubq_f32(tmp, g_XMOne); - - float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); - log2 = vmlaq_f32( g_XMLogEst5, log2, y ); - log2 = vmlaq_f32( g_XMLogEst4, log2, y ); - log2 = vmlaq_f32( g_XMLogEst3, log2, y ); - log2 = vmlaq_f32( g_XMLogEst2, log2, y ); - log2 = vmlaq_f32( g_XMLogEst1, log2, y ); - log2 = vmlaq_f32( g_XMLogEst0, log2, y ); - log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); - isInfinite = vceqq_s32(isInfinite, g_XMInfinity); - - int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); - int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); - int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); - - int32x4_t isZero = vandq_s32((V), g_XMAbsMask); - isZero = vceqq_s32(isZero, g_XMZero); - - int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); - int32x4_t t1 = vandq_s32((V), g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); - tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); - result = vbslq_f32(isPositive, result, tmp); - result = vbslq_f32(isNaN, g_XMQNaN, result ); - return result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - __m128i biased = _mm_srli_epi32(rawBiased, 23); - __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); - __m128i trailingNor = trailing; - - // Compute exponent and significand for subnormals. - __m128i leading = Internal::GetLeadingBit(trailing); - __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); - __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); - __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); - trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); - - __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); - __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); - __m128i e = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isExponentZero, trailingSub); - select1 = _mm_andnot_si128(isExponentZero, trailingNor); - __m128i t = _mm_or_si128(select0, select1); - - // Compute the approximation. 
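-    // Same scheme as the NEON path above: rebuild 1.m in [1,2), take
-    // y = m in [0,1), approximate log2(1+y) ~= y*P(y), and add the
-    // exponent e back in at the end.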
- __m128i tmp = _mm_or_si128(g_XMOne, t); - __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); - - __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); - log2 = _mm_add_ps(g_XMLogEst6, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst5, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst4, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst3, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst2, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst1, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst0, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); - - __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); - __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); - __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); - - __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isZero = _mm_cmpeq_epi32(isZero, g_XMZero); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isInfinite, g_XMInfinity); - select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); - __m128i result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isZero, g_XMNegInfinity); - select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); - tmp = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isPositive, result); - select1 = _mm_andnot_si128(isPositive, tmp); - result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - select1 = _mm_andnot_si128(isNaN, result); - result = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(result); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLogE -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = logf(V.vector4_f32[0]); - Result.vector4_f32[1] = logf(V.vector4_f32[1]); - Result.vector4_f32[2] = logf(V.vector4_f32[2]); - Result.vector4_f32[3] = logf(V.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); - int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); - int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - int32x4_t biased = vshrq_n_u32(rawBiased, 23); - int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); - int32x4_t trailingNor = trailing; - - // Compute exponent and significand for subnormals. 
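-    // A subnormal has a zero exponent field, so its magnitude lives entirely
-    // in the trailing bits: the position of the leading set bit fixes the
-    // effective exponent (g_XMSubnormalExponent - shift), and the left shift
-    // renormalizes the significand back into the trailing-bit field.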
- int32x4_t leading = Internal::GetLeadingBit(trailing); - int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); - int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); - int32x4_t trailingSub = vshlq_u32(trailing, shift); - trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); - int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); - int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); - - // Compute the approximation. - int32x4_t tmp = vorrq_s32(g_XMOne, t); - float32x4_t y = vsubq_f32(tmp, g_XMOne); - - float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); - log2 = vmlaq_f32( g_XMLogEst5, log2, y ); - log2 = vmlaq_f32( g_XMLogEst4, log2, y ); - log2 = vmlaq_f32( g_XMLogEst3, log2, y ); - log2 = vmlaq_f32( g_XMLogEst2, log2, y ); - log2 = vmlaq_f32( g_XMLogEst1, log2, y ); - log2 = vmlaq_f32( g_XMLogEst0, log2, y ); - log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); - - log2 = vmulq_f32(g_XMInvLgE, log2); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); - isInfinite = vceqq_s32(isInfinite, g_XMInfinity); - - int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); - int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); - int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); - - int32x4_t isZero = vandq_s32((V), g_XMAbsMask); - isZero = vceqq_s32(isZero, g_XMZero); - - int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); - int32x4_t t1 = vandq_s32((V), g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); - tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); - result = vbslq_f32(isPositive, result, tmp); - result = vbslq_f32(isNaN, g_XMQNaN, result ); - return result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - __m128i biased = _mm_srli_epi32(rawBiased, 23); - __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); - __m128i trailingNor = trailing; - - // Compute exponent and significand for subnormals. - __m128i leading = Internal::GetLeadingBit(trailing); - __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); - __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); - __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); - trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); - - __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); - __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); - __m128i e = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isExponentZero, trailingSub); - select1 = _mm_andnot_si128(isExponentZero, trailingNor); - __m128i t = _mm_or_si128(select0, select1); - - // Compute the approximation. 
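-    // As in XMVectorLog2: rebuild 1.m, evaluate log2(1+y) ~= y*P(y), and add
-    // e; the extra multiply by g_XMInvLgE (= ln 2) afterwards converts the
-    // base-2 logarithm into a natural logarithm.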
- __m128i tmp = _mm_or_si128(g_XMOne, t); - __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); - - __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); - log2 = _mm_add_ps(g_XMLogEst6, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst5, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst4, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst3, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst2, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst1, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst0, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); - - log2 = _mm_mul_ps(g_XMInvLgE, log2); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); - - __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); - __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); - __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); - - __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isZero = _mm_cmpeq_epi32(isZero, g_XMZero); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isInfinite, g_XMInfinity); - select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); - __m128i result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isZero, g_XMNegInfinity); - select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); - tmp = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isPositive, result); - select1 = _mm_andnot_si128(isPositive, tmp); - result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - select1 = _mm_andnot_si128(isNaN, result); - result = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(result); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLog -( - FXMVECTOR V -) -{ - return XMVectorLog2(V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorPow -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]); - Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]); - Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]); - Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTORF32 vResult = { - powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), - powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), - powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), - powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) - }; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - __declspec(align(16)) float a[4]; - __declspec(align(16)) float b[4]; - _mm_store_ps( a, V1 ); - _mm_store_ps( b, V2 ); - XMVECTOR vResult = _mm_setr_ps( - powf(a[0],b[0]), - powf(a[1],b[1]), - powf(a[2],b[2]), - powf(a[3],b[3])); - return vResult; -#endif 
-} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAbs -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = fabsf(V.vector4_f32[0]); - vResult.vector4_f32[1] = fabsf(V.vector4_f32[1]); - vResult.vector4_f32[2] = fabsf(V.vector4_f32[2]); - vResult.vector4_f32[3] = fabsf(V.vector4_f32[3]); - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vabsq_f32( V ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_setzero_ps(); - vResult = _mm_sub_ps(vResult,V); - vResult = _mm_max_ps(vResult,V); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMod -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - // V1 % V2 = V1 - V2 * truncate(V1 / V2) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Quotient = XMVectorDivide(V1, V2); - Quotient = XMVectorTruncate(Quotient); - XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vResult = XMVectorDivide(V1, V2); - vResult = XMVectorTruncate(vResult); - return vmlsq_f32( V1, vResult, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_div_ps(V1, V2); - vResult = XMVectorTruncate(vResult); - vResult = _mm_mul_ps(vResult,V2); - vResult = _mm_sub_ps(V1,vResult); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorModAngles -( - FXMVECTOR Angles -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR V; - XMVECTOR Result; - - // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI - V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); - V = XMVectorRound(V); - Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI - XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); - // Use the inline function due to complexity for rounding - vResult = XMVectorRound(vResult); - return vmlsq_f32( Angles, vResult, g_XMTwoPi ); -#elif defined(_XM_SSE_INTRINSICS_) - // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI - XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); - // Use the inline function due to complexity for rounding - vResult = XMVectorRound(vResult); - vResult = _mm_mul_ps(vResult,g_XMTwoPi); - vResult = _mm_sub_ps(Angles,vResult); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSin -( - FXMVECTOR V -) -{ - // 11-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sinf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sinf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sinf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sinf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
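-    // Lanes with |x| <= pi/2 keep x; lanes outside are reflected to pi - x
-    // (-pi - x for negative x), which preserves the sine: sin(pi - x) = sin(x).
-    // For example, x = 3 reduces to pi - 3 ~= 0.1416 with the same sine.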
- uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR SC1 = g_XMSinCoefficients1; - const XMVECTOR SC0 = g_XMSinCoefficients0; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); - - vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, x); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x). - __m128 sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR SC1 = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR SC0 = g_XMSinCoefficients0; - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCos -( - FXMVECTOR V -) -{ - // 10-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
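-    // Same reflection as XMVectorSin, but cos(pi - x) = -cos(x), so reflected
-    // lanes also record sign = -1 to flip the polynomial result at the end.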
- uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR CC1 = g_XMCosCoefficients1; - const XMVECTOR CC0 = g_XMCosCoefficients0; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0 ); - - vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, sign); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR CC1 = g_XMCosCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR CC0 = g_XMCosCoefficients0; - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorSinCos -( - XMVECTOR* pSin, - XMVECTOR* pCos, - FXMVECTOR V -) -{ - assert(pSin != nullptr); - assert(pCos != nullptr); - - // 11/10-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Sin; - Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); - Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); - Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); - Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); - - XMVECTOR Cos; - Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); - - *pSin = Sin; - *pCos = Cos; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x 
= XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation for sine - const XMVECTOR SC1 = g_XMSinCoefficients1; - const XMVECTOR SC0 = g_XMSinCoefficients0; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); - - vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pSin = vmulq_f32(Result, x); - - // Compute polynomial approximation for cosine - const XMVECTOR CC1 = g_XMCosCoefficients1; - const XMVECTOR CC0 = g_XMCosCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); - Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); - - vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pCos = vmulq_f32(Result, sign); -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
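-    // One shared reduction serves both outputs: sine uses the reflected x
-    // directly (the reflection leaves sine unchanged), while the cosine
-    // result is multiplied by the recorded sign, since cos(pi - x) = -cos(x).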
- XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation of sine - const XMVECTOR SC1 = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR SC0 = g_XMSinCoefficients0; - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - *pSin = Result; - - // Compute polynomial approximation of cosine - const XMVECTOR CC1 = g_XMCosCoefficients1; - vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR CC0 = g_XMCosCoefficients0; - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - *pCos = Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTan -( - FXMVECTOR V -) -{ - // Cody and Waite algorithm to compute tangent. 
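-    //
-    // Reduction: k = round(V * 2/pi) (VA below), then r = (V - k*C0) - k*C1
-    // (VC below), where C0 = TanConstants.x ~= pi/2 and C1 = TanConstants.y
-    // is a tiny tail term. Performing the subtraction in two steps lets the
-    // large cancellation happen exactly before the small correction is
-    // applied. The parity of k (VB & Mask) then picks between the rational
-    // forms: tan(V) = N/D for even k, and tan(V) = -D/N (the cotangent
-    // half-period) for odd k.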
- -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; - static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; - static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; - static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; - - XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); - - XMVECTOR Zero = XMVectorZero(); - - XMVECTOR C0 = XMVectorSplatX(TanConstants.v); - XMVECTOR C1 = XMVectorSplatY(TanConstants.v); - XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); - - XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); - - VA = XMVectorRound(VA); - - XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); - - XMVECTOR VB = XMVectorAbs(VA); - - VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); - -#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - VB = vcvtq_u32_f32( VB ); -#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); -#else - for (size_t i = 0; i < 4; i++) - { - VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; - } -#endif - - XMVECTOR VC2 = XMVectorMultiply(VC, VC); - - XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); - XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); - XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); - XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); - XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v); - XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); - XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); - XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); - - XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); - VBIsEven = XMVectorEqualInt(VBIsEven, Zero); - - XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); - XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); - N = XMVectorMultiplyAdd(VC2, N, T5); - D = XMVectorMultiplyAdd(VC2, D, T2); - N = XMVectorMultiply(VC2, N); - D = XMVectorMultiplyAdd(VC2, D, T1); - N = XMVectorMultiplyAdd(VC, N, VC); - XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); - D = XMVectorMultiplyAdd(VC2, D, T0); - - N = XMVectorSelect(N, VC, VCNearZero); - D = XMVectorSelect(D, g_XMOne.v, VCNearZero); - - XMVECTOR R0 = XMVectorNegate(N); - XMVECTOR R1 = XMVectorDivide(N,D); - R0 = XMVectorDivide(D,R0); - - XMVECTOR VIsZero = XMVectorEqual(V, Zero); - - XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); - - Result = XMVectorSelect(Result, Zero, VIsZero); - - return Result; - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSinH -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, 
Scale.v ); - XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - - return vsubq_f32(E1, E2); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = _mm_mul_ps(V, Scale); - V1 = _mm_add_ps(V1,g_XMNegativeOne); - XMVECTOR V2 = _mm_mul_ps(V, Scale); - V2 = _mm_sub_ps(g_XMNegativeOne,V2); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - - return _mm_sub_ps(E1, E2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCosH -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); - Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); - Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); - Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); - XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - return vaddq_f32(E1, E2); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = _mm_mul_ps(V,Scale.v); - V1 = _mm_add_ps(V1,g_XMNegativeOne.v); - XMVECTOR V2 = _mm_mul_ps(V, Scale.v); - V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - return _mm_add_ps(E1, E2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTanH -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); - Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); - Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); - Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) - - XMVECTOR E = vmulq_f32(V, Scale.v); - E = XMVectorExp(E); - E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); - E = XMVectorReciprocal(E); - return vsubq_f32(g_XMOne.v, E); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) - - XMVECTOR E = _mm_mul_ps(V, Scale.v); - E = XMVectorExp(E); - E = _mm_mul_ps(E,g_XMOneHalf.v); - E = _mm_add_ps(E,g_XMOneHalf.v); - E = _mm_div_ps(g_XMOne.v,E); - return _mm_sub_ps(g_XMOne.v,E); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorASin -( - FXMVECTOR V -) -{ - // 7-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); - Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); - Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); - Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - 
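-    // The kernel below approximates acos on [0,1]:
-    //   acos(|V|) ~= sqrt(1 - |V|) * p(|V|)   (7-degree polynomial p)
-    // Negative lanes are reflected via acos(V) = pi - acos(-V) (the t1
-    // select), and the final pi/2 - t0 converts the acos result into asin.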
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - t0 = vsubq_f32(g_XMHalfPi, t0); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - t0 = _mm_sub_ps(g_XMHalfPi, t0); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorACos -( - FXMVECTOR V -) -{ - // 7-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - 
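-    // Same kernel as XMVectorASin: acos(|V|) ~= sqrt(1 - |V|) * p(|V|) on
-    // [0,1], with negative lanes reflected via acos(V) = pi - acos(-V); here
-    // no pi/2 adjustment is needed since acos is returned directly.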
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorATan -( - FXMVECTOR V -) -{ - // 17-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t absV = vabsq_f32(V); - float32x4_t invV = 
XMVectorReciprocal(V); - uint32x4_t comp = vcgtq_f32(V, g_XMOne); - uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); - comp = vcleq_f32(absV, g_XMOne); - sign = vbslq_f32(comp, g_XMZero, sign); - uint32x4_t x = vbslq_f32(comp, V, invV); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR TC1 = g_XMATanCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); - XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(TC1), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - const XMVECTOR TC0 = g_XMATanCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - Result = vmlaq_f32( g_XMOne, Result, x2 ); - Result = vmulq_f32( Result, x ); - - float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); - result1 = vsubq_f32(result1, Result); - - comp = vceqq_f32(sign, g_XMZero); - Result = vbslq_f32( comp, Result, result1 ); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 absV = XMVectorAbs(V); - __m128 invV = _mm_div_ps(g_XMOne, V); - __m128 comp = _mm_cmpgt_ps(V, g_XMOne); - __m128 select0 = _mm_and_ps(comp, g_XMOne); - __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - __m128 sign = _mm_or_ps(select0, select1); - comp = _mm_cmple_ps(absV, g_XMOne); - select0 = _mm_and_ps(comp, g_XMZero); - select1 = _mm_andnot_ps(comp, sign); - sign = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, V); - select1 = _mm_andnot_ps(comp, invV); - __m128 x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR TC1 = g_XMATanCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - const XMVECTOR TC0 = g_XMATanCoefficients0; - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); - result1 = _mm_sub_ps(result1, Result); - - comp = _mm_cmpeq_ps(sign, g_XMZero); - select0 = _mm_and_ps(comp, Result); - select1 
= _mm_andnot_ps(comp, result1);
-    Result = _mm_or_ps(select0, select1);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorATan2
-(
-    FXMVECTOR Y,
-    FXMVECTOR X
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR Result;
-    Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] );
-    Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] );
-    Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] );
-    Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] );
-    return Result;
-#else
-
-    // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
-
-    //     Y == 0 and X is Negative         -> Pi with the sign of Y
-    //     Y == 0 and X is Positive         -> 0 with the sign of Y
-    //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
-    //     Y != 0 and X is Negative         -> atan(Y/X) + (Pi with the sign of Y)
-    //     X == -Infinity and Finite Y      -> Pi with the sign of Y
-    //     X == +Infinity and Finite Y      -> 0 with the sign of Y
-    //     Y == Infinity and X is Finite    -> Pi / 2 with the sign of Y
-    //     Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
-    //     Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
-
-    static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
-
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR ATanResultValid = XMVectorTrueInt();
-
-    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
-    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
-    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
-    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
-
-    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
-    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
-    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
-    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
-    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
-    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
-
-    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
-    Pi = XMVectorOrInt(Pi, YSign);
-    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
-    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
-    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
-
-    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
-    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
-    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
-    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
-    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
-    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
-    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
-
-    XMVECTOR V = XMVectorDivide(Y, X);
-
-    XMVECTOR R0 = XMVectorATan(V);
-
-    R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive );
-    R2 = XMVectorAdd(R0, R1);
-
-    return XMVectorSelect(Result, R2, ATanResultValid);
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSinEst
-(
-    FXMVECTOR V
-)
-{
-    // 7-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR Result;
-    Result.vector4_f32[0] = sinf( V.vector4_f32[0] );
-    Result.vector4_f32[1] = sinf( V.vector4_f32[1] );
-    Result.vector4_f32[2] = sinf( V.vector4_f32[2] );
-    Result.vector4_f32[3] = sinf( V.vector4_f32[3] );
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with 
sin(y) = sin(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, x); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x). - __m128 sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCosEst -( - FXMVECTOR V -) -{ - // 6-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR CEC = g_XMCosCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, sign); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
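-    // Worked example: x = 2.5 lies outside [-pi/2,pi/2], so c = pi, the
-    // reflected argument is rflx = pi - 2.5 ~= 0.6416, and sign becomes -1;
-    // cos(2.5) ~= -0.8011 is then evaluated as -cos(0.6416).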
- XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR CEC = g_XMCosCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorSinCosEst -( - XMVECTOR* pSin, - XMVECTOR* pCos, - FXMVECTOR V -) -{ - assert(pSin != nullptr); - assert(pCos != nullptr); - - // 7/6-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Sin; - Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); - Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); - Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); - Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); - - XMVECTOR Cos; - Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); - - *pSin = Sin; - *pCos = Cos; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation for sine - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pSin = vmulq_f32(Result, x); - - // Compute polynomial approximation - const XMVECTOR CEC = g_XMCosCoefficients1; - vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); - Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pCos = vmulq_f32(Result, sign); -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
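-    // Only the cosine needs the sign below: sin(pi - x) = sin(x), so the
-    // reduced argument already yields the correct sine, while
-    // cos(pi - x) = -cos(x) must be corrected by the sign.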
- XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation for sine - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - *pSin = Result; - - // Compute polynomial approximation for cosine - const XMVECTOR CEC = g_XMCosCoefficients1; - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - *pCos = Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTanEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); - return Result; -#else - - XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); - - XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); - V1 = XMVectorRound(V1); - - V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); - - XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); - XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); - XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); - - XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); - XMVECTOR V2 = XMVectorMultiply(V1, V1); - XMVECTOR V1T0 = XMVectorMultiply(V1, T0); - XMVECTOR V1T1 = XMVectorMultiply(V1, T1); - - XMVECTOR D = XMVectorReciprocalEst(V2T2); - XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); - - return XMVectorMultiply(N, D); - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorASinEst -( - FXMVECTOR V -) -{ - // 3-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); - Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); - Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); - Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
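-    // e.g. an input of 1.0000001f (from accumulated rounding upstream) would
-    // otherwise make 1 - |V| slightly negative and the square root NaN.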
- float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - t0 = vsubq_f32(g_XMHalfPi, t0); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - t0 = _mm_sub_ps(g_XMHalfPi, t0); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorACosEst -( - FXMVECTOR V -) -{ - // 3-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
- __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorATanEst -( - FXMVECTOR V -) -{ - // 9-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t absV = vabsq_f32(V); - float32x4_t invV = XMVectorReciprocalEst(V); - uint32x4_t comp = vcgtq_f32(V, g_XMOne); - uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne ); - comp = vcleq_f32(absV, g_XMOne); - sign = vbslq_f32(comp, g_XMZero, sign ); - uint32x4_t x = vbslq_f32(comp, V, invV ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMATanEstCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); - XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(AEC), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - // ATanEstCoefficients0 is already splatted - Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 ); - Result = vmulq_f32( Result, x ); - - float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); - result1 = vsubq_f32(result1, Result); - - comp = vceqq_f32(sign, g_XMZero); - Result = vbslq_f32( comp, Result, result1 ); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 absV = XMVectorAbs(V); - __m128 invV = _mm_div_ps(g_XMOne, V); - __m128 comp = _mm_cmpgt_ps(V, g_XMOne); - __m128 select0 = _mm_and_ps(comp, g_XMOne); - __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - __m128 sign = _mm_or_ps(select0, select1); - comp = _mm_cmple_ps(absV, g_XMOne); - select0 = _mm_and_ps(comp, g_XMZero); - select1 = _mm_andnot_ps(comp, sign); - sign = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, V); - select1 = _mm_andnot_ps(comp, invV); - __m128 x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMATanEstCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 
1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - // ATanEstCoefficients0 is already splatted - Result = _mm_add_ps(Result, g_XMATanEstCoefficients0); - Result = _mm_mul_ps(Result, x); - __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); - result1 = _mm_sub_ps(result1, Result); - - comp = _mm_cmpeq_ps(sign, g_XMZero); - select0 = _mm_and_ps(comp, Result); - select1 = _mm_andnot_ps(comp, result1); - Result = _mm_or_ps(select0, select1); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorATan2Est -( - FXMVECTOR Y, - FXMVECTOR X -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] ); - Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] ); - Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] ); - Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] ); - return Result; -#else - - static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */}; - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR ATanResultValid = XMVectorTrueInt(); - - XMVECTOR Pi = XMVectorSplatX(ATan2Constants); - XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); - XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); - XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); - - XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); - XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); - XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); - XIsPositive = XMVectorEqualInt(XIsPositive, Zero); - XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); - XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); - - XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); - Pi = XMVectorOrInt(Pi, YSign); - PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); - PiOverFour = XMVectorOrInt(PiOverFour, YSign); - ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); - - XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); - XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); - XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); - XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); - XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); - XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); - ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); - - XMVECTOR Reciprocal = XMVectorReciprocalEst(X); - XMVECTOR V = XMVectorMultiply(Y, Reciprocal); - XMVECTOR R0 = XMVectorATanEst(V); - - R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); - R2 = XMVectorAdd(R0, R1); - - Result = XMVectorSelect(Result, R2, ATanResultValid); - - return Result; - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLerp -( - FXMVECTOR V0, - FXMVECTOR V1, - float t -) -{ - // V0 + t * (V1 - V0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Scale = XMVectorReplicate(t); - XMVECTOR Length = XMVectorSubtract(V1, V0); - return XMVectorMultiplyAdd(Length, Scale, V0); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR L = vsubq_f32( V1, V0 ); - return vmlaq_n_f32( V0, L, t ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR L = _mm_sub_ps( V1, V0 ); - XMVECTOR S = _mm_set_ps1( t ); - XMVECTOR Result = _mm_mul_ps( 
L, S ); - return _mm_add_ps( Result, V0 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLerpV -( - FXMVECTOR V0, - FXMVECTOR V1, - FXMVECTOR T -) -{ - // V0 + T * (V1 - V0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Length = XMVectorSubtract(V1, V0); - return XMVectorMultiplyAdd(Length, T, V0); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR L = vsubq_f32( V1, V0 ); - return vmlaq_f32( V0, L, T ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR Length = _mm_sub_ps( V1, V0 ); - XMVECTOR Result = _mm_mul_ps( Length, T ); - return _mm_add_ps( Result, V0 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorHermite -( - FXMVECTOR Position0, - FXMVECTOR Tangent0, - FXMVECTOR Position1, - GXMVECTOR Tangent1, - float t -) -{ - // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + - // (t^3 - 2 * t^2 + t) * Tangent0 + - // (-2 * t^3 + 3 * t^2) * Position1 + - // (t^3 - t^2) * Tangent1 - -#if defined(_XM_NO_INTRINSICS_) - - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); - XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); - XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); - XMVECTOR T1 = XMVectorReplicate(t3 - t2); - - XMVECTOR Result = XMVectorMultiply(P0, Position0); - Result = XMVectorMultiplyAdd(T0, Tangent0, Result); - Result = XMVectorMultiplyAdd(P1, Position1, Result); - Result = XMVectorMultiplyAdd(T1, Tangent1, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; - float t0 = t3 - 2.0f * t2 + t; - float p1 = -2.0f * t3 + 3.0f * t2; - float t1 = t3 - t2; - - XMVECTOR vResult = vmulq_n_f32(Position0, p0 ); - vResult = vmlaq_n_f32( vResult, Tangent0, t0 ); - vResult = vmlaq_n_f32( vResult, Position1, p1 ); - vResult = vmlaq_n_f32( vResult, Tangent1, t1 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); - XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); - XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); - XMVECTOR T1 = _mm_set_ps1(t3 - t2); - - XMVECTOR vResult = _mm_mul_ps(P0, Position0); - XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = _mm_mul_ps(P1, Position1); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = _mm_mul_ps(T1, Tangent1); - vResult = _mm_add_ps(vResult,vTemp); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorHermiteV -( - FXMVECTOR Position0, - FXMVECTOR Tangent0, - FXMVECTOR Position1, - GXMVECTOR Tangent1, - HXMVECTOR T -) -{ - // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + - // (t^3 - 2 * t^2 + t) * Tangent0 + - // (-2 * t^3 + 3 * t^2) * Position1 + - // (t^3 - t^2) * Tangent1 - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR T2 = XMVectorMultiply(T, T); - XMVECTOR T3 = XMVectorMultiply(T , T2); - - XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); - XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); - XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); - XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); - - XMVECTOR Result = 
XMVectorMultiply(P0, Position0); - Result = XMVectorMultiplyAdd(T0, Tangent0, Result); - Result = XMVectorMultiplyAdd(P1, Position1, Result); - Result = XMVectorMultiplyAdd(T1, Tangent1, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; - static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; - - XMVECTOR T2 = vmulq_f32(T,T); - XMVECTOR T3 = vmulq_f32(T,T2); - // Mul by the constants against t^2 - T2 = vmulq_f32(T2,CatMulT2); - // Mul by the constants against t^3 - T3 = vmlaq_f32(T2, T3, CatMulT3 ); - // T3 now has the pre-result. - // I need to add t.y only - T2 = vandq_u32(T,g_XMMaskY); - T3 = vaddq_f32(T3,T2); - // Add 1.0f to x - T3 = vaddq_f32(T3,g_XMIdentityR0); - // Now, I have the constants created - // Mul the x constant to Position0 - XMVECTOR vResult = vmulq_lane_f32( Position0, vget_low_f32( T3 ), 0 ); // T3[0] - // Mul the y constant to Tangent0 - vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32( T3 ), 1 ); // T3[1] - // Mul the z constant to Position1 - vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32( T3 ), 0 ); // T3[2] - // Mul the w constant to Tangent1 - vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32( T3 ), 1 ); // T3[3] - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; - static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; - - XMVECTOR T2 = _mm_mul_ps(T,T); - XMVECTOR T3 = _mm_mul_ps(T,T2); - // Mul by the constants against t^2 - T2 = _mm_mul_ps(T2,CatMulT2); - // Mul by the constants against t^3 - T3 = _mm_mul_ps(T3,CatMulT3); - // T3 now has the pre-result. - T3 = _mm_add_ps(T3,T2); - // I need to add t.y only - T2 = _mm_and_ps(T,g_XMMaskY); - T3 = _mm_add_ps(T3,T2); - // Add 1.0f to x - T3 = _mm_add_ps(T3,g_XMIdentityR0); - // Now, I have the constants created - // Mul the x constant to Position0 - XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0)); - vResult = _mm_mul_ps(vResult,Position0); - // Mul the y constant to Tangent0 - T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1)); - T2 = _mm_mul_ps(T2,Tangent0); - vResult = _mm_add_ps(vResult,T2); - // Mul the z constant to Position1 - T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2)); - T2 = _mm_mul_ps(T2,Position1); - vResult = _mm_add_ps(vResult,T2); - // Mul the w constant to Tangent1 - T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3)); - T3 = _mm_mul_ps(T3,Tangent1); - vResult = _mm_add_ps(vResult,T3); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCatmullRom -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - GXMVECTOR Position3, - float t -) -{ - // Result = ((-t^3 + 2 * t^2 - t) * Position0 + - // (3 * t^3 - 5 * t^2 + 2) * Position1 + - // (-3 * t^3 + 4 * t^2 + t) * Position2 + - // (t^3 - t^2) * Position3) * 0.5 - -#if defined(_XM_NO_INTRINSICS_) - - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); - XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); - XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); - XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); - - XMVECTOR Result = XMVectorMultiply(P0, Position0); - Result = XMVectorMultiplyAdd(P1, Position1, Result); - Result = XMVectorMultiplyAdd(P2, Position2, Result); - Result = XMVectorMultiplyAdd(P3, Position3, Result); - - return Result; - -#elif 
defined(_XM_ARM_NEON_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; - float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; - float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; - float p3 = (t3 - t2) * 0.5f; - - XMVECTOR P1 = vmulq_n_f32(Position1, p1); - XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); - XMVECTOR P3 = vmulq_n_f32(Position3, p3); - XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); - P0 = vaddq_f32(P0,P2); - return P0; -#elif defined(_XM_SSE_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); - XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); - XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); - XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); - - P0 = _mm_mul_ps(P0, Position0); - P1 = _mm_mul_ps(P1, Position1); - P2 = _mm_mul_ps(P2, Position2); - P3 = _mm_mul_ps(P3, Position3); - P0 = _mm_add_ps(P0,P1); - P2 = _mm_add_ps(P2,P3); - P0 = _mm_add_ps(P0,P2); - return P0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - GXMVECTOR Position3, - HXMVECTOR T -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fx = T.vector4_f32[0]; - float fy = T.vector4_f32[1]; - float fz = T.vector4_f32[2]; - float fw = T.vector4_f32[3]; - XMVECTOR vResult; - vResult.vector4_f32[0] = 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0] - + (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0] - + (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0] - + (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]); - vResult.vector4_f32[1] = 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1] - + (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1] - + (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1] - + (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]); - vResult.vector4_f32[2] = 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2] - + (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2] - + (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2] - + (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]); - vResult.vector4_f32[3] = 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3] - + (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3] - + (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3] - + (fw*fw*fw-fw*fw)*Position3.vector4_f32[3]); - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; - static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; - static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; - static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; - // Cache T^2 and T^3 - XMVECTOR T2 = vmulq_f32(T,T); - XMVECTOR T3 = vmulq_f32(T,T2); - // Perform the Position0 term - XMVECTOR vResult = vaddq_f32(T2,T2); - vResult = vsubq_f32(vResult,T); - vResult = vsubq_f32(vResult,T3); - vResult = vmulq_f32(vResult,Position0); - // Perform the Position1 term and add - XMVECTOR vTemp = vmulq_f32(T3,Catmul3); - vTemp = vmlsq_f32(vTemp, T2, Catmul5); - vTemp = vaddq_f32(vTemp,Catmul2); - vResult = vmlaq_f32(vResult, vTemp, Position1); - // Perform the Position2 term and add - vTemp = vmulq_f32(T2,Catmul4); - vTemp = vmlsq_f32(vTemp, T3, Catmul3); - vTemp = vaddq_f32(vTemp,T); - vResult = vmlaq_f32(vResult, vTemp, Position2); - // Position3 is the last term - T3 = vsubq_f32(T3,T2); - vResult = vmlaq_f32(vResult, T3, Position3); - // Multiply by 0.5f and exit - vResult = vmulq_f32(vResult,g_XMOneHalf); 
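-    // The single 0.5f scale above, applied once to the accumulated result
-    // rather than to each of the four basis weights, mirrors the 0.5 factor
-    // of the scalar formula with one multiply instead of four.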
- return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; - static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; - static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; - static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; - // Cache T^2 and T^3 - XMVECTOR T2 = _mm_mul_ps(T,T); - XMVECTOR T3 = _mm_mul_ps(T,T2); - // Perform the Position0 term - XMVECTOR vResult = _mm_add_ps(T2,T2); - vResult = _mm_sub_ps(vResult,T); - vResult = _mm_sub_ps(vResult,T3); - vResult = _mm_mul_ps(vResult,Position0); - // Perform the Position1 term and add - XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); - XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); - vTemp = _mm_sub_ps(vTemp,vTemp2); - vTemp = _mm_add_ps(vTemp,Catmul2); - vTemp = _mm_mul_ps(vTemp,Position1); - vResult = _mm_add_ps(vResult,vTemp); - // Perform the Position2 term and add - vTemp = _mm_mul_ps(T2,Catmul4); - vTemp2 = _mm_mul_ps(T3,Catmul3); - vTemp = _mm_sub_ps(vTemp,vTemp2); - vTemp = _mm_add_ps(vTemp,T); - vTemp = _mm_mul_ps(vTemp,Position2); - vResult = _mm_add_ps(vResult,vTemp); - // Position3 is the last term - T3 = _mm_sub_ps(T3,T2); - T3 = _mm_mul_ps(T3,Position3); - vResult = _mm_add_ps(vResult,T3); - // Multiply by 0.5f and exit - vResult = _mm_mul_ps(vResult,g_XMOneHalf); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorBaryCentric -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - float f, - float g -) -{ - // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR P10 = XMVectorSubtract(Position1, Position0); - XMVECTOR ScaleF = XMVectorReplicate(f); - - XMVECTOR P20 = XMVectorSubtract(Position2, Position0); - XMVECTOR ScaleG = XMVectorReplicate(g); - - XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); - Result = XMVectorMultiplyAdd(P20, ScaleG, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR R1 = vsubq_f32(Position1,Position0); - XMVECTOR R2 = vsubq_f32(Position2,Position0); - R1 = vmlaq_n_f32( Position0, R1, f); - return vmlaq_n_f32( R1, R2, g ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR R1 = _mm_sub_ps(Position1,Position0); - XMVECTOR SF = _mm_set_ps1(f); - XMVECTOR R2 = _mm_sub_ps(Position2,Position0); - XMVECTOR SG = _mm_set_ps1(g); - R1 = _mm_mul_ps(R1,SF); - R2 = _mm_mul_ps(R2,SG); - R1 = _mm_add_ps(R1,Position0); - R1 = _mm_add_ps(R1,R2); - return R1; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - GXMVECTOR F, - HXMVECTOR G -) -{ - // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR P10 = XMVectorSubtract(Position1, Position0); - XMVECTOR P20 = XMVectorSubtract(Position2, Position0); - - XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); - Result = XMVectorMultiplyAdd(P20, G, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR R1 = vsubq_f32(Position1,Position0); - XMVECTOR R2 = vsubq_f32(Position2,Position0); - R1 = vmlaq_f32( Position0, R1, F ); - return vmlaq_f32( R1, R2, G); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR R1 = _mm_sub_ps(Position1,Position0); - XMVECTOR R2 = _mm_sub_ps(Position2,Position0); - R1 = _mm_mul_ps(R1,F); - R2 
= _mm_mul_ps(R2,G); - R1 = _mm_add_ps(R1,Position0); - R1 = _mm_add_ps(R1,R2); - return R1; -#endif -} - -/**************************************************************************** - * - * 2D Vector - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2Equal -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); -// z and w are don't care - return (((_mm_movemask_ps(vTemp)&3)==3) != 0); -#endif -} - - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector2EqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && - (V1.vector4_f32[1] == V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && - (V1.vector4_f32[1] != V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); -// z and w are don't care - int iTest = _mm_movemask_ps(vTemp)&3; - uint32_t CR = 0; - if (iTest==3) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2EqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector2EqualIntR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && - (V1.vector4_u32[1] == V2.vector4_u32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && - (V1.vector4_u32[1] != V2.vector4_u32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t 
CR = 0;
-    if ( r == 0xFFFFFFFFFFFFFFFFU )
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if ( !r )
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
-    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3;
-    uint32_t CR = 0;
-    if (iTest==3)
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if (!iTest)
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NearEqual
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR Epsilon
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
-    float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
-    return ((dx <= Epsilon.vector4_f32[0]) &&
-            (dy <= Epsilon.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2));
-    uint32x2_t vTemp = vacle_f32( vDelta, vget_low_f32(Epsilon) );
-    uint64_t r = vget_lane_u64( vTemp, 0 );
-    return ( r == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Get the difference
-    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
-    // Get the absolute value of the difference
-    XMVECTOR vTemp = _mm_setzero_ps();
-    vTemp = _mm_sub_ps(vTemp,vDelta);
-    vTemp = _mm_max_ps(vTemp,vDelta);
-    vTemp = _mm_cmple_ps(vTemp,Epsilon);
-    // z and w are don't care
-    return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NotEqual
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
-// z and w are don't care
-    return (((_mm_movemask_ps(vTemp)&3)!=3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NotEqualInt
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
-    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2Greater
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
-// z and w are don't care
-    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
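-// The R variants below return a CR6-style condition mask rather than a bool,
-// so one comparison can feed several tests. Illustrative usage sketch
-// (v1 and v2 are arbitrary XMVECTOR values):
-//
-//     uint32_t cr = XMVector2GreaterR(v1, v2);
-//     if (XMComparisonAllTrue(cr))  { /* v1.x > v2.x and v1.y > v2.y */ }
-//     if (XMComparisonAllFalse(cr)) { /* neither component is greater */ }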
- -inline uint32_t XM_CALLCONV XMVector2GreaterR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && - (V1.vector4_f32[1] > V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && - (V1.vector4_f32[1] <= V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp)&3; - uint32_t CR = 0; - if (iTest==3) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2GreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&3)==3) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && - (V1.vector4_f32[1] >= V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && - (V1.vector4_f32[1] < V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp)&3; - uint32_t CR = 0; - if (iTest == 3) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2Less -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&3)==3) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2LessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 
(V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
-    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2InBounds
-(
-    FXMVECTOR V,
-    FXMVECTOR Bounds
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
-        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    float32x2_t B = vget_low_f32( Bounds );
-    // Test if less than or equal
-    uint32x2_t ivTemp1 = vcle_f32(VL,B);
-    // Negate the bounds
-    float32x2_t vTemp2 = vneg_f32(B);
-    // Test if greater or equal (Reversed)
-    uint32x2_t ivTemp2 = vcle_f32(vTemp2,VL);
-    // Blend answers
-    ivTemp1 = vand_u32(ivTemp1,ivTemp2);
-    // x and y in bounds?
-    return ( vget_lane_u64( ivTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2,V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
-    // x and y in bounds? (z and w are don't care)
-    return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2IsNaN
-(
-    FXMVECTOR V
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (XMISNAN(V.vector4_f32[0]) ||
-            XMISNAN(V.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    // Test against itself. NaN is always not equal
-    uint32x2_t vTempNan = vceq_f32( VL, VL );
-    // If x or y are NaN, the mask is zero
-    return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test against itself. NaN is always not equal
-    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
-    // If x or y are NaN, the mask is non-zero
-    return ((_mm_movemask_ps(vTempNan)&3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2IsInfinite
-(
-    FXMVECTOR V
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    return (XMISINF(V.vector4_f32[0]) ||
-            XMISINF(V.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Mask off the sign bit
-    uint32x2_t vTemp = vand_u32( vget_low_f32( V ), vget_low_f32( g_XMAbsMask ) );
-    // Compare to infinity
-    vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) );
-    // If x or y are infinity, the mask is non-zero
-    return vget_lane_u64( vTemp, 0 ) != 0;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bit
-    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
-    // Compare to infinity
-    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
-    // If x or y are infinity, the mask is non-zero
- return ((_mm_movemask_ps(vTemp)&3) != 0); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Dot -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = - Result.vector4_f32[1] = - Result.vector4_f32[2] = - Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Perform the dot product on x and y - float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); - vTemp = vpadd_f32( vTemp, vTemp ); - return vcombine_f32( vTemp, vTemp ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_dp_ps( V1, V2, 0x3f ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V1, V2); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_moveldup_ps(vDot); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Cross -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] - -#if defined(_XM_NO_INTRINSICS_) - float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = fCross; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 }; - - float32x2_t vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); - vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); - vTemp = vpadd_f32( vTemp, vTemp ); - return vcombine_f32( vTemp, vTemp ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap x and y - XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); - // Perform the muls - vResult = _mm_mul_ps(vResult,V1); - // Splat y - XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); - // Sub the values - vResult = _mm_sub_ss(vResult,vTemp); - // Splat the cross product - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2LengthSq -( - FXMVECTOR V -) -{ - return XMVector2Dot(V, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorReciprocalSqrtEst(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - // Reciprocal sqrt (estimate) - vTemp = vrsqrte_f32( vTemp ); - return vcombine_f32( vTemp, vTemp ); -#elif defined(_XM_SSE4_INTRINSICS_) - 
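// The 0x3f immediate steers _mm_dp_ps: the high nibble (0x30) includes only
// the x and y lanes in the multiply, and the low nibble (0x0f) broadcasts the
// summed dot product into all four lanes of the result.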
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_rsqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ss(vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_rsqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorReciprocalSqrt(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - // Reciprocal sqrt - float32x2_t S0 = vrsqrte_f32(vTemp); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ss(vTemp); - vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_sqrt_ss(vLengthSq); - vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2LengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorSqrtEst(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); - // Sqrt (estimate) - float32x2_t Result = vrsqrte_f32( vTemp ); - Result = vmul_f32( vTemp, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ss(vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return 
vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_sqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Length -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorSqrt(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); - // Sqrt - float32x2_t S0 = vrsqrte_f32( vTemp ); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - Result = vmul_f32( vTemp, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ss(vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ -// XMVector2NormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. 
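// Illustrative trade-off with hypothetical values (caller code, not part of
// the library): prefer the Est form only where reciprocal-sqrt-estimate
// precision (roughly 12 bits of mantissa) is acceptable.
//
//   XMVECTOR v = XMVectorSet(3.0f, 4.0f, 0.0f, 0.0f);
//   XMVECTOR approx = XMVector2NormalizeEst(v); // about (0.6, 0.8), estimate only
//   XMVECTOR exact  = XMVector2Normalize(v);    // (0.6, 0.8), with zero/infinity guards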
- -inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2ReciprocalLength(V); - Result = XMVectorMultiply(V, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - // Reciprocal sqrt (estimate) - vTemp = vrsqrte_f32( vTemp ); - // Normalize - float32x2_t Result = vmul_f32( VL, vTemp ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - vLengthSq = _mm_mul_ps(vLengthSq, V); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_rsqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - vLengthSq = _mm_mul_ps(vLengthSq,V); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Normalize -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR vResult = XMVector2Length( V ); - float fLength = vResult.vector4_f32[0]; - - // Prevent divide by zero - if (fLength > 0) { - fLength = 1.0f/fLength; - } - - vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; - vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; - vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; - vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - uint32x2_t VEqualsZero = vceq_f32( vTemp, vdup_n_f32(0) ); - uint32x2_t VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) ); - // Reciprocal sqrt (2 iterations of Newton-Raphson) - float32x2_t S0 = vrsqrte_f32( vTemp ); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - vTemp = vmul_f32( S1, R1 ); - // Normalize - float32x2_t Result = vmul_f32( VL, vTemp ); - Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result ); - Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = 
_mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE3_INTRINSICS_) - // Perform the dot product on x and y only - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_moveldup_ps(vLengthSq); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V, vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult, vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); - vResult = _mm_or_ps(vTemp1, vTemp2); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y only - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ClampLength -( - FXMVECTOR V, - float LengthMin, - float LengthMax -) -{ - XMVECTOR ClampMax = XMVectorReplicate(LengthMax); - XMVECTOR ClampMin = XMVectorReplicate(LengthMin); - return XMVector2ClampLengthV(V, ClampMin, ClampMax); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV -( - FXMVECTOR V, - FXMVECTOR LengthMin, - FXMVECTOR LengthMax -) -{ - assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); - assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); - assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); - assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); - assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); - - XMVECTOR LengthSq = XMVector2LengthSq(V); - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); - - XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); - XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); - - XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); - - XMVECTOR Normal = XMVectorMultiply(V, RcpLength); - - XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); - Length = XMVectorSelect(LengthSq, Length, Select); - Normal = XMVectorSelect(LengthSq, Normal, Select); - - XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); - XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); - - XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); - ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); - - XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); - - // Preserve the original vector (with no precision loss) if the length falls within the given range - XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); - Result = XMVectorSelect(Result, V, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Reflect -( - FXMVECTOR Incident, - FXMVECTOR Normal -) -{ - // Result = Incident - (2 * dot(Incident, Normal)) * Normal - - XMVECTOR Result; - Result = XMVector2Dot(Incident, Normal); - Result = XMVectorAdd(Result, Result); - Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Refract -( - FXMVECTOR Incident, - FXMVECTOR Normal, - float RefractionIndex -) -{ - XMVECTOR Index = XMVectorReplicate(RefractionIndex); - return XMVector2RefractV(Incident, Normal, Index); -} - -//------------------------------------------------------------------------------ - -// Return the refraction of a 2D vector -inline XMVECTOR XM_CALLCONV XMVector2RefractV -( - FXMVECTOR Incident, - FXMVECTOR Normal, - FXMVECTOR RefractionIndex -) -{ - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - -#if defined(_XM_NO_INTRINSICS_) - - float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - float RY = 1.0f-(IDotN*IDotN); - float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); - RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); - if (RX>=0.0f) { - RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); - } else { - RX = 0.0f; - } - if (RY>=0.0f) { - RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); - } else { - RY = 0.0f; - } - - XMVECTOR vResult; - vResult.vector4_f32[0] = RX; - vResult.vector4_f32[1] = RY; - vResult.vector4_f32[2] = 0.0f; - vResult.vector4_f32[3] = 0.0f; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t IL = vget_low_f32( Incident ); - float32x2_t NL = vget_low_f32( Normal ); - float32x2_t RIL = vget_low_f32( RefractionIndex ); - // Get the 2D Dot product of Incident-Normal - float32x2_t vTemp = vmul_f32(IL, NL); - float32x2_t IDotN = vpadd_f32( vTemp, vTemp ); - // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); - vTemp = vmul_f32(vTemp,RIL); - vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); - // If 
any terms are <=0, sqrt() will fail, punt to zero - uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); - // Sqrt(vTemp) - float32x2_t S0 = vrsqrte_f32(vTemp); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t S2 = vmul_f32( S1, R1 ); - vTemp = vmul_f32( vTemp, S2 ); - // R = RefractionIndex * IDotN + sqrt(R) - vTemp = vmla_f32( vTemp, RIL, IDotN ); - // Result = RefractionIndex * Incident - Normal * R - float32x2_t vResult = vmul_f32(RIL,IL); - vResult = vmls_f32( vResult, vTemp, NL ); - vResult = vand_u32(vResult,vMask); - return vcombine_f32(vResult, vResult); -#elif defined(_XM_SSE_INTRINSICS_) - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - // Get the 2D Dot product of Incident-Normal - XMVECTOR IDotN = XMVector2Dot(Incident, Normal); - // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); - vTemp = _mm_sub_ps(g_XMOne,vTemp); - vTemp = _mm_mul_ps(vTemp,RefractionIndex); - vTemp = _mm_mul_ps(vTemp,RefractionIndex); - vTemp = _mm_sub_ps(g_XMOne,vTemp); - // If any terms are <=0, sqrt() will fail, punt to zero - XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); - // R = RefractionIndex * IDotN + sqrt(R) - vTemp = _mm_sqrt_ps(vTemp); - XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); - vTemp = _mm_add_ps(vTemp,vResult); - // Result = RefractionIndex * Incident - Normal * R - vResult = _mm_mul_ps(RefractionIndex,Incident); - vTemp = _mm_mul_ps(vTemp,Normal); - vResult = _mm_sub_ps(vResult,vTemp); - vResult = _mm_and_ps(vResult,vMask); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Orthogonal -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = -V.vector4_f32[1]; - Result.vector4_f32[1] = V.vector4_f32[0]; - Result.vector4_f32[2] = 0.f; - Result.vector4_f32[3] = 0.f; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; - const float32x2_t zero = vdup_n_f32(0); - - float32x2_t VL = vget_low_f32( V ); - float32x2_t Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); - return vcombine_f32( Result, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - vResult = _mm_mul_ps(vResult,g_XMNegateX); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector2Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); - Result = XMVectorACosEst(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector2Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); - Result = XMVectorACos(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV 
XMVector2AngleBetweenVectors -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - XMVECTOR L1 = XMVector2ReciprocalLength(V1); - XMVECTOR L2 = XMVector2ReciprocalLength(V2); - - XMVECTOR Dot = XMVector2Dot(V1, V2); - - L1 = XMVectorMultiply(L1, L2); - - XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); - CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); - - return XMVectorACos(CosAngle); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance -( - FXMVECTOR LinePoint1, - FXMVECTOR LinePoint2, - FXMVECTOR Point -) -{ - // Given a vector PointVector from LinePoint1 to Point and a vector - // LineVector from LinePoint1 to LinePoint2, the scaled distance - // PointProjectionScale from LinePoint1 to the perpendicular projection - // of PointVector onto the line is defined as: - // - // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) - - XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); - XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); - - XMVECTOR LengthSq = XMVector2LengthSq(LineVector); - - XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); - PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); - - XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); - DistanceVector = XMVectorSubtract(PointVector, DistanceVector); - - return XMVector2Length(DistanceVector); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2IntersectLine -( - FXMVECTOR Line1Point1, - FXMVECTOR Line1Point2, - FXMVECTOR Line2Point1, - GXMVECTOR Line2Point2 -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); - XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); - XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); - - XMVECTOR C1 = XMVector2Cross(V1, V2); - XMVECTOR C2 = XMVector2Cross(V2, V3); - - XMVECTOR Result; - const XMVECTOR Zero = XMVectorZero(); - if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) - { - if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) - { - // Coincident - Result = g_XMInfinity.v; - } - else - { - // Parallel - Result = g_XMQNaN.v; - } - } - else - { - // Intersection point = Line1Point1 + V1 * (C2 / C1) - XMVECTOR Scale = XMVectorReciprocal(C1); - Scale = XMVectorMultiply(C2, Scale); - Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); - } - - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); - XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); - XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); - // Generate the cross products - XMVECTOR C1 = XMVector2Cross(V1, V2); - XMVECTOR C2 = XMVector2Cross(V2, V3); - // If C1 is not close to epsilon, use the calculated value - XMVECTOR vResultMask = _mm_setzero_ps(); - vResultMask = _mm_sub_ps(vResultMask,C1); - vResultMask = _mm_max_ps(vResultMask,C1); - // 0xFFFFFFFF if the calculated value is to be used - vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); - // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
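// Branch-free form of the scalar logic above: abs(C2) is built from
// max(-C2, C2), and abs(C2) <= epsilon selects g_XMInfinity (coincident
// lines) while the complement selects g_XMQNaN (parallel lines).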
-    XMVECTOR vFailMask = _mm_setzero_ps();
-    vFailMask = _mm_sub_ps(vFailMask,C2);
-    vFailMask = _mm_max_ps(vFailMask,C2);
-    vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon);
-    XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity);
-    vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN);
-    // vFail is NAN or INF
-    vFail = _mm_or_ps(vFail,vFailMask);
-    // Intersection point = Line1Point1 + V1 * (C2 / C1)
-    XMVECTOR vResult = _mm_div_ps(C2,C1);
-    vResult = _mm_mul_ps(vResult,V1);
-    vResult = _mm_add_ps(vResult,Line1Point1);
-    // Use result, or failure value
-    vResult = _mm_and_ps(vResult,vResultMask);
-    vResultMask = _mm_andnot_ps(vResultMask,vFail);
-    vResult = _mm_or_ps(vResult,vResultMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    FXMMATRIX M
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    float32x4_t Result = vmlaq_lane_f32( M.r[3], M.r[1], VL, 1 ); // Y
-    return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-    vResult = _mm_mul_ps(vResult,M.r[0]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    vTemp = _mm_mul_ps(vTemp,M.r[1]);
-    vResult = _mm_add_ps(vResult,vTemp);
-    vResult = _mm_add_ps(vResult,M.r[3]);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_
-inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
-(
-    XMFLOAT4* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT2* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT2)*4;
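// vld2q_f32 de-interleaves four packed XMFLOAT2s in a single load:
// V.val[0] holds { x0, x1, x2, x3 } and V.val[1] holds { y0, y1, y2, y3 },
// so each vmlaq_lane_f32 below advances four vectors at once.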
-
-                float32x2_t r3 = vget_low_f32( row3 );
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
-
-                __prefetch( pInputVector );
-
-                r3 = vget_high_f32( row3 );
-                r = vget_high_f32( row0 );
-                XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
-                XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
-
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-
-                r = vget_high_f32( row1 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                float32x4x4_t R;
-                R.val[0] = vResult0;
-                R.val[1] = vResult1;
-                R.val[2] = vResult2;
-                R.val[3] = vResult3;
-
-                vst4q_f32( reinterpret_cast<float*>(pOutputVector), R );
-                pOutputVector += sizeof(XMFLOAT4)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
-
-        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if ( two > 0 )
-    {
-        if ( InputStride == sizeof(XMFLOAT2) )
-        {
-            if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
-            {
-                // Packed input, aligned output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                    XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-                    X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-            else
-            {
-                // Packed input, unaligned output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                    XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
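// Lanes 2 and 3 of the 16-byte load hold the second packed XMFLOAT2:
// its y was splatted from lane 3 above and its x is splatted from lane 2 below.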
-                    X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
-    {
-        if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
-        {
-            // Aligned input, aligned output
-            for (; i < VectorCount; i++)
-            {
-                XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-                pInputVector += InputStride;
-
-                XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                vTemp = _mm_add_ps( vTemp, row3 );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                pOutputVector += OutputStride;
-            }
-        }
-        else
-        {
-            // Aligned input, unaligned output
-            for (; i < VectorCount; i++)
-            {
-                XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-                pInputVector += InputStride;
-
-                XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                vTemp = _mm_add_ps( vTemp, row3 );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-    else
-    {
-        // Unaligned input
-        for (; i < VectorCount; i++)
-        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0));
-            XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0));
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, row3 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-            _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    FXMMATRIX M
-)
-{
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    XMVECTOR W = XMVectorSplatW(Result);
-    return XMVectorDivide( Result, W );
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_
-inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
-(
-    XMFLOAT2* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT2* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i <
VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        XMVECTOR W = XMVectorSplatW(Result);
-
-        Result = XMVectorDivide(Result, W);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT2)*4;
-
-                float32x2_t r3 = vget_low_f32( row3 );
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
-
-                __prefetch( pInputVector );
-
-                r3 = vget_high_f32( row3 );
-                r = vget_high_f32( row0 );
-                XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
-
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-
-                r = vget_high_f32( row1 );
-                W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                // 2 iterations of Newton-Raphson refinement of reciprocal
-                float32x4_t Reciprocal = vrecpeq_f32(W);
-                float32x4_t S = vrecpsq_f32( Reciprocal, W );
-                Reciprocal = vmulq_f32( S, Reciprocal );
-                S = vrecpsq_f32( Reciprocal, W );
-                Reciprocal = vmulq_f32( S, Reciprocal );
-
-                V.val[0] = vmulq_f32( vResult0, Reciprocal );
-                V.val[1] = vmulq_f32( vResult1, Reciprocal );
-
-                vst2q_f32( reinterpret_cast<float*>(pOutputVector),V );
-                pOutputVector += sizeof(XMFLOAT2)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
-
-        V = vget_high_f32( vResult );
-        float32x2_t W = vdup_lane_f32( V, 1 );
-
-        // 2 iterations of Newton-Raphson refinement of reciprocal for W
-        float32x2_t Reciprocal = vrecpe_f32( W );
-        float32x2_t S = vrecps_f32( Reciprocal, W );
-        Reciprocal = vmul_f32( S, Reciprocal );
-        S = vrecps_f32( Reciprocal, W );
-        Reciprocal = vmul_f32( S, Reciprocal );
-
-        V = vget_low_f32( vResult );
-        V = vmul_f32( V, Reciprocal );
-
-        vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if ( two > 0 )
-    {
-        if
( InputStride == sizeof(XMFLOAT2) )
-        {
-            if ( OutputStride == sizeof(XMFLOAT2) )
-            {
-                if ( !((uintptr_t)pOutputStream & 0xF) )
-                {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V1 = _mm_div_ps( vTemp, W );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V2 = _mm_div_ps( vTemp, W );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-                else
-                {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V1 = _mm_div_ps( vTemp, W );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V2 = _mm_div_ps( vTemp, W );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-            }
-            else
-            {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    // Result 1
-                    XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                    XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                    vTemp = _mm_div_ps( vTemp, W );
-                    vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                    X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                    vTemp = _mm_div_ps( vTemp, W );
-                    vTemp2 = XM_PERMUTE_PS(
vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
-    {
-        // Aligned input
-        for (; i < VectorCount; i++)
-        {
-            XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-            XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, row3 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-            XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-            vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-    else
-    {
-        // Unaligned input
-        for (; i < VectorCount; i++)
-        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, row3 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-            XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-            vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    FXMMATRIX M
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    float32x4_t Result = vmulq_lane_f32( M.r[1], VL, 1 ); // Y
-    return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-    vResult = _mm_mul_ps(vResult,M.r[0]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    vTemp = _mm_mul_ps(vTemp,M.r[1]);
-    vResult = _mm_add_ps(vResult,vTemp);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_
-inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
-(
-    XMFLOAT2* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT2* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t*
pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    for (size_t i = 0; i < VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(Y, row1);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT2)*4;
-
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
-
-                __prefetch( pInputVector );
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-
-                vst2q_f32( reinterpret_cast<float*>(pOutputVector), V );
-                pOutputVector += sizeof(XMFLOAT2)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmulq_lane_f32( row0, V, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
-
-        V = vget_low_f32( vResult );
-        vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if ( two > 0 )
-    {
-        if ( InputStride == sizeof(XMFLOAT2) )
-        {
-            if ( OutputStride == sizeof(XMFLOAT2) )
-            {
-                if ( !((uintptr_t)pOutputStream & 0xF) )
-                {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-                else
-                {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
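// Each 16-byte unaligned load below carries two packed XMFLOAT2s as
// { x0, y0, x1, y1 }: lanes 0-1 feed the first transformed normal, lanes 2-3
// the second, and _mm_movelh_ps repacks the two results into one store.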
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-            }
-            else
-            {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    // Result 1
-                    XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                    XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-                    vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                    X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-                    vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
-    {
-        // Aligned input
-        for (; i < VectorCount; i++)
-        {
-            XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-            XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-    else
-    {
-        // Unaligned input
-        for (; i < VectorCount; i++)
-        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * 3D Vector
- *
- ****************************************************************************/
-
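// The ARM-NEON comparison paths below have no _mm_movemask_ps equivalent, so
// they gather per-lane results by interleaving: two rounds of vzip_u8/vzip_u16
// leave one byte from each source lane in a single 32-bit value whose low
// three bytes come from the x, y and z lanes, with w in the top byte, hence
// the (r & 0xFFFFFFU) tests. Illustrative use with hypothetical values (not
// part of the library):
//
//   XMVECTOR a = XMVectorSet(1.f, 2.f, 3.f, 0.f);
//   XMVECTOR b = XMVectorSet(1.f, 2.f, 3.f, 9.f);
//   bool eq = XMVector3Equal(a, b); // true: the 0xFFFFFF mask ignores w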
-//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3Equal -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3EqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && - (V1.vector4_f32[1] == V2.vector4_f32[1]) && - (V1.vector4_f32[2] == V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && - (V1.vector4_f32[1] != V2.vector4_f32[1]) && - (V1.vector4_f32[2] != V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp)&7; - uint32_t CR = 0; - if (iTest==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3EqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3EqualIntR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && - (V1.vector4_u32[1] == V2.vector4_u32[1]) && - (V1.vector4_u32[2] == V2.vector4_u32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && - (V1.vector4_u32[1] != V2.vector4_u32[1]) && - 
(V1.vector4_u32[2] != V2.vector4_u32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; - uint32_t CR = 0; - if (iTemp==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTemp) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3NearEqual -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Epsilon -) -{ -#if defined(_XM_NO_INTRINSICS_) - float dx, dy, dz; - - dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); - dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); - dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); - return (((dx <= Epsilon.vector4_f32[0]) && - (dy <= Epsilon.vector4_f32[1]) && - (dz <= Epsilon.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vDelta = vsubq_f32( V1, V2 ); - uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Get the difference - XMVECTOR vDelta = _mm_sub_ps(V1,V2); - // Get the absolute value of the difference - XMVECTOR vTemp = _mm_setzero_ps(); - vTemp = _mm_sub_ps(vTemp,vDelta); - vTemp = _mm_max_ps(vTemp,vDelta); - vTemp = _mm_cmple_ps(vTemp,Epsilon); - // w is don't care - return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3NotEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3NotEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = 
_mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3Greater -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3GreaterR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && - (V1.vector4_f32[1] > V2.vector4_f32[1]) && - (V1.vector4_f32[2] > V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && - (V1.vector4_f32[1] <= V2.vector4_f32[1]) && - (V1.vector4_f32[2] <= V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp)&7; - if (iTest==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3GreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && - (V1.vector4_f32[1] >= V2.vector4_f32[1]) && - (V1.vector4_f32[2] >= V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && - (V1.vector4_f32[1] < V2.vector4_f32[1]) && - (V1.vector4_f32[2] < V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif 
defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp)&7; - if (iTest==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3Less -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcltq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3LessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcleq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmple_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3InBounds -( - FXMVECTOR V, - FXMVECTOR Bounds -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && - (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && - (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - float32x4_t vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - ivTemp1 = vandq_u32(ivTemp1,ivTemp2); - // in bounds? 
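// Editorial note on the recurring NEON reduction below: the vzip_u8/vzip_u16
// pair interleaves the compare mask so that one byte from each of the x, y
// and z lanes lands in a single 32-bit word; vget_lane_u32(...) & 0xFFFFFF
// then plays the role that _mm_movemask_ps(...) & 7 plays on SSE.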
- int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - // x,y and z in bounds? (w is don't care) - return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); -#else - return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3IsNaN -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - return (XMISNAN(V.vector4_f32[0]) || - XMISNAN(V.vector4_f32[1]) || - XMISNAN(V.vector4_f32[2])); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test against itself. NaN is always not equal - uint32x4_t vTempNan = vceqq_f32( V, V ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - // If x or y or z are NaN, the mask is zero - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test against itself. NaN is always not equal - XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); - // If x or y or z are NaN, the mask is non-zero - return ((_mm_movemask_ps(vTempNan)&7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3IsInfinite -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (XMISINF(V.vector4_f32[0]) || - XMISINF(V.vector4_f32[1]) || - XMISINF(V.vector4_f32[2])); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bit - uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); - // Compare to infinity - vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); - // If any are infinity, the signs are true. - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bit - __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); - // Compare to infinity - vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); - // If x,y or z are infinity, the signs are true. 
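// Editorial note: clearing the sign bit with g_XMAbsMask first makes +INF
// and -INF both compare equal to g_XMInfinity, so one equality test covers
// either sign.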
- return ((_mm_movemask_ps(vTemp)&7) != 0); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Dot -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = fValue; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTemp = vmulq_f32( V1, V2 ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - return vcombine_f32( v1, v1 ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_dp_ps( V1, V2, 0x7f ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vTemp = _mm_mul_ps(V1,V2); - vTemp = _mm_and_ps(vTemp, g_XMMask3); - vTemp = _mm_hadd_ps(vTemp,vTemp); - return _mm_hadd_ps(vTemp,vTemp); -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(V1,V2); - // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.vector4_f32[0] = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.vector4_f32[2] - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.vector4_f32[0] = (x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Cross -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult = { - (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), - (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), - (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), - 0.0f - }; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t v1xy = vget_low_f32(V1); - float32x2_t v2xy = vget_low_f32(V2); - - float32x2_t v1yx = vrev64_f32( v1xy ); - float32x2_t v2yx = vrev64_f32( v2xy ); - - float32x2_t v1zz = vdup_lane_f32( vget_high_f32(V1), 0 ); - float32x2_t v2zz = vdup_lane_f32( vget_high_f32(V2), 0 ); - - XMVECTOR vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) ); - vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) ); - vResult = veorq_u32( vResult, g_XMFlipY ); - return vandq_u32( vResult, g_XMMask3 ); -#elif defined(_XM_SSE_INTRINSICS_) - // y1,z1,x1,w1 - XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1)); - // z2,x2,y2,w2 - XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2)); - // Perform the left operation - XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); - // z1,x1,y1,w1 - vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1)); - // y2,z2,x2,w2 - vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2)); - // Perform the right operation - vTemp1 = _mm_mul_ps(vTemp1,vTemp2); - // Subract the right from left, and return answer - vResult = _mm_sub_ps(vResult,vTemp1); - // Set w to zero 
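// Editorial note: the two shuffle pairs above compute the textbook form
// (y1,z1,x1)*(z2,x2,y2) - (z1,x1,y1)*(y2,z2,x2); the AND with g_XMMask3
// below clears the undefined w lane.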
- return _mm_and_ps(vResult,g_XMMask3); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3LengthSq -( - FXMVECTOR V -) -{ - return XMVector3Dot(V, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorReciprocalSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - return vcombine_f32(v2, v2); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_rsqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and y - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); - // x+z, y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // y,y,y,y - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // x+z+y,??,??,?? - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // Splat the length squared - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorReciprocalSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - // Reciprocal sqrt - float32x2_t S0 = vrsqrte_f32(v1); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V, V); - vDot = _mm_and_ps(vDot, g_XMMask3); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_sqrt_ps(vDot); - vDot = _mm_div_ps(g_XMOne,vDot); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(V,V); - // x=Dot.y, y=Dot.z - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.x = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.z - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.x = 
(x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vDot = _mm_sqrt_ps(vDot); - // Get the reciprocal - vDot = _mm_div_ps(g_XMOne,vDot); - return vDot; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3LengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt (estimate) - float32x2_t Result = vrsqrte_f32( v1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and y - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); - // x+z, y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // y,y,y,y - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // x+z+y,??,??,?? 
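// Editorial note: under the XM_PERMUTE_PS lane convention,
// _MM_SHUFFLE(1,2,1,2) yields (z,y,z,y), so the scalar add above produced
// x+z in lane 0 and the one below folds in y before the splat broadcasts
// the squared length.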
- vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // Splat the length squared - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Length -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and y - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); - // x+z, y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // y,y,y,y - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // x+z+y,??,??,?? - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // Splat the length squared - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ -// XMVector3NormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. 
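// Editorial note: the Est variants trade accuracy for speed; they use a
// single hardware reciprocal-sqrt estimate (roughly 12 bits of precision)
// and omit the zero/infinity guards. A hedged usage sketch, with
// illustrative variable names:
//
//     XMVECTOR fast  = XMVector3NormalizeEst(v); // v known finite and non-zero
//     XMVECTOR exact = XMVector3Normalize(v);    // maps 0 -> 0 and INF -> QNaN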
- -inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector3ReciprocalLength(V); - Result = XMVectorMultiply(V, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - // Normalize - return vmulq_f32( V, vcombine_f32(v2,v2) ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V, V); - vDot = _mm_and_ps(vDot, g_XMMask3); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_rsqrt_ps(vDot); - vDot = _mm_mul_ps(vDot,V); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(V,V); - // x=Dot.y, y=Dot.z - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.x = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.z - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.x = (x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vDot = _mm_rsqrt_ps(vDot); - // Perform the normalization - vDot = _mm_mul_ps(vDot,V); - return vDot; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Normalize -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fLength; - XMVECTOR vResult; - - vResult = XMVector3Length( V ); - fLength = vResult.vector4_f32[0]; - - // Prevent divide by zero - if (fLength > 0) { - fLength = 1.0f/fLength; - } - - vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; - vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; - vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; - vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); - uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); - // Reciprocal sqrt (2 iterations of Newton-Raphson) - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - v2 = vmul_f32( S1, R1 ); - // Normalize - XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); - vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); - return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // 
If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE3_INTRINSICS_) - // Perform the dot product on x,y and z only - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z only - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ClampLength -( - FXMVECTOR V, - float LengthMin, - float LengthMax -) -{ - XMVECTOR ClampMax = XMVectorReplicate(LengthMax); - XMVECTOR ClampMin = XMVectorReplicate(LengthMin); - - return XMVector3ClampLengthV(V, ClampMin, ClampMax); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV -( - FXMVECTOR V, - FXMVECTOR LengthMin, - FXMVECTOR LengthMax -) -{ - assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); - assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); - 
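// (The asserts above require the bounds to be replicated scalars; those
// below additionally require 0 <= LengthMin <= LengthMax.)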
assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); - assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); - assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); - - XMVECTOR LengthSq = XMVector3LengthSq(V); - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); - - XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); - XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); - - XMVECTOR Normal = XMVectorMultiply(V, RcpLength); - - XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); - - XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); - Length = XMVectorSelect(LengthSq, Length, Select); - Normal = XMVectorSelect(LengthSq, Normal, Select); - - XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); - XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); - - XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); - ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); - - XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); - - // Preserve the original vector (with no precision loss) if the length falls within the given range - XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); - Result = XMVectorSelect(Result, V, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Reflect -( - FXMVECTOR Incident, - FXMVECTOR Normal -) -{ - // Result = Incident - (2 * dot(Incident, Normal)) * Normal - - XMVECTOR Result = XMVector3Dot(Incident, Normal); - Result = XMVectorAdd(Result, Result); - Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Refract -( - FXMVECTOR Incident, - FXMVECTOR Normal, - float RefractionIndex -) -{ - XMVECTOR Index = XMVectorReplicate(RefractionIndex); - return XMVector3RefractV(Incident, Normal, Index); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3RefractV -( - FXMVECTOR Incident, - FXMVECTOR Normal, - FXMVECTOR RefractionIndex -) -{ - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR IDotN = XMVector3Dot(Incident, Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); - R = XMVectorMultiply(R, RefractionIndex); - R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); - - if (XMVector4LessOrEqual(R, Zero)) - { - // Total internal reflection - return Zero; - } - else - { - // R = RefractionIndex * IDotN + sqrt(R) - R = XMVectorSqrt(R); - R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); - - // Result = RefractionIndex * Incident - Normal * R - XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); - Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); - - return Result; - } - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR IDotN = XMVector3Dot(Incident,Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); - R = vmulq_f32(R, RefractionIndex); - R 
= vmlsq_f32(g_XMOne, R, RefractionIndex );
-
-    uint32x4_t vResult = vcleq_f32(R,g_XMZero);
-    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
-    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
-    {
-        // Total internal reflection
-        vResult = g_XMZero;
-    }
-    else
-    {
-        // Sqrt(R)
-        float32x4_t S0 = vrsqrteq_f32(R);
-        float32x4_t P0 = vmulq_f32( R, S0 );
-        float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
-        float32x4_t S1 = vmulq_f32( S0, R0 );
-        float32x4_t P1 = vmulq_f32( R, S1 );
-        float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
-        float32x4_t S2 = vmulq_f32( S1, R1 );
-        R = vmulq_f32( R, S2 );
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = vmlaq_f32( R, RefractionIndex, IDotN );
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = vmulq_f32(RefractionIndex, Incident);
-        vResult = vmlsq_f32( vResult, R, Normal );
-    }
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
-    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
-    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
-    R = _mm_sub_ps(g_XMOne,R);
-    R = _mm_mul_ps(R, RefractionIndex);
-    R = _mm_mul_ps(R, RefractionIndex);
-    R = _mm_sub_ps(g_XMOne,R);
-
-    XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
-    if (_mm_movemask_ps(vResult)==0x0f)
-    {
-        // Total internal reflection
-        vResult = g_XMZero;
-    }
-    else
-    {
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = _mm_sqrt_ps(R);
-        vResult = _mm_mul_ps(RefractionIndex,IDotN);
-        R = _mm_add_ps(R,vResult);
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = _mm_mul_ps(RefractionIndex, Incident);
-        R = _mm_mul_ps(R,Normal);
-        vResult = _mm_sub_ps(vResult,R);
-    }
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Orthogonal
-(
-    FXMVECTOR V
-)
-{
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Z = XMVectorSplatZ(V);
-    XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
-
-    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
-
-    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
-    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
-
-    XMVECTOR S = XMVectorAdd(YZYY, Z);
-    XMVECTOR D = XMVectorSubtract(YZYY, Z);
-
-    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
-
-    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
-    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
-
-    return XMVectorSelect(R1, R0, Select);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
-(
-    FXMVECTOR N1,
-    FXMVECTOR N2
-)
-{
-    XMVECTOR Result = XMVector3Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACosEst(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
-(
-    FXMVECTOR N1,
-    FXMVECTOR N2
-)
-{
-    XMVECTOR Result = XMVector3Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACos(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR L1 =
XMVector3ReciprocalLength(V1); - XMVECTOR L2 = XMVector3ReciprocalLength(V2); - - XMVECTOR Dot = XMVector3Dot(V1, V2); - - L1 = XMVectorMultiply(L1, L2); - - XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); - CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); - - return XMVectorACos(CosAngle); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance -( - FXMVECTOR LinePoint1, - FXMVECTOR LinePoint2, - FXMVECTOR Point -) -{ - // Given a vector PointVector from LinePoint1 to Point and a vector - // LineVector from LinePoint1 to LinePoint2, the scaled distance - // PointProjectionScale from LinePoint1 to the perpendicular projection - // of PointVector onto the line is defined as: - // - // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) - - XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); - XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); - - XMVECTOR LengthSq = XMVector3LengthSq(LineVector); - - XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); - PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); - - XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); - DistanceVector = XMVectorSubtract(PointVector, DistanceVector); - - return XMVector3Length(DistanceVector); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XM_CALLCONV XMVector3ComponentsFromNormal -( - XMVECTOR* pParallel, - XMVECTOR* pPerpendicular, - FXMVECTOR V, - FXMVECTOR Normal -) -{ - assert(pParallel != nullptr); - assert(pPerpendicular != nullptr); - - XMVECTOR Scale = XMVector3Dot(V, Normal); - - XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); - - *pParallel = Parallel; - *pPerpendicular = XMVectorSubtract(V, Parallel); -} - -//------------------------------------------------------------------------------ -// Transform a vector using a rotation expressed as a unit quaternion - -inline XMVECTOR XM_CALLCONV XMVector3Rotate -( - FXMVECTOR V, - FXMVECTOR RotationQuaternion -) -{ - XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); - XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); - XMVECTOR Result = XMQuaternionMultiply(Q, A); - return XMQuaternionMultiply(Result, RotationQuaternion); -} - -//------------------------------------------------------------------------------ -// Transform a vector using the inverse of a rotation expressed as a unit quaternion - -inline XMVECTOR XM_CALLCONV XMVector3InverseRotate -( - FXMVECTOR V, - FXMVECTOR RotationQuaternion -) -{ - XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); - XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); - XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); - return XMQuaternionMultiply(Result, Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Transform -( - FXMVECTOR V, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32( V ); - XMVECTOR vResult = vmlaq_lane_f32( M.r[3], M.r[0], VL, 
0 ); // X - vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y - return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); - vResult = _mm_mul_ps(vResult,M.r[0]); - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - vTemp = _mm_mul_ps(vTemp,M.r[1]); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - vTemp = _mm_mul_ps(vTemp,M.r[2]); - vResult = _mm_add_ps(vResult,vTemp); - vResult = _mm_add_ps(vResult,M.r[3]); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream -( - XMFLOAT4* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - FXMMATRIX M -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT4)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); - -#if defined(_XM_NO_INTRINSICS_) - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); - Result = XMVectorMultiplyAdd(Y, row1, Result); - Result = XMVectorMultiplyAdd(X, row0, Result); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r3 = vget_low_f32( row3 ); - float32x2_t r = vget_low_f32( row0 ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( row3 ); - r = vget_high_f32( row0 ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O - XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( row1 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( row1 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O - vResult3 = 
vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( row2 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( row2 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O - vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - float32x4x4_t R; - R.val[0] = vResult0; - R.val[1] = vResult1; - R.val[2] = vResult2; - R.val[3] = vResult3; - - vst4q_f32( reinterpret_cast(pOutputVector), R ); - pOutputVector += sizeof(XMFLOAT4)*4; - - i += 4; - } - } - } - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, row1, VL, 1); // Y - vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z - - vst1q_f32( reinterpret_cast(pOutputVector), vResult ); - pOutputVector += OutputStride; - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) - { - // Packed input, aligned output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, 
vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - i += 4; - } - } - else - { - // Packed input, unaligned output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) - { - // Aligned output - for (; i < VectorCount; ++i) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, 
row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - } - } - else - { - // Unaligned output - for (; i < VectorCount; ++i) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - } - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3TransformCoord -( - FXMVECTOR V, - FXMMATRIX M -) -{ - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - - XMVECTOR W = XMVectorSplatW(Result); - return XMVectorDivide( Result, W ); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - FXMMATRIX M -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); - Result = XMVectorMultiplyAdd(Y, row1, Result); - Result = XMVectorMultiplyAdd(X, row0, Result); - - XMVECTOR W = XMVectorSplatW(Result); - - Result = XMVectorDivide(Result, W); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if 
((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r3 = vget_low_f32( row3 ); - float32x2_t r = vget_low_f32( row0 ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( row3 ); - r = vget_high_f32( row0 ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O - XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( row1 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( row1 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O - W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( row2 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( row2 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O - W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(W); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - V.val[0] = vmulq_f32( vResult0, Reciprocal ); - V.val[1] = vmulq_f32( vResult1, Reciprocal ); - V.val[2] = vmulq_f32( vResult2, Reciprocal ); - - vst3q_f32( reinterpret_cast(pOutputVector),V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z - - VH = vget_high_f32(vResult); - XMVECTOR W = vdupq_lane_f32( VH, 1 ); - - // 2 iterations of Newton-Raphson refinement of reciprocal for W - float32x4_t Reciprocal = vrecpeq_f32( W ); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult = vmulq_f32( vResult, Reciprocal ); - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - 
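// Editorial note: as in XMVector3TransformStream, 'four' below counts the
// complete 4-vector blocks handled by the packed fast paths; the scalar
// tail loop then finishes the remainder, performing the w-divide per vector.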
size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - 
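                    // Together with the multiply on the next line, this
                    // accumulates the row-vector product
                    //     vTemp = x*row0 + y*row1 + z*row2 + row3
                    // for the first unpacked vector; the divide by the
                    // splatted .w below is its perspective divide. The same
                    // pattern repeats for Results 2-4 (illustrative note).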
XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = 
_mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - for (; i < VectorCount; i++) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3TransformNormal -( - FXMVECTOR V, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32( V ); - XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y - return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); - vResult = _mm_mul_ps(vResult,M.r[0]); - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - vTemp = _mm_mul_ps(vTemp,M.r[1]); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - vTemp = _mm_mul_ps(vTemp,M.r[2]); - vResult = 
_mm_add_ps(vResult,vTemp); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - FXMMATRIX M -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiply(Z, row2); - Result = XMVectorMultiplyAdd(Y, row1, Result); - Result = XMVectorMultiplyAdd(X, row0, Result); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r = vget_low_f32( row0 ); - XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax - XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx - - __prefetch( pInputVector ); - - r = vget_high_f32( row0 ); - XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( row1 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( row1 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( row2 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( row2 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - V.val[0] = vResult0; - V.val[1] = vResult1; - V.val[2] = vResult2; - - vst3q_f32( reinterpret_cast(pOutputVector), V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - 
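        // Normals are transformed by the upper 3x3 of M only: the three steps
        // below intentionally omit row3 (translation) and the w divide that
        // the *Coord variants perform. Scalar sketch of one element
        // (illustrative): out = x*row0 + y*row1 + z*row2;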
XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V1 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V2 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V3 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V4 = _mm_add_ps( vTemp, vTemp3 ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( 
V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V1 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V2 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V3 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V4 = _mm_add_ps( vTemp, vTemp3 ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += 
OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - for (; i < VectorCount; i++) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Project -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - - XMVECTOR Result = XMVector3TransformCoord(V, Transform); - - Result = XMVectorMultiplyAdd(Result, Scale, Offset); - - return Result; -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - - const uint8_t* 
pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - - XMVECTOR Result = XMVector3TransformCoord(V, Transform); - Result = XMVectorMultiplyAdd(Result, Scale, Offset); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); - XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); - XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); - - XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); - XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); - XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ); - - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r3 = vget_low_f32( Transform.r[3] ); - float32x2_t r = vget_low_f32( Transform.r[0] ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( Transform.r[3] ); - r = vget_high_f32( Transform.r[0] ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O - XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( Transform.r[1] ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( Transform.r[1] ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O - W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( Transform.r[2] ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( Transform.r[2] ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O - W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(W); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult0 = vmulq_f32( vResult0, Reciprocal ); - vResult1 = vmulq_f32( vResult1, Reciprocal ); - vResult2 = vmulq_f32( vResult2, Reciprocal ); - - V.val[0] = vmlaq_f32( OffsetX, 
vResult0, ScaleX ); - V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY ); - V.val[2] = vmlaq_f32( OffsetZ, vResult2, ScaleZ ); - - vst3q_f32( reinterpret_cast(pOutputVector),V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - if ( i < VectorCount) - { - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z - - VH = vget_high_f32(vResult); - XMVECTOR W = vdupq_lane_f32( VH, 1 ); - - // 2 iterations of Newton-Raphson refinement of reciprocal for W - float32x4_t Reciprocal = vrecpeq_f32( W ); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult = vmulq_f32( vResult, Reciprocal ); - - vResult = vmlaq_f32( Offset, vResult, Scale ); - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V1 = _mm_add_ps( vTemp, Offset ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 
2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V2 = _mm_add_ps( vTemp, Offset ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V3 = _mm_add_ps( vTemp, Offset ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V4 = _mm_add_ps( vTemp, Offset ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V1 = _mm_add_ps( vTemp, Offset ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, 
_MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V2 = _mm_add_ps( vTemp, Offset ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V3 = _mm_add_ps( vTemp, Offset ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V4 = _mm_add_ps( vTemp, Offset ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - 
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - for (; i < VectorCount; i++) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Unproject -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - XMVECTOR 
Result = XMVectorMultiplyAdd(V, Scale, Offset); - - return XMVector3TransformCoord(Result, Transform); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - - XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); - - Result = XMVector3TransformCoord(Result, Transform); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - float sx = 1.f / (ViewportWidth * 0.5f); - float sy = 1.f / (-ViewportHeight * 0.5f); - float sz = 1.f / (ViewportMaxZ - ViewportMinZ); - - float ox = (-ViewportX * sx) - 1.f; - float oy = (-ViewportY * sy) + 1.f; - float oz = (-ViewportMinZ * sz); - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - XMVECTOR ScaleX = vdupq_n_f32(sx); - XMVECTOR OffsetX = vdupq_n_f32(ox); - XMVECTOR VX = vmlaq_f32( OffsetX, ScaleX, V.val[0] ); - - float32x2_t r3 = vget_low_f32( Transform.r[3] ); - float32x2_t r = vget_low_f32( Transform.r[0] ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( Transform.r[3] ); - r = vget_high_f32( Transform.r[0] ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O - XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P - - __prefetch( 
pInputVector+XM_CACHE_LINE_SIZE ); - - XMVECTOR ScaleY = vdupq_n_f32(sy); - XMVECTOR OffsetY = vdupq_n_f32(oy); - XMVECTOR VY = vmlaq_f32( OffsetY, ScaleY, V.val[1] ); - - r = vget_low_f32( Transform.r[1] ); - vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( Transform.r[1] ); - vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O - W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - XMVECTOR ScaleZ = vdupq_n_f32(sz); - XMVECTOR OffsetZ = vdupq_n_f32(oz); - XMVECTOR VZ = vmlaq_f32( OffsetZ, ScaleZ, V.val[2] ); - - r = vget_low_f32( Transform.r[2] ); - vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( Transform.r[2] ); - vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O - W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(W); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - V.val[0] = vmulq_f32( vResult0, Reciprocal ); - V.val[1] = vmulq_f32( vResult1, Reciprocal ); - V.val[2] = vmulq_f32( vResult2, Reciprocal ); - - vst3q_f32( reinterpret_cast(pOutputVector),V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - if (i < VectorCount) - { - float32x2_t ScaleL = vcreate_f32(((uint64_t)*(const uint32_t *)&sx) | ((uint64_t)(*(const uint32_t *)&sy) << 32)); - float32x2_t ScaleH = vcreate_f32((uint64_t)*(const uint32_t *)&sz); - - float32x2_t OffsetL = vcreate_f32(((uint64_t)*(const uint32_t *)&ox) | ((uint64_t)(*(const uint32_t *)&oy) << 32)); - float32x2_t OffsetH = vcreate_f32((uint64_t)*(const uint32_t *)&oz); - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - VL = vmla_f32( OffsetL, VL, ScaleL ); - VH = vmla_f32( OffsetH, VH, ScaleH ); - - XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z - - VH = vget_high_f32(vResult); - XMVECTOR W = vdupq_lane_f32( VH, 1 ); - - // 2 iterations of Newton-Raphson refinement of reciprocal for W - float32x4_t Reciprocal = vrecpeq_f32( W ); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult = vmulq_f32( vResult, Reciprocal ); - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = 
XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = _mm_mul_ps(Scale, Offset); - Offset = _mm_add_ps(Offset, D); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - V1 = _mm_mul_ps( V1, Scale ); - V1 = _mm_add_ps( V1, Offset ); - - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - V2 = _mm_mul_ps( V2, Scale ); - V2 = _mm_add_ps( V2, Offset ); - - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - V3 = _mm_mul_ps( V3, Scale ); - V3 = _mm_add_ps( V3, Offset ); - - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - V4 = _mm_mul_ps( V4, Scale ); - V4 = _mm_add_ps( V4, Offset ); - - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( 
reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - V1 = _mm_mul_ps( V1, Scale ); - V1 = _mm_add_ps( V1, Offset ); - - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - V2 = _mm_mul_ps( V2, Scale ); - V2 = _mm_add_ps( V2, Offset ); - - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - V3 = _mm_mul_ps( V3, Scale ); - V3 = _mm_add_ps( V3, Offset ); - - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - V4 = _mm_mul_ps( V4, Scale ); - V4 = _mm_add_ps( V4, Offset ); - - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - 
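                    // The three unaligned loads above fetch 12 packed floats
                    // (four XMFLOAT3s); XM3UNPACK3INTO4 below shuffles them
                    // into four XMVECTORs V1..V4 whose .w lanes are junk,
                    // which is harmless since only the x/y/z lanes feed the
                    // viewport un-mapping and the inverse transform.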
-                pInputVector += sizeof(XMFLOAT3)*4;
-
-                // Unpack the 4 vectors (.w components are junk)
-                XM3UNPACK3INTO4(V1,L2,L3);
-
-                // Result 1
-                V1 = _mm_mul_ps( V1, Scale );
-                V1 = _mm_add_ps( V1, Offset );
-
-                XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
-                XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
-                XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                // Result 2
-                V2 = _mm_mul_ps( V2, Scale );
-                V2 = _mm_add_ps( V2, Offset );
-
-                Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
-                Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
-                X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                // Result 3
-                V3 = _mm_mul_ps( V3, Scale );
-                V3 = _mm_add_ps( V3, Offset );
-
-                Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
-                Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
-                X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                // Result 4
-                V4 = _mm_mul_ps( V4, Scale );
-                V4 = _mm_add_ps( V4, Offset );
-
-                Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
-                Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
-                X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                i += 4;
-            }
-        }
-    }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" )
-        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        pInputVector += InputStride;
-
-        V = _mm_mul_ps( V, Scale );
-        V = _mm_add_ps( V, Offset );
-
-        XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-        XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
-        XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-        XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-        vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-        vTemp = _mm_add_ps( vTemp, vTemp2 );
-        vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-        XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-        vTemp = _mm_div_ps( vTemp, W );
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-        pOutputVector += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * 4D Vector
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4Equal
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32( V1, V2 );
-    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
-    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
-    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector4EqualR
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-
-    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
-        (V1.vector4_f32[3] == V2.vector4_f32[3]))
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] != V2.vector4_f32[3]))
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32( V1, V2 );
-    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
-    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
-
-    uint32_t CR = 0;
-    if ( r == 0xFFFFFFFFU )
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if ( !r )
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
-    int iTest = _mm_movemask_ps(vTemp);
-    uint32_t CR = 0;
-    if (iTest==0xf)     // All equal?
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if (iTest==0)  // All not equal?
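-    // CR6-style record: XM_CRMASK_CR6TRUE when all four lanes compare equal,
-    // XM_CRMASK_CR6FALSE when none do, and 0 for a partial match. Callers decode
-    // it with XMComparisonAllTrue / XMComparisonAllFalse rather than testing bits.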
- { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4EqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0); -#else - return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector4EqualIntR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if (V1.vector4_u32[0] == V2.vector4_u32[0] && - V1.vector4_u32[1] == V2.vector4_u32[1] && - V1.vector4_u32[2] == V2.vector4_u32[2] && - V1.vector4_u32[3] == V2.vector4_u32[3]) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (V1.vector4_u32[0] != V2.vector4_u32[0] && - V1.vector4_u32[1] != V2.vector4_u32[1] && - V1.vector4_u32[2] != V2.vector4_u32[2] && - V1.vector4_u32[3] != V2.vector4_u32[3]) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); - uint32_t CR = 0; - if (iTest==0xf) // All equal? - { - CR = XM_CRMASK_CR6TRUE; - } - else if (iTest==0) // All not equal? 
- { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -inline bool XM_CALLCONV XMVector4NearEqual -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Epsilon -) -{ -#if defined(_XM_NO_INTRINSICS_) - float dx, dy, dz, dw; - - dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); - dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); - dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); - dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]); - return (((dx <= Epsilon.vector4_f32[0]) && - (dy <= Epsilon.vector4_f32[1]) && - (dz <= Epsilon.vector4_f32[2]) && - (dw <= Epsilon.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vDelta = vsubq_f32( V1, V2 ); - uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Get the difference - XMVECTOR vDelta = _mm_sub_ps(V1,V2); - // Get the absolute value of the difference - XMVECTOR vTemp = _mm_setzero_ps(); - vTemp = _mm_sub_ps(vTemp,vDelta); - vTemp = _mm_max_ps(vTemp,vDelta); - vTemp = _mm_cmple_ps(vTemp,Epsilon); - return ((_mm_movemask_ps(vTemp)==0xf) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4NotEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)) != 0); -#else - return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4NotEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0); -#else - return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4Greater -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = 
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector4GreaterR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if (V1.vector4_f32[0] > V2.vector4_f32[0] && - V1.vector4_f32[1] > V2.vector4_f32[1] && - V1.vector4_f32[2] > V2.vector4_f32[2] && - V1.vector4_f32[3] > V2.vector4_f32[3]) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && - V1.vector4_f32[1] <= V2.vector4_f32[1] && - V1.vector4_f32[2] <= V2.vector4_f32[2] && - V1.vector4_f32[3] <= V2.vector4_f32[3]) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - uint32_t CR = 0; - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4GreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && - (V1.vector4_f32[1] >= V2.vector4_f32[1]) && - (V1.vector4_f32[2] >= V2.vector4_f32[2]) && - (V1.vector4_f32[3] >= V2.vector4_f32[3])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && - (V1.vector4_f32[1] < V2.vector4_f32[1]) && - (V1.vector4_f32[2] < V2.vector4_f32[2]) && - (V1.vector4_f32[3] < V2.vector4_f32[3])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) 
- { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - uint32_t CR = 0; - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0x0f) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4Less -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcltq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4LessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcleq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmple_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4InBounds -( - FXMVECTOR V, - FXMVECTOR Bounds -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && - (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && - (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && - (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - float32x4_t vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - ivTemp1 = vandq_u32(ivTemp1,ivTemp2); - // in bounds? - int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - // All in bounds? 
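-    // _mm_movemask_ps packs the four lane sign bits into an int, so 0x0f here
-    // means every component passed both the V <= Bounds and -Bounds <= V tests.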
- return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); -#else - return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4IsNaN -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (XMISNAN(V.vector4_f32[0]) || - XMISNAN(V.vector4_f32[1]) || - XMISNAN(V.vector4_f32[2]) || - XMISNAN(V.vector4_f32[3])); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test against itself. NaN is always not equal - uint32x4_t vTempNan = vceqq_f32( V, V ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - // If any are NaN, the mask is zero - return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test against itself. NaN is always not equal - XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); - // If any are NaN, the mask is non-zero - return (_mm_movemask_ps(vTempNan)!=0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4IsInfinite -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - return (XMISINF(V.vector4_f32[0]) || - XMISINF(V.vector4_f32[1]) || - XMISINF(V.vector4_f32[2]) || - XMISINF(V.vector4_f32[3])); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bit - uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); - // Compare to infinity - vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); - // If any are infinity, the signs are true. - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bit - XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); - // Compare to infinity - vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); - // If any are infinity, the signs are true. 
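-    // Clearing the sign bit first folds -INF onto +INF, so a single equality
-    // test against g_XMInfinity catches both; any set bit in the movemask
-    // therefore flags an infinite component.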
- return (_mm_movemask_ps(vTemp) != 0); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Dot -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = - Result.vector4_f32[1] = - Result.vector4_f32[2] = - Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTemp = vmulq_f32( V1, V2 ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - return vcombine_f32( v1, v1 ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_dp_ps( V1, V2, 0xff ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vTemp = _mm_mul_ps(V1, V2); - vTemp = _mm_hadd_ps(vTemp, vTemp); - return _mm_hadd_ps(vTemp, vTemp); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp2 = V2; - XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); - vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position - vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; - vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position - vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Cross -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR V3 -) -{ - // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), - // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), - // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), - // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - - Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); - Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); - Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); - Result.vector4_f32[3] = 
(((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const float32x2_t select = vget_low_f32( g_XMMaskX ); - - // Term1: V2zwyz * V3wzwy - const float32x2_t v2xy = vget_low_f32(V2); - const float32x2_t v2zw = vget_high_f32(V2); - const float32x2_t v2yx = vrev64_f32(v2xy); - const float32x2_t v2wz = vrev64_f32(v2zw); - const float32x2_t v2yz = vbsl_f32( select, v2yx, v2wz ); - - const float32x2_t v3zw = vget_high_f32(V3); - const float32x2_t v3wz = vrev64_f32(v3zw); - const float32x2_t v3xy = vget_low_f32(V3); - const float32x2_t v3wy = vbsl_f32( select, v3wz, v3xy ); - - float32x4_t vTemp1 = vcombine_f32(v2zw,v2yz); - float32x4_t vTemp2 = vcombine_f32(v3wz,v3wy); - XMVECTOR vResult = vmulq_f32( vTemp1, vTemp2 ); - - // - V2wzwy * V3zwyz - const float32x2_t v2wy = vbsl_f32( select, v2wz, v2xy ); - - const float32x2_t v3yx = vrev64_f32(v3xy); - const float32x2_t v3yz = vbsl_f32( select, v3yx, v3wz ); - - vTemp1 = vcombine_f32(v2wz,v2wy); - vTemp2 = vcombine_f32(v3zw,v3yz); - vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); - - // term1 * V1yxxx - const float32x2_t v1xy = vget_low_f32(V1); - const float32x2_t v1yx = vrev64_f32(v1xy); - - vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); - vResult = vmulq_f32( vResult, vTemp1 ); - - // Term2: V2ywxz * V3wxwx - const float32x2_t v2yw = vrev64_f32(v2wy); - const float32x2_t v2xz = vbsl_f32( select, v2xy, v2wz ); - - const float32x2_t v3wx = vbsl_f32( select, v3wz, v3yx ); - - vTemp1 = vcombine_f32(v2yw,v2xz); - vTemp2 = vcombine_f32(v3wx,v3wx); - float32x4_t vTerm = vmulq_f32( vTemp1, vTemp2 ); - - // - V2wxwx * V3ywxz - const float32x2_t v2wx = vbsl_f32( select, v2wz, v2yx ); - - const float32x2_t v3yw = vrev64_f32(v3wy); - const float32x2_t v3xz = vbsl_f32( select, v3xy, v3wz ); - - vTemp1 = vcombine_f32(v2wx,v2wx); - vTemp2 = vcombine_f32(v3yw,v3xz); - vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); - - // vResult - term2 * V1zzyy - const float32x2_t v1zw = vget_high_f32(V1); - - vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); - vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); - - // Term3: V2yzxy * V3zxyx - const float32x2_t v3zx = vrev64_f32(v3xz); - - vTemp1 = vcombine_f32(v2yz,v2xy); - vTemp2 = vcombine_f32(v3zx,v3yx); - vTerm = vmulq_f32( vTemp1, vTemp2 ); - - // - V2zxyx * V3yzxy - const float32x2_t v2zx = vrev64_f32(v2xz); - - vTemp1 = vcombine_f32(v2zx,v2yx); - vTemp2 = vcombine_f32(v3yz,v3xy); - vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); - - // vResult + term3 * V1wwwz - const float32x2_t v1wz = vrev64_f32(v1zw); - - vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); - return vmlaq_f32( vResult, vTerm, vTemp1 ); -#elif defined(_XM_SSE_INTRINSICS_) - // V2zwyz * V3wzwy - XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); - XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); - vResult = _mm_mul_ps(vResult,vTemp3); - // - V2wzwy * V3zwyz - XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); - vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); - vTemp2 = _mm_mul_ps(vTemp2,vTemp3); - vResult = _mm_sub_ps(vResult,vTemp2); - // term1 * V1yxxx - XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); - vResult = _mm_mul_ps(vResult,vTemp1); - - // V2ywxz * V3wxwx - vTemp2 = 
XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); - vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); - vTemp3 = _mm_mul_ps(vTemp3,vTemp2); - // - V2wxwx * V3ywxz - vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); - vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); - vTemp2 = _mm_mul_ps(vTemp2,vTemp1); - vTemp3 = _mm_sub_ps(vTemp3,vTemp2); - // vResult - temp * V1zzyy - vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); - vTemp1 = _mm_mul_ps(vTemp1,vTemp3); - vResult = _mm_sub_ps(vResult,vTemp1); - - // V2yzxy * V3zxyx - vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); - vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); - vTemp3 = _mm_mul_ps(vTemp3,vTemp2); - // - V2zxyx * V3yzxy - vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); - vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); - vTemp1 = _mm_mul_ps(vTemp1,vTemp2); - vTemp3 = _mm_sub_ps(vTemp3,vTemp1); - // vResult + term * V1wwwz - vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); - vTemp3 = _mm_mul_ps(vTemp3,vTemp1); - vResult = _mm_add_ps(vResult,vTemp3); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4LengthSq -( - FXMVECTOR V -) -{ - return XMVector4Dot(V, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorReciprocalSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - return vcombine_f32(v2, v2); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_rsqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? 
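-    // Worked example of the ladder above (illustrative, not from the original
-    // source): for V = (1,2,3,4) the squared lanes are (1,4,9,16); after the
-    // add below, lane z holds x+z+y+w = 30 = dot(V,V), which the splat copies
-    // to all lanes before the ~12-bit _mm_rsqrt_ps estimate inverts the length.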
- vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the reciprocal - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorReciprocalSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - // Reciprocal sqrt - float32x2_t S0 = vrsqrte_f32(v1); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the reciprocal - vLengthSq = _mm_sqrt_ps(vLengthSq); - // Accurate! 
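-    // Full precision: a real sqrtps followed by a true divide, unlike the Est
-    // variant above, which stops at the ~12-bit _mm_rsqrt_ps approximation.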
- vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4LengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt (estimate) - float32x2_t Result = vrsqrte_f32( v1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Length -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); 
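-    // The shuffle/add ladder below mirrors the other 4D length routines: two
-    // shuffles and two adds leave dot(V,V) in lane z, which is then splatted
-    // and square-rooted.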
- // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ -// XMVector4NormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. - -inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector4ReciprocalLength(V); - Result = XMVectorMultiply(V, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - // Normalize - return vmulq_f32( V, vcombine_f32(v2,v2) ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V, V); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_rsqrt_ps(vDot); - vDot = _mm_mul_ps(vDot, V); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? 
- vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the reciprocal - XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); - // Reciprocal mul to perform the normalization - vResult = _mm_mul_ps(vResult,V); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Normalize -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fLength; - XMVECTOR vResult; - - vResult = XMVector4Length( V ); - fLength = vResult.vector4_f32[0]; - - // Prevent divide by zero - if (fLength > 0) { - fLength = 1.0f/fLength; - } - - vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; - vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; - vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; - vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); - uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); - // Reciprocal sqrt (2 iterations of Newton-Raphson) - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - v2 = vmul_f32( S1, R1 ); - // Normalize - XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); - vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); - return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE3_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or 
result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ClampLength -( - FXMVECTOR V, - float LengthMin, - float LengthMax -) -{ - XMVECTOR ClampMax = XMVectorReplicate(LengthMax); - XMVECTOR ClampMin = XMVectorReplicate(LengthMin); - - return XMVector4ClampLengthV(V, ClampMin, ClampMax); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV -( - FXMVECTOR V, - FXMVECTOR LengthMin, - FXMVECTOR LengthMax -) -{ - assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); - assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); - assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); - assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); - assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); - - XMVECTOR LengthSq = XMVector4LengthSq(V); - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); - - XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); - XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); - - XMVECTOR Normal = XMVectorMultiply(V, RcpLength); - - XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); - - XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); - Length = XMVectorSelect(LengthSq, Length, Select); - Normal = XMVectorSelect(LengthSq, Normal, Select); - - XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); - XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); - - XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); - ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); - - XMVECTOR Result = XMVectorMultiply(Normal, 
ClampLength); - - // Preserve the original vector (with no precision loss) if the length falls within the given range - XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); - Result = XMVectorSelect(Result, V, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Reflect -( - FXMVECTOR Incident, - FXMVECTOR Normal -) -{ - // Result = Incident - (2 * dot(Incident, Normal)) * Normal - - XMVECTOR Result = XMVector4Dot(Incident, Normal); - Result = XMVectorAdd(Result, Result); - Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Refract -( - FXMVECTOR Incident, - FXMVECTOR Normal, - float RefractionIndex -) -{ - XMVECTOR Index = XMVectorReplicate(RefractionIndex); - return XMVector4RefractV(Incident, Normal, Index); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4RefractV -( - FXMVECTOR Incident, - FXMVECTOR Normal, - FXMVECTOR RefractionIndex -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR IDotN; - XMVECTOR R; - const XMVECTOR Zero = XMVectorZero(); - - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - - IDotN = XMVector4Dot(Incident, Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); - R = XMVectorMultiply(R, RefractionIndex); - R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); - - if (XMVector4LessOrEqual(R, Zero)) - { - // Total internal reflection - return Zero; - } - else - { - XMVECTOR Result; - - // R = RefractionIndex * IDotN + sqrt(R) - R = XMVectorSqrt(R); - R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); - - // Result = RefractionIndex * Incident - Normal * R - Result = XMVectorMultiply(RefractionIndex, Incident); - Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); - - return Result; - } - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR IDotN = XMVector4Dot(Incident,Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); - R = vmulq_f32(R, RefractionIndex); - R = vmlsq_f32(g_XMOne, R, RefractionIndex ); - - uint32x4_t vResult = vcleq_f32(R,g_XMZero); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) - { - // Total internal reflection - vResult = g_XMZero; - } - else - { - // Sqrt(R) - float32x4_t S0 = vrsqrteq_f32(R); - float32x4_t P0 = vmulq_f32( R, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - float32x4_t S1 = vmulq_f32( S0, R0 ); - float32x4_t P1 = vmulq_f32( R, S1 ); - float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); - float32x4_t S2 = vmulq_f32( S1, R1 ); - R = vmulq_f32( R, S2 ); - // R = RefractionIndex * IDotN + sqrt(R) - R = vmlaq_f32( R, RefractionIndex, IDotN ); - // Result = RefractionIndex * Incident - Normal * R - vResult = vmulq_f32(RefractionIndex, Incident); - vResult = vmlsq_f32( vResult, R, Normal ); - } - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR IDotN = XMVector4Dot(Incident,Normal); - - // R = 1.0f - 
RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - XMVECTOR R = _mm_mul_ps(IDotN,IDotN); - R = _mm_sub_ps(g_XMOne,R); - R = _mm_mul_ps(R, RefractionIndex); - R = _mm_mul_ps(R, RefractionIndex); - R = _mm_sub_ps(g_XMOne,R); - - XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); - if (_mm_movemask_ps(vResult)==0x0f) - { - // Total internal reflection - vResult = g_XMZero; - } - else - { - // R = RefractionIndex * IDotN + sqrt(R) - R = _mm_sqrt_ps(R); - vResult = _mm_mul_ps(RefractionIndex, IDotN); - R = _mm_add_ps(R,vResult); - // Result = RefractionIndex * Incident - Normal * R - vResult = _mm_mul_ps(RefractionIndex, Incident); - R = _mm_mul_ps(R,Normal); - vResult = _mm_sub_ps(vResult,R); - } - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Orthogonal -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = V.vector4_f32[2]; - Result.vector4_f32[1] = V.vector4_f32[3]; - Result.vector4_f32[2] = -V.vector4_f32[0]; - Result.vector4_f32[3] = -V.vector4_f32[1]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; - - float32x4_t Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); - return vmulq_f32( Result, Negate ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); - vResult = _mm_mul_ps(vResult,FlipZW); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector4Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); - Result = XMVectorACosEst(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector4Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); - Result = XMVectorACos(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - XMVECTOR L1 = XMVector4ReciprocalLength(V1); - XMVECTOR L2 = XMVector4ReciprocalLength(V2); - - XMVECTOR Dot = XMVector4Dot(V1, V2); - - L1 = XMVectorMultiply(L1, L2); - - XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); - CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); - - return XMVectorACos(CosAngle); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Transform -( - FXMVECTOR V, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); - float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); - float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); - float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); - XMVECTOR vResult; - 
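-    // Row-vector convention: Result = x*M.r[0] + y*M.r[1] + z*M.r[2] + w*M.r[3],
-    // i.e. V is treated as a row vector multiplied on the left of M.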
-    vResult.vector4_f32[0] = fX;
-    vResult.vector4_f32[1] = fY;
-    vResult.vector4_f32[2] = fZ;
-    vResult.vector4_f32[3] = fW;
-    return vResult;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X
-    vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
-    float32x2_t VH = vget_high_f32( V );
-    vResult = vmlaq_lane_f32( vResult, M.r[2], VH, 0 ); // Z
-    return vmlaq_lane_f32( vResult, M.r[3], VH, 1 );    // W
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat x,y,z and w
-    XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-    XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-    // Mul by the matrix
-    vTempX = _mm_mul_ps(vTempX,M.r[0]);
-    vTempY = _mm_mul_ps(vTempY,M.r[1]);
-    vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
-    vTempW = _mm_mul_ps(vTempW,M.r[3]);
-    // Add them all together
-    vTempX = _mm_add_ps(vTempX,vTempY);
-    vTempZ = _mm_add_ps(vTempZ,vTempW);
-    vTempX = _mm_add_ps(vTempX,vTempZ);
-    return vTempX;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
-(
-    XMFLOAT4* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT4* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
-        XMVECTOR W = XMVectorSplatW(V);
-        XMVECTOR Z = XMVectorSplatZ(V);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(W, row3);
-        Result = XMVectorMultiplyAdd(Z, row2, Result);
-        Result = XMVectorMultiplyAdd(Y, row1, Result);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x4_t V = vld4q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT4)*4;
-
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
-
-                __prefetch( pInputVector );
-
-                r = vget_high_f32( row0 );
-                XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
-                XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx
-
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-
-                r = vget_high_f32( row1 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                r = vget_low_f32( row2 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
-
-                r = vget_high_f32( row2 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
-
-                r = vget_low_f32( row3 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) );
-
-                r = vget_high_f32( row3 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) );
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-                V.val[2] = vResult2;
-                V.val[3] = vResult3;
-
-                vst4q_f32( reinterpret_cast<float*>(pOutputVector), V );
-                pOutputVector += sizeof(XMFLOAT4)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        XMVECTOR V = vld1q_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        float32x2_t VL = vget_low_f32( V );
-        XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y
-        float32x2_t VH = vget_high_f32( V );
-        vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
-        vResult = vmlaq_lane_f32( vResult, row3, VH, 1 ); // W
-
-        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
-    {
-        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
-        {
-            // Aligned input, aligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-        else
-        {
-            // Unaligned input, aligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
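-                // XM_STREAM_PS expands to the non-temporal _mm_stream_ps (movntps),
-                // which requires 16-byte-aligned stores; both the output base and
-                // stride were checked above, while the input still needs movups here.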
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-    else
-    {
-        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
-        {
-            // Aligned input, unaligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-        else
-        {
-            // Unaligned input, unaligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * XMVECTOR operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V)
-{
-    return V;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V)
-{
-    return XMVectorNegate(V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator+=
-(
-    XMVECTOR& V1,
-    FXMVECTOR V2
-)
-{
-    V1 = XMVectorAdd(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator-=
-(
-    XMVECTOR& V1,
-    FXMVECTOR V2
-)
-{
-    V1 = XMVectorSubtract(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator*=
-(
-    XMVECTOR& V1,
-    FXMVECTOR V2
-)
-{
-    V1 = XMVectorMultiply(V1, V2);
-    return V1;
-}
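-//------------------------------------------------------------------------------
-// Illustrative sketch (not from the original source): with the overloads in
-// this section, component-wise vector algebra reads naturally, e.g.
-//
-//   XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
-//   a += XMVectorReplicate(1.0f);   // XMVectorAdd      -> (2, 3, 4, 5)
-//   XMVECTOR b = a * 0.5f;          // XMVectorScale    -> (1, 1.5, 2, 2.5)
-//   b /= XMVectorReplicate(2.0f);   // XMVectorDivide   -> (0.5, 0.75, 1, 1.25)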
-//------------------------------------------------------------------------------ - -inline XMVECTOR& XM_CALLCONV operator/= -( - XMVECTOR& V1, - FXMVECTOR V2 -) -{ - V1 = XMVectorDivide(V1,V2); - return V1; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR& operator*= -( - XMVECTOR& V, - const float S -) -{ - V = XMVectorScale(V, S); - return V; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR& operator/= -( - XMVECTOR& V, - const float S -) -{ - XMVECTOR vS = XMVectorReplicate( S ); - V = XMVectorDivide(V, vS); - return V; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator+ -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorAdd(V1, V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator- -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorSubtract(V1, V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator* -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorMultiply(V1, V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator/ -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorDivide(V1,V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator* -( - FXMVECTOR V, - const float S -) -{ - return XMVectorScale(V, S); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator/ -( - FXMVECTOR V, - const float S -) -{ - XMVECTOR vS = XMVectorReplicate( S ); - return XMVectorDivide(V, vS); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator* -( - float S, - FXMVECTOR V -) -{ - return XMVectorScale(V, S); -} - -#if defined(_XM_NO_INTRINSICS_) -#undef XMISNAN -#undef XMISINF -#endif - -#if defined(_XM_SSE_INTRINSICS_) -#undef XM3UNPACK3INTO4 -#undef XM3PACK4INTO3 -#endif +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#if defined(_XM_NO_INTRINSICS_)
+#define XMISNAN(x)  ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0)
+#define XMISINF(x)  ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_)
+
+#define XM3UNPACK3INTO4(l1,l2,l3) \
+    XMVECTOR V3 = _mm_shuffle_ps(l2,l3,_MM_SHUFFLE(0,0,3,2));\
+    XMVECTOR V2 = _mm_shuffle_ps(l2,l1,_MM_SHUFFLE(3,3,1,0));\
+    V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,1,0,2));\
+    XMVECTOR V4 = _mm_castsi128_ps( _mm_srli_si128(_mm_castps_si128(l3),32/8) );
+
+#define XM3PACK4INTO3(v2x) \
+    v2x = _mm_shuffle_ps(V2,V3,_MM_SHUFFLE(1,0,2,1));\
+    V2 = _mm_shuffle_ps(V2,V1,_MM_SHUFFLE(2,2,0,0));\
+    V1 = _mm_shuffle_ps(V1,V2,_MM_SHUFFLE(0,2,1,0));\
+    V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(0,0,2,2));\
+    V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(2,1,2,0));\
+
+#endif
+
+/****************************************************************************
+ *
+ * General Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Assignment operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Return a vector with all elements equaling zero
+inline XMVECTOR XM_CALLCONV XMVectorZero()
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32(0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four floating point values
+inline XMVECTOR XM_CALLCONV XMVectorSet
+(
+    float x,
+    float y,
+    float z,
+    float w
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {x,y,z,w};
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
+    float32x2_t V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
+    return vcombine_f32(V0, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps( w, z, y, x );
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four integer values
+inline XMVECTOR XM_CALLCONV XMVectorSetInt
+(
+    uint32_t x,
+    uint32_t y,
+    uint32_t z,
+    uint32_t w
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = {x,y,z,w};
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32));
+    uint32x2_t V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32));
+    return vcombine_u32(V0, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set_epi32( w, z, y, x );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value
+inline XMVECTOR XM_CALLCONV XMVectorReplicate
+(
+    float Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = Value;
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32( Value );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps1( Value );
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr
+(
+    const float *pValue
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float Value = pValue[0];
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = Value;
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_dup_f32( pValue );
+#elif defined(_XM_AVX_INTRINSICS_)
+    return _mm_broadcast_ss( pValue );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1( pValue );
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value
+inline XMVECTOR XM_CALLCONV XMVectorReplicateInt
+(
+    uint32_t Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+    vResult.u[1] =
+    vResult.u[2] =
+    vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_u32( Value );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_set1_epi32( Value );
+    return _mm_castsi128_ps(vTemp);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr
+(
+    const uint32_t *pValue
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t Value = pValue[0];
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+    vResult.u[1] =
+    vResult.u[2] =
+    vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_dup_u32(pValue);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1(reinterpret_cast<const float*>(pValue));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits set (true mask)
+inline XMVECTOR XM_CALLCONV XMVectorTrueInt()
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU};
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_s32(-1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set1_epi32(-1);
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits clear (false mask)
+inline XMVECTOR XM_CALLCONV XMVectorFalseInt()
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_u32(0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Replicate the x component of the vector
+inline XMVECTOR XM_CALLCONV XMVectorSplatX
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = V.vector4_f32[0];
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_lane_f32( vget_low_f32( V ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
+#endif
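+    // (Editorial note, not in the original source: XM_PERMUTE_PS expands to
+    // _mm_permute_ps when _XM_AVX_INTRINSICS_ is defined and to a
+    // self-referential _mm_shuffle_ps otherwise, so each splat compiles to a
+    // single shuffle on either path.)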
+} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[1]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_low_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[2]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = 1.0f; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7F800000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7F800000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7FC00000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7FC00000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR 
vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x34000000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x34000000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x80000000U; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x80000000U); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32( 0x80000000 ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. +inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[i]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return V.n128_f32[i]; +#elif defined(_XM_SSE_INTRINSICS_) + return V.m128_f32[i]; +#endif +} + +//------------------------------------------------------------------------------ +// Return the X component in an FPU register. +inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cvtss_f32(V); +#endif +} + +// Return the Y component in an FPU register. +inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the Z component in an FPU register. +inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the W component in an FPU register. +inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + return _mm_cvtss_f32(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit float location in memory. 
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
+{
+    assert( f != nullptr );
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    *f = V.vector4_f32[i];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    *f = V.n128_f32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+    *f = V.m128_f32[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetXPtr(float *x, FXMVECTOR V)
+{
+    assert( x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(x,V,0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(x,V);
+#endif
+}
+
+// Store the Y component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(y,V,1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *((int*)y) = _mm_extract_ps( V, 1 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(y,vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(z,V,2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *((int*)z) = _mm_extract_ps( V, 2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(z,vResult);
+#endif
+}
+
+// Store the W component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V)
+{
+    assert( w != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(w,V,3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *((int*)w) = _mm_extract_ps( V, 3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+    _mm_store_ss(w,vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return an integer value via an index. This is not a recommended
+// function to use due to performance loss.
+inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
+{
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[i];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return V.n128_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return V.m128_u32[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return the X component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
+#endif
+}
+
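+// [Editorial note] The integer accessors return the raw bit pattern of a lane,
+// which is mainly useful for inspecting the all-ones/all-zeros masks produced
+// by the comparison functions. Sketch (illustrative only, not part of the patch):
+//
+//     XMVECTOR mask = XMVectorEqualInt(XMVectorTrueInt(), XMVectorTrueInt());
+//     uint32_t bits = XMVectorGetIntX(mask);               // 0xFFFFFFFF
+//     uint32_t one  = XMVectorGetIntX(XMVectorSplatOne()); // 0x3F800000 (1.0f)
+
+// Return the Y component in an integer register.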
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the Z component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the W component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
+{
+    assert( x != nullptr );
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    *x = V.n128_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+    *x = V.m128_u32[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
+{
+    assert( x != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(x),V);
+#endif
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(reinterpret_cast<float*>(y),vResult);
+#endif
+}
+
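+// [Editorial note] The *Ptr variants store the lane straight to memory rather
+// than returning it through a register, e.g. (illustrative only):
+//
+//     uint32_t yBits;
+//     XMVectorGetIntYPtr(&yBits, XMVectorSplatOne());  // yBits == 0x3F800000
+
+// Store the Z component into a 32 bit integer location in memory.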
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(reinterpret_cast<float*>(z),vResult);
+#endif
+}
+
+// Store the W component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
+{
+    assert( w != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+    _mm_store_ss(reinterpret_cast<float*>(w),vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Set a single indexed floating point component
+inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
+{
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U = V;
+    U.vector4_f32[i] = f;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTOR U = V;
+    U.n128_f32[i] = f;
+    return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR U = V;
+    U.m128_f32[i] = f;
+    return U;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_f32[0] = x;
+    U.vector4_f32[1] = V.vector4_f32[1];
+    U.vector4_f32[2] = V.vector4_f32[2];
+    U.vector4_f32[3] = V.vector4_f32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(x,V,0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(x);
+    vResult = _mm_move_ss(V,vResult);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_f32[0] = V.vector4_f32[0];
+    U.vector4_f32[1] = y;
+    U.vector4_f32[2] = V.vector4_f32[2];
+    U.vector4_f32[3] = V.vector4_f32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(y,V,1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(y);
+    vResult = _mm_insert_ps( V, vResult, 0x10 );
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_set_ss(y);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_f32[0] = V.vector4_f32[0];
+    U.vector4_f32[1] = V.vector4_f32[1];
+    U.vector4_f32[2] = z;
+    U.vector4_f32[3] =
V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(z,V,2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps( V, vResult, 0x20 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w,V,3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps( V, vResult, 0x30 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i) +{ + assert( f != nullptr ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_f32[i] = *f; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR U = V; + U.n128_f32[i] = *f; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR U = V; + U.m128_f32[i] = *f; + return U; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float *x) +{ + assert( x != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = *x; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float *y) +{ + assert( y != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = *y; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = 
_mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float *z) +{ + assert( z != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = *z; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float *w) +{ + assert( w != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = *w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_u32[i] = x; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = x; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cvtsi32_si128(x); + XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp)); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = y; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(y,V,1); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = 
_mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
+    return _mm_castsi128_ps( vResult );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(y);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = z;
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_u32(z,V,2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
+    return _mm_castsi128_ps( vResult );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(z);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = w;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_u32(w,V,3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
+    return _mm_castsi128_ps( vResult );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(w);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i)
+{
+    assert( x != nullptr );
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U = V;
+    U.vector4_u32[i] = *x;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x)
+{
+    assert( x != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = *x;
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(x));
+    XMVECTOR vResult = _mm_move_ss(V,vTemp);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = *y;
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(y));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = *z;
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w)
+{
+    assert( w != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = *w;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle
+(
+    FXMVECTOR V,
+    uint32_t E0,
+    uint32_t E1,
+    uint32_t E2,
+    uint32_t E3
+)
+{
+    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result = { V.vector4_f32[E0],
+                        V.vector4_f32[E1],
+                        V.vector4_f32[E2],
+                        V.vector4_f32[E3] };
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const uint32_t ControlElement[ 4 ] =
+    {
+        0x03020100, // XM_SWIZZLE_X
+        0x07060504, // XM_SWIZZLE_Y
+        0x0B0A0908, // XM_SWIZZLE_Z
+        0x0F0E0D0C, // XM_SWIZZLE_W
+    };
+
+    int8x8x2_t tbl;
+    tbl.val[0] = vget_low_f32(V);
+    tbl.val[1] = vget_high_f32(V);
+
+    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) );
+    const uint8x8_t rL = vtbl2_u8( tbl, idx );
+
+    idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) );
+    const uint8x8_t rH = vtbl2_u8( tbl, idx );
+
+    return vcombine_f32( rL, rH );
+#elif defined(_XM_AVX_INTRINSICS_)
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+    return _mm_permutevar_ps( V, vControl );
+#else
+    const uint32_t *aPtr = (const uint32_t* )(&V);
+
+    XMVECTOR Result;
+    uint32_t *pWork = (uint32_t*)(&Result);
+
+    pWork[0] = aPtr[E0];
+    pWork[1] = aPtr[E1];
+    pWork[2] = aPtr[E2];
+    pWork[3] = aPtr[E3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV XMVectorPermute
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    uint32_t PermuteX,
+    uint32_t PermuteY,
+    uint32_t PermuteZ,
+    uint32_t PermuteW
+)
+{
+    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const uint32_t ControlElement[ 8 ] =
+    {
+        0x03020100, // XM_PERMUTE_0X
+        0x07060504, // XM_PERMUTE_0Y
+        0x0B0A0908, // XM_PERMUTE_0Z
+        0x0F0E0D0C, // XM_PERMUTE_0W
+        0x13121110, // XM_PERMUTE_1X
+        0x17161514, // XM_PERMUTE_1Y
+        0x1B1A1918, // XM_PERMUTE_1Z
+        0x1F1E1D1C, // XM_PERMUTE_1W
+    };
+
+    int8x8x4_t tbl;
+    tbl.val[0] = vget_low_f32(V1);
+    tbl.val[1] = vget_high_f32(V1);
+    tbl.val[2] = vget_low_f32(V2);
+    tbl.val[3] = vget_high_f32(V2);
+
+    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) );
+    const uint8x8_t rL = vtbl4_u8( tbl, idx );
+
+    idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) );
+    const uint8x8_t rH = vtbl4_u8( tbl, idx );
+
+    return vcombine_f32( rL, rH );
+#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORU32 three = { 3, 3, 3, 3 };
+
+    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+
+    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
+    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
+
+    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
+    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
+
+    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
+    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
+
+    return _mm_or_ps( masked1, masked2 );
+#else
+
+    const uint32_t *aPtr[2];
+    aPtr[0] = (const uint32_t* )(&V1);
+    aPtr[1] = (const uint32_t* )(&V2);
+
+    XMVECTOR Result;
+    uint32_t *pWork = (uint32_t*)(&Result);
+
+    const uint32_t i0 = PermuteX & 3;
+    const uint32_t vi0 = PermuteX >> 2;
+    pWork[0] = aPtr[vi0][i0];
+
+    const uint32_t i1 = PermuteY & 3;
+    const uint32_t vi1 = PermuteY >> 2;
+    pWork[1] = aPtr[vi1][i1];
+
+    const uint32_t i2 = PermuteZ & 3;
+    const uint32_t vi2 = PermuteZ >> 2;
+    pWork[2] = aPtr[vi2][i2];
+
+    const uint32_t i3 = PermuteW & 3;
+    const uint32_t vi3 = PermuteW >> 2;
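+    // (Editorial note: as with the three elements above, bit 2 of the selector
+    // picks the source vector -- V1 or V2 -- and bits 0-1 pick the lane, so
+    // XM_PERMUTE_1Z == 6 reads lane 2 of V2.)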
+ pWork[3] = aPtr[vi3][i3]; + + return Result; +#endif +} + +//------------------------------------------------------------------------------ +// Define a control vector to be used in XMVectorSelect +// operations. The four integers specified in XMVectorSelectControl +// serve as indices to select between components in two vectors. +// The first index controls selection for the first component of +// the vectors involved in a select operation, the second index +// controls selection for the second component etc. A value of +// zero for an index causes the corresponding component from the first +// vector to be selected whereas a one causes the component from the +// second vector to be selected instead. + +inline XMVECTOR XM_CALLCONV XMVectorSelectControl +( + uint32_t VectorIndex0, + uint32_t VectorIndex1, + uint32_t VectorIndex2, + uint32_t VectorIndex3 +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // x=Index0,y=Index1,z=Index2,w=Index3 + __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); + // Any non-zero entries become 0xFFFFFFFF else 0 + vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); + return _mm_castsi128_ps(vTemp); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int32x2_t V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); + int32x2_t V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); + int32x4_t vTemp = vcombine_s32(V0, V1); + // Any non-zero entries become 0xFFFFFFFF else 0 + return vcgtq_s32(vTemp,g_XMZero); +#else + XMVECTOR ControlVector; + const uint32_t ControlElement[] = + { + XM_SELECT_0, + XM_SELECT_1 + }; + + assert(VectorIndex0 < 2); + assert(VectorIndex1 < 2); + assert(VectorIndex2 < 2); + assert(VectorIndex3 < 2); + _Analysis_assume_(VectorIndex0 < 2); + _Analysis_assume_(VectorIndex1 < 2); + _Analysis_assume_(VectorIndex2 < 2); + _Analysis_assume_(VectorIndex3 < 2); + + ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; + ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; + ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; + ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; + + return ControlVector; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSelect +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Control +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); + Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); + Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); + Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vbslq_f32( Control, V2, V1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); + XMVECTOR vTemp2 = _mm_and_ps(V2,Control); + return _mm_or_ps(vTemp1,vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMergeXY +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0]; + Result.vector4_u32[1] = 
V2.vector4_u32[0]; + Result.vector4_u32[2] = V1.vector4_u32[1]; + Result.vector4_u32[3] = V2.vector4_u32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[0]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpacklo_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMergeZW +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[2]; + Result.vector4_u32[1] = V2.vector4_u32[2]; + Result.vector4_u32[2] = V1.vector4_u32[3]; + Result.vector4_u32[3] = V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[1]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); +} + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0;
+    return Control;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vceqq_f32( V1, V2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpeq_ps( V1, V2 );
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorEqualR
+(
+    uint32_t*    pCR,
+    FXMVECTOR    V1,
+    FXMVECTOR    V2
+)
+{
+    assert( pCR != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+    uint32_t CR = 0;
+    if (ux&uy&uz&uw)
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!(ux|uy|uz|uw))
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+
+    XMVECTOR Control;
+    Control.vector4_u32[0] = ux;
+    Control.vector4_u32[1] = uy;
+    Control.vector4_u32[2] = uz;
+    Control.vector4_u32[3] = uw;
+    return Control;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFFFU )
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+    uint32_t CR = 0;
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest==0xf)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Treat the components of the vectors as unsigned integers and
+// compare individual bits between the two.  This is useful for
+// comparing control vectors and result vectors returned from
+// other comparison operations.
+
+inline XMVECTOR XM_CALLCONV XMVectorEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Control;
+    Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0;
+    Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0;
+    Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0;
+    Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ?
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vceqq_u32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualIntR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); + uint32_t CR = 0; + if (iTemp==0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; + float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; + float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; + float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + XMVECTOR Control; + Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vDelta = vsubq_f32(V1,V2); + return vacleq_f32( vDelta, Epsilon ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 
0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmvnq_u32(vceqq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmvnq_u32(vceqq_u32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + return _mm_xor_ps(_mm_castsi128_ps(V),g_XMNegOneMask); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcgtq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcgeq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcltq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcleq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ?
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + return vTemp1; +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorInBoundsR +( + uint32_t* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + + uint32_t CR = 0; + if (_mm_movemask_ps(vTemp1)==0xf) { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. 
NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + // Flip results + return vmvnq_u32( vTempNan ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + return _mm_cmpneq_ps(V,V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTemp = vandq_u32(V,g_XMAbsMask); + // Compare to infinity + vTemp = vceqq_f32(vTemp,g_XMInfinity); + // Lanes that are infinity are now all-ones (true) + return vTemp; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // Lanes that are infinity are now all-ones (true) + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Rounding and clamping operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMin +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; + Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; + Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; + Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vminq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_min_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMax +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; + Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; + Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; + Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ?
V1.vector4_f32[3] : V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmaxq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_max_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _XM_NO_ROUNDF_ + +namespace Internal +{ + inline float round_to_nearest( float x ) + { + float i = floorf(x); + x -= i; + if(x < 0.5f) + return i; + if(x > 0.5f) + return i + 1.f; + + float int_part; + modff( i / 2.f, &int_part ); + if ( (2.f*int_part) == i ) + { + return i; + } + + return i + 1.f; + } +}; + +#endif + +#if !defined(_XM_NO_INTRINSICS_) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorRound +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + +#ifdef _XM_NO_ROUNDF_ + XMVECTOR Result; + Result.vector4_f32[0] = Internal::round_to_nearest( V.vector4_f32[0] ); + Result.vector4_f32[1] = Internal::round_to_nearest( V.vector4_f32[1] ); + Result.vector4_f32[2] = Internal::round_to_nearest( V.vector4_f32[2] ); + Result.vector4_f32[3] = Internal::round_to_nearest( V.vector4_f32[3] ); + return Result; +#else + XMVECTOR Result; + Result.vector4_f32[0] = roundf( V.vector4_f32[0] ); + Result.vector4_f32[1] = roundf( V.vector4_f32[1] ); + Result.vector4_f32[2] = roundf( V.vector4_f32[2] ); + Result.vector4_f32[3] = roundf( V.vector4_f32[3] ); + return Result; +#endif + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t sign = vandq_u32( V, g_XMNegativeZero ); + uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign ); + float32x4_t R1 = vaddq_f32( V, sMagic ); + R1 = vsubq_f32( R1, sMagic ); + float32x4_t R2 = vabsq_f32( V ); + uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction ); + XMVECTOR vResult = vbslq_f32( mask, R1, V ); + return vResult; +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 sign = _mm_and_ps( V, g_XMNegativeZero ); + __m128 sMagic = _mm_or_ps( g_XMNoFraction, sign ); + __m128 R1 = _mm_add_ps( V, sMagic ); + R1 = _mm_sub_ps( R1, sMagic ); + __m128 R2 = _mm_and_ps( V, g_XMAbsMask ); + __m128 mask = _mm_cmple_ps( R2, g_XMNoFraction ); + R2 = _mm_andnot_ps(mask,V); + R1 = _mm_and_ps(R1,mask); + XMVECTOR vResult = _mm_xor_ps(R1, R2); + return vResult; +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTruncate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + uint32_t i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (fabsf(V.vector4_f32[i]) < 8388608.0f) + { + Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]); + } + else + { + Result.vector4_f32[i] = V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, 
INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + // Keep lanes with abs value less than 8388608; lanes holding NAN, INF + // or values too large to have a fraction fail the test + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorFloor +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = floorf( V.vector4_f32[0] ); + Result.vector4_f32[1] = floorf( V.vector4_f32[1] ); + Result.vector4_f32[2] = floorf( V.vector4_f32[2] ); + Result.vector4_f32[3] = floorf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + // Truncate + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + XMVECTOR vLarger = vcgtq_f32( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = vcvtq_f32_s32( vLarger ); + vResult = vaddq_f32( vResult, vLarger ); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_floor_ps( V ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vLarger = _mm_cmpgt_ps( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = _mm_cvtepi32_ps( _mm_castps_si128( vLarger ) ); + vResult = _mm_add_ps( vResult, vLarger ); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCeiling +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = ceilf( V.vector4_f32[0] ); + Result.vector4_f32[1] = ceilf( V.vector4_f32[1] ); + Result.vector4_f32[2] = ceilf( V.vector4_f32[2] ); + Result.vector4_f32[3] = ceilf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + // Truncate + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + XMVECTOR vSmaller = vcltq_f32( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = vcvtq_f32_s32( vSmaller ); + vResult = vsubq_f32( vResult, vSmaller ); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#elif
defined(_XM_SSE4_INTRINSICS_) + return _mm_ceil_ps( V ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vSmaller = _mm_cmplt_ps( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = _mm_cvtepi32_ps( _mm_castps_si128( vSmaller ) ); + vResult = _mm_sub_ps( vResult, vSmaller ); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorClamp +( + FXMVECTOR V, + FXMVECTOR Min, + FXMVECTOR Max +) +{ + assert(XMVector4LessOrEqual(Min, Max)); + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVectorMax(Min, V); + Result = XMVectorMin(Max, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult; + vResult = vmaxq_f32(Min,V); + vResult = vminq_f32(vResult,Max); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult; + vResult = _mm_max_ps(Min,V); + vResult = _mm_min_ps(vResult,Max); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSaturate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + return XMVectorClamp(V, Zero, g_XMOne.v); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = vmaxq_f32(V, vdupq_n_f32(0) ); + // Set>1 to 1 + return vminq_f32(vResult, vdupq_n_f32(1.0f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Set>1 to 1 + return _mm_min_ps(vResult,g_XMOne); +#endif +} + +//------------------------------------------------------------------------------ +// Bitwise logical operations +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vandq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_and_ps(V1,V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndCInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vbicq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_andnot_si128( 
_mm_castps_si128(V2), _mm_castps_si128(V1) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorOrInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vorrq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]); + Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]); + Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]); + Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t Result = vorrq_u32(V1,V2); + return vbicq_u32(g_XMNegOneMask, Result); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i Result; + Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); + Result = _mm_andnot_si128( Result,g_XMNegOneMask); + return _mm_castsi128_ps(Result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorXorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return veorq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = -V.vector4_f32[0]; + Result.vector4_f32[1] = -V.vector4_f32[1]; + Result.vector4_f32[2] = -V.vector4_f32[2]; + Result.vector4_f32[3] = -V.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vnegq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Z; + + Z = _mm_setzero_ps(); + + return _mm_sub_ps( Z, V ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAdd +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1]; + Result.vector4_f32[2] = 
V1.vector4_f32[2] + V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vaddq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_add_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSum +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1 = vget_low_f32(V); + float32x2_t v2 = vget_high_f32(V); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_hadd_ps(V, V); + return _mm_hadd_ps(vTemp,vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); + XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); + vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(vTemp, vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAddAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Add the given angles together. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorAdd(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vaddq_f32(V1,V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult,vOffset); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult,g_XMPi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult,vOffset); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_add_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? 
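+ // (A rough sketch of the branchless wrap used here: the compare below
+ // yields an all-ones mask in each lane where vResult >= Pi, so ANDing
+ // the mask with 2*Pi leaves 2*Pi in exactly those lanes and 0 elsewhere;
+ // the final subtract then wraps only the lanes that need it, e.g. a
+ // lane holding 3.5f becomes 3.5f - 2*Pi ~= -2.78f.)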
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtract +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsubq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sub_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Subtract the given angles. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorSubtract(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vsubq_f32(V1,V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult,vOffset); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult,g_XMPi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult,vOffset); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? 
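+ // (Same lane-masked wrap as in XMVectorAddAngles above: the compare
+ // mask ANDed with 2*Pi selects which lanes have 2*Pi subtracted.)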
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmlaq_f32( V3, V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_mul_ps( V1, V2 ); + return _mm_add_ps(vResult, V3 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(V2); + float32x4_t S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + return vmulq_f32( V1, Reciprocal ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]); + Result.vector4_f32[1] = V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]); + Result.vector4_f32[2] = V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]); + Result.vector4_f32[3] = V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmlsq_f32( V3, V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R = _mm_mul_ps( V1, V2 ); + return _mm_sub_ps( V3, R ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorScale +( + FXMVECTOR V, + float 
ScaleFactor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V.vector4_f32[0] * ScaleFactor; + Result.vector4_f32[1] = V.vector4_f32[1] * ScaleFactor; + Result.vector4_f32[2] = V.vector4_f32[2] * ScaleFactor; + Result.vector4_f32[3] = V.vector4_f32[3] * ScaleFactor; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_n_f32( V, ScaleFactor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult,V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; + Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; + Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; + Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrecpeq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; + Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; + Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; + Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement + float32x4_t Reciprocal = vrecpeq_f32(V); + float32x4_t S = vrecpsq_f32( Reciprocal, V ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V ); + Reciprocal = vmulq_f32( S, Reciprocal ); + return Reciprocal; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne,V); +#endif +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +inline XMVECTOR XM_CALLCONV XMVectorSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 1 iteration of Newton-Raphson refinement of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + XMVECTOR Result = vmulq_f32( V, S1 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 3 iterations of Newton-Raphson refinement of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 =
vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + float32x4_t P2 = vmulq_f32( V, S2 ); + float32x4_t R2 = vrsqrtsq_f32( P2, S2 ); + float32x4_t S3 = vmulq_f32( S2, R2 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + XMVECTOR Result = vmulq_f32( V, S3 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of the reciprocal square root + float32x4_t S0 = vrsqrteq_f32(V); + + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + + return vmulq_f32( S1, R1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp2 +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); + Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); + Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); + Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t itrunc = vcvtq_s32_f32(V); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(V, ftrunc); + + float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); + poly = vmlaq_f32( g_XMExpEst5, poly, y ); + poly = vmlaq_f32( g_XMExpEst4, poly, y ); + poly = vmlaq_f32( g_XMExpEst3, poly, y ); + poly = vmlaq_f32( g_XMExpEst2, poly, y ); + poly = vmlaq_f32( g_XMExpEst1, poly, y ); + poly = vmlaq_f32( g_XMOne, poly, y ); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(biased, poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(biased, poly); + result1 = vmulq_f32(g_XMMinNormal.v,
result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + int32x4_t comp = vcltq_s32( V, g_XMBin128); + float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32( comp, result1, result0 ); + + comp = vcltq_s32(V, g_XMBinNeg150); + float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); + + int32x4_t sign = vandq_s32(V, g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32( comp, result4, result2 ); + + int32x4_t t0 = vandq_s32(V, g_XMQNaNTest); + int32x4_t t1 = vandq_s32(V, g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + __m128 poly = _mm_mul_ps(g_XMExpEst7, y); + poly = _mm_add_ps(g_XMExpEst6, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst5, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst4, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst3, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst2, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst1, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMOne, poly); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + 
select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = expf(V.vector4_f32[0]); + Result.vector4_f32[1] = expf(V.vector4_f32[1]); + Result.vector4_f32[2] = expf(V.vector4_f32[2]); + Result.vector4_f32[3] = expf(V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // expE(V) = exp2(V * log2(e)) + float32x4_t Ve = vmulq_f32(g_XMLgE, V); + + int32x4_t itrunc = vcvtq_s32_f32(Ve); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(Ve, ftrunc); + + float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); + poly = vmlaq_f32( g_XMExpEst5, poly, y ); + poly = vmlaq_f32( g_XMExpEst4, poly, y ); + poly = vmlaq_f32( g_XMExpEst3, poly, y ); + poly = vmlaq_f32( g_XMExpEst2, poly, y ); + poly = vmlaq_f32( g_XMExpEst1, poly, y ); + poly = vmlaq_f32( g_XMOne, poly, y ); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(biased, poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(biased, poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + int32x4_t comp = vcltq_s32( Ve, g_XMBin128); + float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32( comp, result1, result0 ); + + comp = vcltq_s32(Ve, g_XMBinNeg150); + float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); + + int32x4_t sign = vandq_s32(Ve, g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32( comp, result4, result2 ); + + int32x4_t t0 = vandq_s32(Ve, g_XMQNaNTest); + int32x4_t t1 = vandq_s32(Ve, g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // expE(V) = exp2(V * log2(e)) + __m128 Ve = _mm_mul_ps(g_XMLgE, V); + + __m128i itrunc = _mm_cvttps_epi32(Ve); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(Ve, ftrunc); + __m128 poly = _mm_mul_ps(g_XMExpEst7, y); + poly = _mm_add_ps(g_XMExpEst6, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst5, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst4, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst3, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst2, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst1, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMOne, poly); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 =
_mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(Ve), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(Ve), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(Ve), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(Ve), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(Ve), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp +( + FXMVECTOR V +) +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace Internal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); + 
c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) + { + static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; + static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; + static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; + static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace Internal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace Internal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) + { + static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; + static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; + static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; + static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; + + int32x4_t v = value, r, c, b, s; + + c = vcgtq_s32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32( r ); + v = vshlq_u32( v, r ); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_u32(c, 31); // b = (c ? 
1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_u32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace Internal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2 +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); + + XMVECTOR Result; + Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; + Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; + Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; + Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); + int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); + int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_u32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_u32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); + int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); + + // Compute the approximation. 
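+    // The trick below (a sketch of the idea, inferred from the constants
+    // used): OR-ing the trailing bits t with the bit pattern of 1.0f yields
+    // a float m in [1,2), so y = m - 1 lies in [0,1). The g_XMLogEst0..7
+    // constants appear to be minimax coefficients for log2(1+y) on that
+    // interval, evaluated in Horner form, with the unbiased exponent e added
+    // last: log2(V) = e + log2(1+y).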
+ int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(tmp, g_XMOne); + + float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); + log2 = vmlaq_f32( g_XMLogEst5, log2, y ); + log2 = vmlaq_f32( g_XMLogEst4, log2, y ); + log2 = vmlaq_f32( g_XMLogEst3, log2, y ); + log2 = vmlaq_f32( g_XMLogEst2, log2, y ); + log2 = vmlaq_f32( g_XMLogEst1, log2, y ); + log2 = vmlaq_f32( g_XMLogEst0, log2, y ); + log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); + isInfinite = vceqq_s32(isInfinite, g_XMInfinity); + + int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); + int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); + int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); + + int32x4_t isZero = vandq_s32((V), g_XMAbsMask); + isZero = vceqq_s32(isZero, g_XMZero); + + int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32((V), g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); + tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); + result = vbslq_f32(isPositive, result, tmp); + result = vbslq_f32(isNaN, g_XMQNaN, result ); + return result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. 
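+    // Same reduction as the NEON path: y = significand - 1 in [0,1), fed to
+    // the g_XMLogEst polynomial. The and/andnot/or triples used throughout
+    // this path are the standard SSE2 branchless select,
+    // (mask & a) | (~mask & b), since blend instructions only arrive with
+    // SSE4.1.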
+ __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); + log2 = _mm_add_ps(g_XMLogEst6, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst5, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst4, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst3, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst2, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst1, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst0, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = logf(V.vector4_f32[0]); + Result.vector4_f32[1] = logf(V.vector4_f32[1]); + Result.vector4_f32[2] = logf(V.vector4_f32[2]); + Result.vector4_f32[3] = logf(V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); + int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); + int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_u32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
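+    // Subnormals have a zero exponent field and no implicit leading 1, so
+    // they are renormalized by hand: find the highest set bit of the
+    // trailing bits, shift it up to the implicit-one position
+    // (g_XMNumTrailing presumably holds 23), and lower the exponent by the
+    // same amount from the subnormal base (g_XMSubnormalExponent,
+    // presumably -126).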
+    int32x4_t leading = Internal::GetLeadingBit(trailing);
+    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
+    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
+    int32x4_t trailingSub = vshlq_u32(trailing, shift);
+    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
+    int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor );
+    int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor );
+
+    // Compute the approximation.
+    int32x4_t tmp = vorrq_s32(g_XMOne, t);
+    float32x4_t y = vsubq_f32(tmp, g_XMOne);
+
+    float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y );
+    log2 = vmlaq_f32( g_XMLogEst5, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst4, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst3, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst2, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst1, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst0, log2, y );
+    log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y );
+
+    log2 = vmulq_f32(g_XMInvLgE, log2);
+
+    // if (V is NaN) -> QNaN
+    // else if (V is positive)
+    //     if (V is infinite) -> +inf
+    //     else -> logE(V)
+    // else
+    //     if (V is zero) -> -inf
+    //     else -> -QNaN
+
+    int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask);
+    isInfinite = vceqq_s32(isInfinite, g_XMInfinity);
+
+    int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero);
+    int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity);
+    int32x4_t isPositive = vbicq_s32(isGreaterZero, isNotFinite);
+
+    int32x4_t isZero = vandq_s32((V), g_XMAbsMask);
+    isZero = vceqq_s32(isZero, g_XMZero);
+
+    int32x4_t t0 = vandq_s32((V), g_XMQNaNTest);
+    int32x4_t t1 = vandq_s32((V), g_XMInfinity);
+    t0 = vceqq_s32(t0, g_XMZero);
+    t1 = vceqq_s32(t1, g_XMInfinity);
+    int32x4_t isNaN = vbicq_s32(t1, t0);
+
+    float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 );
+    tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN );
+    result = vbslq_f32(isPositive, result, tmp);
+    result = vbslq_f32(isNaN, g_XMQNaN, result );
+    return result;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
+    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
+    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
+
+    // Compute exponent and significand for normals.
+    __m128i biased = _mm_srli_epi32(rawBiased, 23);
+    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
+    __m128i trailingNor = trailing;
+
+    // Compute exponent and significand for subnormals.
+    __m128i leading = Internal::GetLeadingBit(trailing);
+    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
+    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
+    __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
+    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
+
+    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
+    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
+    __m128i e = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isExponentZero, trailingSub);
+    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
+    __m128i t = _mm_or_si128(select0, select1);
+
+    // Compute the approximation.
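+    // This is the same log2 kernel as XMVectorLog2; the extra multiply by
+    // g_XMInvLgE further below converts the result to a natural logarithm,
+    // using ln(V) = log2(V) * ln(2) with 1/log2(e) = ln(2) ~= 0.6931472.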
+    __m128i tmp = _mm_or_si128(g_XMOne, t);
+    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
+
+    __m128 log2 = _mm_mul_ps(g_XMLogEst7, y);
+    log2 = _mm_add_ps(g_XMLogEst6, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst5, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst4, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst3, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst2, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst1, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst0, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e));
+
+    log2 = _mm_mul_ps(g_XMInvLgE, log2);
+
+    // if (V is NaN) -> QNaN
+    // else if (V is positive)
+    //     if (V is infinite) -> +inf
+    //     else -> logE(V)
+    // else
+    //     if (V is zero) -> -inf
+    //     else -> -QNaN
+
+    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
+
+    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
+    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
+    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
+
+    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
+
+    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
+    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
+    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
+    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
+    __m128i isNaN = _mm_andnot_si128(t0, t1);
+
+    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
+    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
+    __m128i result = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
+    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
+    tmp = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isPositive, result);
+    select1 = _mm_andnot_si128(isPositive, tmp);
+    result = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isNaN, g_XMQNaN);
+    select1 = _mm_andnot_si128(isNaN, result);
+    result = _mm_or_si128(select0, select1);
+
+    return _mm_castsi128_ps(result);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorLog
+(
+    FXMVECTOR V
+)
+{
+    // For legacy reasons XMVectorLog is base-2, mirroring XMVectorExp above;
+    // use XMVectorLogE for the natural logarithm.
+    return XMVectorLog2(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorPow
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]);
+    Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]);
+    Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]);
+    Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)),
+        powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)),
+        powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)),
+        powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3))
+    };
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __declspec(align(16)) float a[4];
+    __declspec(align(16)) float b[4];
+    _mm_store_ps( a, V1 );
+    _mm_store_ps( b, V2 );
+    XMVECTOR vResult = _mm_setr_ps(
+        powf(a[0],b[0]),
+        powf(a[1],b[1]),
+        powf(a[2],b[2]),
+        powf(a[3],b[3]));
+    return vResult;
+#endif
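+
+    // Note: neither SSE nor ARMv7 NEON exposes a vector pow instruction, so
+    // both intrinsic paths above fall back to scalar powf per lane.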
+} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = fabsf(V.vector4_f32[0]); + vResult.vector4_f32[1] = fabsf(V.vector4_f32[1]); + vResult.vector4_f32[2] = fabsf(V.vector4_f32[2]); + vResult.vector4_f32[3] = fabsf(V.vector4_f32[3]); + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult,V); + vResult = _mm_max_ps(vResult,V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32( V1, vResult, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + vResult = _mm_mul_ps(vResult,V2); + vResult = _mm_sub_ps(V1,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles +( + FXMVECTOR Angles +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32( Angles, vResult, g_XMTwoPi ); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + vResult = _mm_mul_ps(vResult,g_XMTwoPi); + vResult = _mm_sub_ps(Angles,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin +( + FXMVECTOR V +) +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
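+    // After XMVectorModAngles, x lies in [-pi,pi]. If |x| > pi/2 the code
+    // below reflects it, x -> (pi - x) for x >= 0 or (-pi - x) for x < 0,
+    // which leaves the sine unchanged: e.g. sin(2.5) == sin(pi - 2.5).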
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos +( + FXMVECTOR V +) +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
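+    // Same reflection as in XMVectorSin, but cosine flips sign under it,
+    // cos(pi - x) = -cos(x), hence the extra 'sign' select below: +1 when x
+    // was already within [-pi/2,pi/2], -1 when it had to be reflected.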
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0 ); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Sin; + Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); + + XMVECTOR Cos; + Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); + + *pSin = Sin; + *pCos = Cos; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x 
= XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
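+    // A single shared range reduction feeds both polynomials: the reduced x
+    // is used as-is for sine (the reflection preserves it), while the cosine
+    // result is multiplied by the reflection sign at the end.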
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan +( + FXMVECTOR V +) +{ + // Cody and Waite algorithm to compute tangent. 
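+    //
+    // The idea (a sketch, as inferred from the constants): the argument is
+    // reduced by a whole number VA of quarter periods, subtracting VA*C0 and
+    // then VA*C1 in two steps, where C0 = 1.570796371f is pi/2 in single
+    // precision and C1 = 6.077100628e-11f a small correction term; the
+    // two-step subtraction loses less accuracy than a single subtraction of
+    // VA*(pi/2) would. The parity of VA (tested with the 0x1 mask) then
+    // picks between tan of the remainder and -cot of it, since
+    // tan(x + k*pi/2) alternates between the two.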
+ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; + static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; + static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; + static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; + + XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); + + XMVECTOR Zero = XMVectorZero(); + + XMVECTOR C0 = XMVectorSplatX(TanConstants.v); + XMVECTOR C1 = XMVectorSplatY(TanConstants.v); + XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); + + XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); + + VA = XMVectorRound(VA); + + XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); + + XMVECTOR VB = XMVectorAbs(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + VB = vcvtq_u32_f32( VB ); +#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); +#else + for (size_t i = 0; i < 4; i++) + { + VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; + } +#endif + + XMVECTOR VC2 = XMVectorMultiply(VC, VC); + + XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); + XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); + XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); + XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); + XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v); + XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); + XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); + XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); + + XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); + VBIsEven = XMVectorEqualInt(VBIsEven, Zero); + + XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); + XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); + N = XMVectorMultiplyAdd(VC2, N, T5); + D = XMVectorMultiplyAdd(VC2, D, T2); + N = XMVectorMultiply(VC2, N); + D = XMVectorMultiplyAdd(VC2, D, T1); + N = XMVectorMultiplyAdd(VC, N, VC); + XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); + D = XMVectorMultiplyAdd(VC2, D, T0); + + N = XMVectorSelect(N, VC, VCNearZero); + D = XMVectorSelect(D, g_XMOne.v, VCNearZero); + + XMVECTOR R0 = XMVectorNegate(N); + XMVECTOR R1 = XMVectorDivide(N,D); + R0 = XMVectorDivide(D,R0); + + XMVECTOR VIsZero = XMVectorEqual(V, Zero); + + XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); + + Result = XMVectorSelect(Result, Zero, VIsZero); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, 
Scale.v ); + XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V, Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + XMVECTOR V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); + Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); + Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); + Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V,Scale.v); + V1 = _mm_add_ps(V1,g_XMNegativeOne.v); + XMVECTOR V2 = _mm_mul_ps(V, Scale.v); + V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = _mm_mul_ps(E,g_XMOneHalf.v); + E = _mm_add_ps(E,g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v,E); + return _mm_sub_ps(g_XMOne.v,E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + 
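+    // The kernel actually evaluates acos: p(|V|) * sqrt(1 - |V|), the sqrt
+    // factor matching acos's square-root shape near |V| = 1. Negative
+    // inputs use acos(-x) = pi - acos(x), and the final (pi/2 - acos(V))
+    // step at the bottom converts the result to asin.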
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + 
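+    // Identical kernel to XMVectorASin above, minus the closing
+    // (pi/2 - t0) subtraction: t0 is already acos(V).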
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan +( + FXMVECTOR V +) +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = 
XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + uint32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(TC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + Result = vmlaq_f32( g_XMOne, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 
= _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] ); + Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] ); + Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] ); + Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] ); + return Result; +#else + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinEst +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with 
sin(y) = sin(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst +( + FXMVECTOR V +) +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Sin; + Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); + + XMVECTOR Cos; + Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); + + *pSin = Sin; + *pCos = Cos; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); + return Result; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
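+ // The estimate relies on asin(V) = pi/2 - acos(V), where acos(|V|) is + // approximated as sqrt(1 - |V|) * P(|V|) for the degree-3 polynomial P in + // g_XMArcEstCoefficients, reflected for negative inputs via + // acos(-x) = pi - acos(x).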
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
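+ // Same square-root-weighted polynomial core as XMVectorASinEst above (the + // classic Abramowitz & Stegun style form): acos(x) ~= sqrt(1 - x) * P(x) on + // [0,1], with acos(-x) = pi - acos(x) for negative inputs. Sanity check: at + // V = 0 the root is 1 and the result is the constant term of + // g_XMArcEstCoefficients, approximately pi/2.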
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst +( + FXMVECTOR V +) +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne ); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign ); + uint32x4_t x = vbslq_f32(comp, V, invV ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 
1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = _mm_add_ps(Result, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] ); + Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] ); + Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] ); + Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] ); + return Result; +#else + + static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */}; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_n_f32( V0, L, t ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps( V1, V0 ); + XMVECTOR S = _mm_set_ps1( t ); + XMVECTOR Result = _mm_mul_ps( 
L, S ); + return _mm_add_ps( Result, V0 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, + FXMVECTOR V1, + FXMVECTOR T +) +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_f32( V0, L, T ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps( V1, V0 ); + XMVECTOR Result = _mm_mul_ps( Length, T ); + return _mm_add_ps( Result, V0 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0 ); + vResult = vmlaq_n_f32( vResult, Tangent0, t0 ); + vResult = vmlaq_n_f32( vResult, Position1, p1 ); + vResult = vmlaq_n_f32( vResult, Tangent1, t1 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(P1, Position1); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(T1, Tangent1); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T , T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = 
XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3 ); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vandq_u32(T,g_XMMaskY); + T3 = vaddq_f32(T3,T2); + // Add 1.0f to x + T3 = vaddq_f32(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32( Position0, vget_low_f32( T3 ), 0 ); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32( T3 ), 1 ); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32( T3 ), 0 ); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32( T3 ), 1 ); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = _mm_mul_ps(T3,CatMulT3); + // T3 now has the pre-result. + T3 = _mm_add_ps(T3,T2); + // I need to add t.y only + T2 = _mm_and_ps(T,g_XMMaskY); + T3 = _mm_add_ps(T3,T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1)); + T2 = _mm_mul_ps(T2,Tangent0); + vResult = _mm_add_ps(vResult,T2); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2)); + T2 = _mm_mul_ps(T2,Position1); + vResult = _mm_add_ps(vResult,T2); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3)); + T3 = _mm_mul_ps(T3,Tangent1); + vResult = _mm_add_ps(vResult,T3); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0,P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P0 = _mm_mul_ps(P0, Position0); + P1 = _mm_mul_ps(P1, Position1); + P2 = _mm_mul_ps(P2, Position2); + P3 = _mm_mul_ps(P3, Position3); + P0 = _mm_add_ps(P0,P1); + P2 = _mm_add_ps(P2,P3); + P0 = _mm_add_ps(P0,P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTOR vResult; + vResult.vector4_f32[0] = 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0] + + (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0] + + (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0] + + (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]); + vResult.vector4_f32[1] = 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1] + + (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1] + + (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1] + + (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]); + vResult.vector4_f32[2] = 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2] + + (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2] + + (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2] + + (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]); + vResult.vector4_f32[3] = 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3] + + (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3] + + (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3] + + (fw*fw*fw-fw*fw)*Position3.vector4_f32[3]); + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2,T2); + vResult = vsubq_f32(vResult,T); + vResult = vsubq_f32(vResult,T3); + vResult = vmulq_f32(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3,Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp,Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2,Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp,T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3,T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult,g_XMOneHalf); 
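+ // vResult now holds 0.5*((-t^3+2t^2-t)*P0 + (3t^3-5t^2+2)*P1 + + // (-3t^3+4t^2+t)*P2 + (t^3-t^2)*P3) per lane -- the same weights as the + // scalar path above, with vmlaq/vmlsq folding each multiply-add pair.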
+ return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2,T2); + vResult = _mm_sub_ps(vResult,T); + vResult = _mm_sub_ps(vResult,T3); + vResult = _mm_mul_ps(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); + XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,Catmul2); + vTemp = _mm_mul_ps(vTemp,Position1); + vResult = _mm_add_ps(vResult,vTemp); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2,Catmul4); + vTemp2 = _mm_mul_ps(T3,Catmul3); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,T); + vTemp = _mm_mul_ps(vTemp,Position2); + vResult = _mm_add_ps(vResult,vTemp); + // Position3 is the last term + T3 = _mm_sub_ps(T3,T2); + T3 = _mm_mul_ps(T3,Position3); + vResult = _mm_add_ps(vResult,T3); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_n_f32( Position0, R1, f); + return vmlaq_n_f32( R1, R2, g ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR SF = _mm_set_ps1(f); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + XMVECTOR SG = _mm_set_ps1(g); + R1 = _mm_mul_ps(R1,SF); + R2 = _mm_mul_ps(R2,SG); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_f32( Position0, R1, F ); + return vmlaq_f32( R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + R1 = _mm_mul_ps(R1,F); + R2 
= _mm_mul_ps(R2,G); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t 
CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_u32(V1), vget_low_u32(V2)); + uint32x2_t vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + return ( r == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ 
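+ +// The R-suffixed comparisons below return a comparison record rather than a +// bool: XM_CRMASK_CR6TRUE is set when both x and y compare true, +// XM_CRMASK_CR6FALSE when both compare false, and neither bit when the lanes +// disagree. A minimal usage sketch (hypothetical vectors a and b), decoded +// with the standard XMComparisonAllTrue / XMComparisonAllFalse helpers: +// +// uint32_t cr = XMVector2GreaterR(a, b); +// if (XMComparisonAllTrue(cr)) { /* a.x > b.x && a.y > b.y */ } +// else if (XMComparisonAllFalse(cr)) { /* a.x <= b.x && a.y <= b.y */ }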
+ +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 
(V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x2_t B = vget_low_f32( Bounds ); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL,B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2,VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1,ivTemp2); + // x and y in bounds? + return ( vget_lane_u64( ivTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + // Test against itself. NaN is always not equal + uint32x2_t vTempNan = vceq_f32( VL, VL ); + // If x or y are NaN, the mask is zero + return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x2_t vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) ); + // Compare to infinity + vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) ); + // If any are infinity, the mask is non-zero + return vget_lane_u64( vTemp, 0 ) != 0; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x or y are infinity, the mask is non-zero
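+ // (g_XMAbsMask clears the sign bit, 0x7FFFFFFF per lane, so both +infinity + // and -infinity match the 0x7F800000 pattern in g_XMInfinity; the movemask + // test below looks only at the x and y lanes.)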
+ return ((_mm_movemask_ps(vTemp)&3) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Perform the dot product on x and y + float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0x3f ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V1, V2); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_moveldup_ps(vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = fCross; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 }; + + float32x2_t vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); + vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap x and y + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); + // Perform the muls + vResult = _mm_mul_ps(vResult,V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + // Sub the values + vResult = _mm_sub_ss(vResult,vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq +( + FXMVECTOR V +) +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32( vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE4_INTRINSICS_) + 
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( vTemp ); + Result = vmul_f32( vTemp, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return 
vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( vTemp ); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( vTemp, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
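+// A minimal usage sketch of the tradeoff (hypothetical vector v): the Est +// variant uses a raw reciprocal square-root estimate with no guards, while +// XMVector2Normalize below uses a full-precision square root (Newton-Raphson +// refined on ARM) and handles the zero and infinite length cases explicitly: +// +// XMVECTOR v = XMVectorSet(3.f, 4.f, 0.f, 0.f); +// XMVECTOR fast = XMVector2NormalizeEst(v); // ~(0.6, 0.8), low precision +// XMVECTOR safe = XMVector2Normalize(v); // (0.6, 0.8)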
+ +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32( vTemp ); + // Normalize + float32x2_t Result = vmul_f32( VL, vTemp ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_mul_ps(vLengthSq,V); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = XMVector2Length( V ); + float fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + uint32x2_t VEqualsZero = vceq_f32( vTemp, vdup_n_f32(0) ); + uint32x2_t VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32( vTemp ); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + vTemp = vmul_f32( S1, R1 ); + // Normalize + float32x2_t Result = vmul_f32( VL, vTemp ); + Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result ); + Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = 
_mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_moveldup_ps(vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + return XMVector2ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector2LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result; + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +inline XMVECTOR XM_CALLCONV XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float RY = 1.0f-(IDotN*IDotN); + float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); + RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); + if (RX>=0.0f) { + RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); + } else { + RX = 0.0f; + } + if (RY>=0.0f) { + RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); + } else { + RY = 0.0f; + } + + XMVECTOR vResult; + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t IL = vget_low_f32( Incident ); + float32x2_t NL = vget_low_f32( Normal ); + float32x2_t RIL = vget_low_f32( RefractionIndex ); + // Get the 2D Dot product of Incident-Normal + float32x2_t vTemp = vmul_f32(IL, NL); + float32x2_t IDotN = vpadd_f32( vTemp, vTemp ); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); + vTemp = vmul_f32(vTemp,RIL); + vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); + // If 
any terms are <=0, sqrt() will fail, punt to zero + uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); + // Sqrt(vTemp) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t S2 = vmul_f32( S1, R1 ); + vTemp = vmul_f32( vTemp, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = vmla_f32( vTemp, RIL, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + float32x2_t vResult = vmul_f32(RIL,IL); + vResult = vmls_f32( vResult, vTemp, NL ); + vResult = vand_u32(vResult,vMask); + return vcombine_f32(vResult, vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = XMVector2Dot(Incident, Normal); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); + vTemp = _mm_add_ps(vTemp,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex,Incident); + vTemp = _mm_mul_ps(vTemp,Normal); + vResult = _mm_sub_ps(vResult,vTemp); + vResult = _mm_and_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = -V.vector4_f32[1]; + Result.vector4_f32[1] = V.vector4_f32[0]; + Result.vector4_f32[2] = 0.f; + Result.vector4_f32[3] = 0.f; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; + const float32x2_t zero = vdup_n_f32(0); + + float32x2_t VL = vget_low_f32( V ); + float32x2_t Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); + return vcombine_f32( Result, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + vResult = _mm_mul_ps(vResult,g_XMNegateX); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV 
XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector2ReciprocalLength(V1); + XMVECTOR L2 = XMVector2ReciprocalLength(V2); + + XMVECTOR Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector2LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector2Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + GXMVECTOR Line2Point2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); + XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); + XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + + XMVECTOR Result; + const XMVECTOR Zero = XMVectorZero(); + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask,C1); + vResultMask = _mm_max_ps(vResultMask,C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
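+    // (abs(C2) is formed the same way as abs(C1) above: negate, then take the
+    // max against the original. If abs(C2) <= epsilon the lines are coincident
+    // and the failure value is INFINITY; otherwise they are parallel and the
+    // failure value is QNaN, matching the scalar path above.)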
+ XMVECTOR vFailMask = _mm_setzero_ps(); + vFailMask = _mm_sub_ps(vFailMask,C2); + vFailMask = _mm_max_ps(vFailMask,C2); + vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon); + XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity); + vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN); + // vFail is NAN or INF + vFail = _mm_or_ps(vFail,vFailMask); + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR vResult = _mm_div_ps(C2,C1); + vResult = _mm_mul_ps(vResult,V1); + vResult = _mm_add_ps(vResult,Line1Point1); + // Use result, or failure value + vResult = _mm_and_ps(vResult,vResultMask); + vResultMask = _mm_andnot_ps(vResultMask,vFail); + vResult = _mm_or_ps(vResult,vResultMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x4_t Result = vmlaq_lane_f32( M.r[3], M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; 
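+                // vld2q_f32 de-interleaved four packed XMFLOAT2 values, leaving
+                // x0..x3 in V.val[0] and y0..y3 in V.val[1]; each output component
+                // (Ax+Ey+M, Bx+Fy+N, Cx+Gy+O, Dx+Hy+P) is accumulated below as a
+                // 4-wide vector, one lane per input point.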
+ + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), R ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + vst1q_f32( reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Packed input, aligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 2; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); 
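+                    // (the second XMFLOAT2 of the pair sits in lanes 2 and 3 of V)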
+ X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Aligned input, aligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + else + { + // Aligned input, unaligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) +{ + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide( Result, W ); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < 
VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + + vst2q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT2)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + V = vget_high_f32( vResult ); + float32x2_t W = vdup_lane_f32( V, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x2_t Reciprocal = vrecpe_f32( W ); + float32x2_t S = vrecps_f32( Reciprocal, W ); + Reciprocal = vmul_f32( S, Reciprocal ); + S = vrecps_f32( Reciprocal, W ); + Reciprocal = vmul_f32( S, Reciprocal ); + + V = vget_low_f32( vResult ); + V = vmul_f32( V, Reciprocal ); + + vst1_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if 
( InputStride == sizeof(XMFLOAT2) ) + { + if ( OutputStride == sizeof(XMFLOAT2) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V2 = _mm_div_ps( vTemp, W ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V2 = _mm_div_ps( vTemp, W ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( 
vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) ); + XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Y, M.r[1]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x4_t Result = vmulq_lane_f32( M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* 
pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Y, row1); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + + vst2q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT2)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32( row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + V = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( OutputStride == sizeof(XMFLOAT2) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + 
XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) ); + XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * 3D Vector + * + ****************************************************************************/ + 
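+//------------------------------------------------------------------------------
+// Usage sketch (hypothetical application code, not part of the library): the
+// ...R comparison variants defined below return a comparison record built from
+// the XM_CRMASK_CR6* masks rather than a bool, and are meant to be tested with
+// helpers such as XMComparisonAllTrue. ExampleAllComponentsEqual3 is an
+// illustrative name; the block is kept out of compilation.
+#if 0
+inline bool XM_CALLCONV ExampleAllComponentsEqual3( FXMVECTOR A, FXMVECTOR B )
+{
+    uint32_t CR = XMVector3EqualR( A, B );
+    return XMComparisonAllTrue( CR );   // true only when x, y and z all match
+}
+#endif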
+//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&7; + uint32_t CR = 0; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + 
(V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; + uint32_t CR = 0; + if (iTemp==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32( V1, V2 ); + uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = 
_mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1,ivTemp2); + // in bounds? 
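+    // (the two vzips below gather one byte from each lane's mask into a single
+    // 32-bit value whose low 24 bits cover x, y and z, so 0xFFFFFFU means all
+    // three components were in bounds)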
+ int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x,y and z in bounds? (w is don't care) + return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If x or y or z are NaN, the mask is zero + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
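+    // (_mm_movemask_ps collects each lane's sign bit; the compare wrote all-ones
+    // into matching lanes, and the &7 keeps only x, y and z)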
+    return ((_mm_movemask_ps(vTemp)&7) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = fValue;
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vTemp = vmulq_f32( V1, V2 );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    return vcombine_f32( v1, v1 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_dp_ps( V1, V2, 0x7f );
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_and_ps(vTemp, g_XMMask3);
+    vTemp = _mm_hadd_ps(vTemp,vTemp);
+    return _mm_hadd_ps(vTemp,vTemp);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product
+    XMVECTOR vDot = _mm_mul_ps(V1,V2);
+    // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
+    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
+    // Result.vector4_f32[0] = x+y
+    vDot = _mm_add_ss(vDot,vTemp);
+    // x=Dot.vector4_f32[2]
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    // Result.vector4_f32[0] = (x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+    return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Cross
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult = {
+        (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]),
+        (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]),
+        (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]),
+        0.0f
+    };
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t v1xy = vget_low_f32(V1);
+    float32x2_t v2xy = vget_low_f32(V2);
+
+    float32x2_t v1yx = vrev64_f32( v1xy );
+    float32x2_t v2yx = vrev64_f32( v2xy );
+
+    float32x2_t v1zz = vdup_lane_f32( vget_high_f32(V1), 0 );
+    float32x2_t v2zz = vdup_lane_f32( vget_high_f32(V2), 0 );
+
+    XMVECTOR vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) );
+    vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) );
+    vResult = veorq_u32( vResult, g_XMFlipY );
+    return vandq_u32( vResult, g_XMMask3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // y1,z1,x1,w1
+    XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1));
+    // z2,x2,y2,w2
+    XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the left operation
+    XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2);
+    // z1,x1,y1,w1
+    vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1));
+    // y2,z2,x2,w2
+    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the right operation
+    vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
+    // Subtract the right from left, and return answer
+    vResult = _mm_sub_ps(vResult,vTemp1);
+    // Set w to zero
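+    // (g_XMMask3 keeps x, y and z and clears w, so the cross product returns a
+    // true 3-vector with w == 0)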
+ return _mm_and_ps(vResult,g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq +( + FXMVECTOR V +) +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne,vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = 
(x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
+    // Get the length
+    vDot = _mm_sqrt_ps(vDot);
+    // Get the reciprocal
+    vDot = _mm_div_ps(g_XMOne,vDot);
+    return vDot;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3LengthEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+
+    Result = XMVector3LengthSq(V);
+    Result = XMVectorSqrtEst(Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Dot3
+    float32x4_t vTemp = vmulq_f32( V, V );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    const float32x2_t zero = vdup_n_f32(0);
+    uint32x2_t VEqualsZero = vceq_f32( v1, zero );
+    // Sqrt (estimate)
+    float32x2_t Result = vrsqrte_f32( v1 );
+    Result = vmul_f32( v1, Result );
+    Result = vbsl_f32( VEqualsZero, zero, Result );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
+    return _mm_sqrt_ps( vTemp );
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_sqrt_ps(vLengthSq);
+    return vLengthSq;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x,y and z
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has z and y
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
+    // x+z, y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    // y,y,y,y
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    // x+z+y,??,??,??
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
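+//
+// The Est paths rely on the hardware reciprocal-sqrt approximations
+// (_mm_rsqrt_ps on SSE, vrsqrte_f32 on ARM), which carry only about 12 bits
+// of mantissa accuracy. An illustrative choice between the two forms
+// (variable names hypothetical):
+//
+//     XMVECTOR nFast  = XMVector3NormalizeEst(v); // fast, approximate
+//     XMVECTOR nExact = XMVector3Normalize(v);    // full precision; maps a
+//                                                 // zero vector to zero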
+
+inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector3ReciprocalLength(V);
+    Result = XMVectorMultiply(V, Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Dot3
+    float32x4_t vTemp = vmulq_f32( V, V );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    // Reciprocal sqrt (estimate)
+    v2 = vrsqrte_f32( v1 );
+    // Normalize
+    return vmulq_f32( V, vcombine_f32(v2,v2) );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
+    XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
+    return _mm_mul_ps(vResult, V);
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vDot = _mm_mul_ps(V, V);
+    vDot = _mm_and_ps(vDot, g_XMMask3);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_rsqrt_ps(vDot);
+    vDot = _mm_mul_ps(vDot,V);
+    return vDot;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product
+    XMVECTOR vDot = _mm_mul_ps(V,V);
+    // x=Dot.y, y=Dot.z
+    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
+    // Result.x = x+y
+    vDot = _mm_add_ss(vDot,vTemp);
+    // x=Dot.z
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    // Result.x = (x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
+    // Get the reciprocal
+    vDot = _mm_rsqrt_ps(vDot);
+    // Perform the normalization
+    vDot = _mm_mul_ps(vDot,V);
+    return vDot;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Normalize
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float fLength;
+    XMVECTOR vResult;
+
+    vResult = XMVector3Length( V );
+    fLength = vResult.vector4_f32[0];
+
+    // Prevent divide by zero
+    if (fLength > 0) {
+        fLength = 1.0f/fLength;
+    }
+
+    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Dot3
+    float32x4_t vTemp = vmulq_f32( V, V );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) );
+    uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
+    // Reciprocal sqrt (2 iterations of Newton-Raphson)
+    float32x2_t S0 = vrsqrte_f32( v1 );
+    float32x2_t P0 = vmul_f32( v1, S0 );
+    float32x2_t R0 = vrsqrts_f32( P0, S0 );
+    float32x2_t S1 = vmul_f32( S0, R0 );
+    float32x2_t P1 = vmul_f32( v1, S1 );
+    float32x2_t R1 = vrsqrts_f32( P1, S1 );
+    v2 = vmul_f32( S1, R1 );
+    // Normalize
+    XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
+    vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
+    return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#elif defined(_XM_SSE3_INTRINSICS_)
+    // Perform the dot product on x,y and z only
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x,y and z only
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3ClampLength
+(
+    FXMVECTOR V,
+    float LengthMin,
+    float LengthMax
+)
+{
+    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
+    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
+
+    return XMVector3ClampLengthV(V, ClampMin, ClampMax);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV
+(
+    FXMVECTOR V,
+    FXMVECTOR LengthMin,
+    FXMVECTOR LengthMax
+)
+{
+    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
+    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
+
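+    // The bounds must be replicated (splatted) scalars - XMVector3ClampLength
+    // above builds them with XMVectorReplicate - and, per the asserts below,
+    // must satisfy 0 <= LengthMin <= LengthMax.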
assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R 
= vmlsq_f32(g_XMOne, R, RefractionIndex );
+
+    uint32x4_t vResult = vcleq_f32(R,g_XMZero);
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // Sqrt(R)
+        float32x4_t S0 = vrsqrteq_f32(R);
+        float32x4_t P0 = vmulq_f32( R, S0 );
+        float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
+        float32x4_t S1 = vmulq_f32( S0, R0 );
+        float32x4_t P1 = vmulq_f32( R, S1 );
+        float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
+        float32x4_t S2 = vmulq_f32( S1, R1 );
+        R = vmulq_f32( R, S2 );
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = vmlaq_f32( R, RefractionIndex, IDotN );
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = vmulq_f32(RefractionIndex, Incident);
+        vResult = vmlsq_f32( vResult, R, Normal );
+    }
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
+    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
+    R = _mm_sub_ps(g_XMOne,R);
+    R = _mm_mul_ps(R, RefractionIndex);
+    R = _mm_mul_ps(R, RefractionIndex);
+    R = _mm_sub_ps(g_XMOne,R);
+
+    XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
+    if (_mm_movemask_ps(vResult)==0x0f)
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = _mm_sqrt_ps(R);
+        vResult = _mm_mul_ps(RefractionIndex,IDotN);
+        R = _mm_add_ps(R,vResult);
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = _mm_mul_ps(RefractionIndex, Incident);
+        R = _mm_mul_ps(R,Normal);
+        vResult = _mm_sub_ps(vResult,R);
+    }
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Orthogonal
+(
+    FXMVECTOR V
+)
+{
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Z = XMVectorSplatZ(V);
+    XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
+
+    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
+
+    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
+    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
+
+    XMVECTOR S = XMVectorAdd(YZYY, Z);
+    XMVECTOR D = XMVectorSubtract(YZYY, Z);
+
+    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
+
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
+
+    return XMVectorSelect(R1, R0, Select);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+)
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACosEst(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+)
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACos(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR L1 =
XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmlaq_lane_f32( M.r[3], M.r[0], VL, 
0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + vResult3 = 
vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), R ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + vst1q_f32( reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, 
vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Aligned output + for (; i < VectorCount; ++i) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, 
row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide( Result, W ); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if 
((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + V.val[2] = vmulq_f32( vResult2, Reciprocal ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + 
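+    // Process the stream four vectors at a time when the input is packed;
+    // any remainder (or strided input) is handled by the scalar loop below.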
size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + 
XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = 
_mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = 
_mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + 
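+        // Normals use only the rotational part of M (rows 0-2); unlike
+        // XMVector3TransformStream there is no row3 translation term.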
XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V1 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V2 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V3 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V4 = _mm_add_ps( vTemp, vTemp3 ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( 
V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V1 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V2 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V3 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V4 = _mm_add_ps( vTemp, vTemp3 ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += 
OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + const uint8_t* 
pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); + XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); + XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); + + XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); + XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); + XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ); + + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( Transform.r[3] ); + float32x2_t r = vget_low_f32( Transform.r[0] ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( Transform.r[3] ); + r = vget_high_f32( Transform.r[0] ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( Transform.r[1] ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( Transform.r[1] ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( Transform.r[2] ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( Transform.r[2] ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult0 = vmulq_f32( vResult0, Reciprocal ); + vResult1 = vmulq_f32( vResult1, Reciprocal ); + vResult2 = vmulq_f32( vResult2, Reciprocal ); + + V.val[0] = vmlaq_f32( OffsetX, 
vResult0, ScaleX ); + V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY ); + V.val[2] = vmlaq_f32( OffsetZ, vResult2, ScaleZ ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + if ( i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); + + vResult = vmlaq_f32( Offset, vResult, Scale ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V1 = _mm_add_ps( vTemp, Offset ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 
2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V2 = _mm_add_ps( vTemp, Offset ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V3 = _mm_add_ps( vTemp, Offset ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V4 = _mm_add_ps( vTemp, Offset ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V1 = _mm_add_ps( vTemp, Offset ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, 
_MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V2 = _mm_add_ps( vTemp, Offset ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V3 = _mm_add_ps( vTemp, Offset ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V4 = _mm_add_ps( vTemp, Offset ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + 
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR 
Result = XMVectorMultiplyAdd(V, Scale, Offset); + + return XMVector3TransformCoord(Result, Transform); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + float sx = 1.f / (ViewportWidth * 0.5f); + float sy = 1.f / (-ViewportHeight * 0.5f); + float sz = 1.f / (ViewportMaxZ - ViewportMinZ); + + float ox = (-ViewportX * sx) - 1.f; + float oy = (-ViewportY * sy) + 1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32( OffsetX, ScaleX, V.val[0] ); + + float32x2_t r3 = vget_low_f32( Transform.r[3] ); + float32x2_t r = vget_low_f32( Transform.r[0] ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( Transform.r[3] ); + r = vget_high_f32( Transform.r[0] ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P + + __prefetch( 
pInputVector+XM_CACHE_LINE_SIZE ); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32( OffsetY, ScaleY, V.val[1] ); + + r = vget_low_f32( Transform.r[1] ); + vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( Transform.r[1] ); + vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32( OffsetZ, ScaleZ, V.val[2] ); + + r = vget_low_f32( Transform.r[2] ); + vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( Transform.r[2] ); + vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + V.val[2] = vmulq_f32( vResult2, Reciprocal ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32(((uint64_t)*(const uint32_t *)&sx) | ((uint64_t)(*(const uint32_t *)&sy) << 32)); + float32x2_t ScaleH = vcreate_f32((uint64_t)*(const uint32_t *)&sz); + + float32x2_t OffsetL = vcreate_f32(((uint64_t)*(const uint32_t *)&ox) | ((uint64_t)(*(const uint32_t *)&oy) << 32)); + float32x2_t OffsetH = vcreate_f32((uint64_t)*(const uint32_t *)&oz); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + VL = vmla_f32( OffsetL, VL, ScaleL ); + VH = vmla_f32( OffsetH, VH, ScaleH ); + + XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = 
XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( 
reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + 
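+                    // Three unaligned 16-byte loads fetch 12 floats (four packed
+                    // XMFLOAT3 values); XM3UNPACK3INTO4 below deinterleaves them into
+                    // V1..V4, whose .w lanes are undefined but never stored back.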
pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + V = _mm_mul_ps( V, Scale ); + V = _mm_add_ps( V, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, 
_MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * 4D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2]) && + (V1.vector4_f32[3] == V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2]) && + (V1.vector4_f32[3] != V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + uint32_t CR = 0; + if (iTest==0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest==0) // All not equal? 
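+    // A mixed result (some lanes equal, some not) leaves CR == 0, so neither
+    // CR6 flag is set.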
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4EqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4EqualIntR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
+        V1.vector4_u32[1] == V2.vector4_u32[1] &&
+        V1.vector4_u32[2] == V2.vector4_u32[2] &&
+        V1.vector4_u32[3] == V2.vector4_u32[3])
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
+        V1.vector4_u32[1] != V2.vector4_u32[1] &&
+        V1.vector4_u32[2] != V2.vector4_u32[2] &&
+        V1.vector4_u32[3] != V2.vector4_u32[3])
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
+
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFFFU )
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
+    uint32_t CR = 0;
+    if (iTest==0xf) // All equal?
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (iTest==0) // All not equal?
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+inline bool XM_CALLCONV XMVector4NearEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float dx, dy, dz, dw;
+
+    dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
+    dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
+    dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
+    dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]);
+    return (((dx <= Epsilon.vector4_f32[0]) &&
+            (dy <= Epsilon.vector4_f32[1]) &&
+            (dz <= Epsilon.vector4_f32[2]) &&
+            (dw <= Epsilon.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vDelta = vsubq_f32( V1, V2 );
+    uint32x4_t vResult = vacleq_f32( vDelta, Epsilon );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Get the difference
+    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+    // Get the absolute value of the difference
+    XMVECTOR vTemp = _mm_setzero_ps();
+    vTemp = _mm_sub_ps(vTemp,vDelta);
+    vTemp = _mm_max_ps(vTemp,vDelta);
+    vTemp = _mm_cmple_ps(vTemp,Epsilon);
+    return ((_mm_movemask_ps(vTemp)==0xf) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4NotEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2);
+    return ((_mm_movemask_ps(vTemp)) != 0);
+#else
+    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4NotEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0);
+#else
+    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4Greater
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgtq_f32( V1, V2 );
+    int8x8x2_t vTemp =
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) 
+ { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1,ivTemp2); + // in bounds? + int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // All in bounds? 
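+    // (_mm_movemask_ps packs the sign bit of each of the four lanes into the
+    // low four bits of an int, so 0x0f means every component passed both the
+    // upper and the lower bound test)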
+ return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If any are NaN, the mask is zero + return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan)!=0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If any are infinity, the signs are true. 
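+    // (masking off the sign bit first makes -INF compare equal to +INF, so a
+    // single equality test catches infinities of either sign)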
+ return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32( V1, V2 ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + return vcombine_f32( v1, v1 ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0xff ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + + Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); + Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); + Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); + Result.vector4_f32[3] = 
(((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float32x2_t select = vget_low_f32( g_XMMaskX ); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32( select, v2yx, v2wz ); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32( select, v3wz, v3xy ); + + float32x4_t vTemp1 = vcombine_f32(v2zw,v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz,v3wy); + XMVECTOR vResult = vmulq_f32( vTemp1, vTemp2 ); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32( select, v2wz, v2xy ); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32( select, v3yx, v3wz ); + + vTemp1 = vcombine_f32(v2wz,v2wy); + vTemp2 = vcombine_f32(v3zw,v3yz); + vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); + vResult = vmulq_f32( vResult, vTemp1 ); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32( select, v2xy, v2wz ); + + const float32x2_t v3wx = vbsl_f32( select, v3wz, v3yx ); + + vTemp1 = vcombine_f32(v2yw,v2xz); + vTemp2 = vcombine_f32(v3wx,v3wx); + float32x4_t vTerm = vmulq_f32( vTemp1, vTemp2 ); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32( select, v2wz, v2yx ); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32( select, v3xy, v3wz ); + + vTemp1 = vcombine_f32(v2wx,v2wx); + vTemp2 = vcombine_f32(v3yw,v3xz); + vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); + vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz,v2xy); + vTemp2 = vcombine_f32(v3zx,v3yx); + vTerm = vmulq_f32( vTemp1, vTemp2 ); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = vcombine_f32(v2zx,v2yx); + vTemp2 = vcombine_f32(v3yz,v3xy); + vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); + return vmlaq_f32( vResult, vTerm, vTemp1 ); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); + vResult = _mm_mul_ps(vResult,vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp2); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); + vResult = _mm_mul_ps(vResult,vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = 
XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp1); + vTemp3 = _mm_sub_ps(vTemp3,vTemp2); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp1); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + vTemp3 = _mm_sub_ps(vTemp3,vTemp1); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp1); + vResult = _mm_add_ps(vResult,vTemp3); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq +( + FXMVECTOR V +) +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
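+    // (after the add below, the z lane holds the full sum x+y+z+w, which is
+    // then splatted to all four lanes)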
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! 
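+    // (a true square root and divide rather than _mm_rsqrt_ps: slower than
+    // the Est variant above, but full precision)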
+ vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( v1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); 
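+    // (this horizontal reduction is written with plain SSE shuffles and adds
+    // because this path cannot assume SSE3 haddps or SSE4.1 dpps)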
+ // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + // Normalize + return vmulq_f32( V, vcombine_f32(v2,v2) ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult,V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); + uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + v2 = vmul_f32( S1, R1 ); + // Normalize + XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); + vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); + return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or 
result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, 
ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex ); + + uint32x4_t vResult = vcleq_f32(R,g_XMZero); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32( R, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( R, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + R = vmulq_f32( R, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32( R, RefractionIndex, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32( vResult, R, Normal ); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - 
RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN,IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex, IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V.vector4_f32[2]; + Result.vector4_f32[1] = V.vector4_f32[3]; + Result.vector4_f32[2] = -V.vector4_f32[0]; + Result.vector4_f32[3] = -V.vector4_f32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; + + float32x4_t Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); + return vmulq_f32( Result, Negate ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); + vResult = _mm_mul_ps(vResult,FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); + float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); + float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); + float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); + XMVECTOR vResult; + 
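+    // (row-vector convention: each output component is the dot product of V
+    // with a column of M, i.e. the result is V*M)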
+    vResult.vector4_f32[0] = fX;
+    vResult.vector4_f32[1] = fY;
+    vResult.vector4_f32[2] = fZ;
+    vResult.vector4_f32[3] = fW;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32( V );
+    XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X
+    vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
+    float32x2_t VH = vget_high_f32( V );
+    vResult = vmlaq_lane_f32( vResult, M.r[2], VH, 0 ); // Z
+    return vmlaq_lane_f32( vResult, M.r[3], VH, 1 ); // W
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat x,y,z and w
+    XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+    XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+    // Mul by the matrix
+    vTempX = _mm_mul_ps(vTempX,M.r[0]);
+    vTempY = _mm_mul_ps(vTempY,M.r[1]);
+    vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
+    vTempW = _mm_mul_ps(vTempW,M.r[3]);
+    // Add them all together
+    vTempX = _mm_add_ps(vTempX,vTempY);
+    vTempZ = _mm_add_ps(vTempZ,vTempW);
+    vTempX = _mm_add_ps(vTempX,vTempZ);
+    return vTempX;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
+(
+    XMFLOAT4*       pOutputStream,
+    size_t          OutputStride,
+    const XMFLOAT4* pInputStream,
+    size_t          InputStride,
+    size_t          VectorCount,
+    FXMMATRIX       M
+)
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
+        XMVECTOR W = XMVectorSplatW(V);
+        XMVECTOR Z = XMVectorSplatZ(V);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(W, row3);
+        Result = XMVectorMultiplyAdd(Z, row2, Result);
+        Result = XMVectorMultiplyAdd(Y, row1, Result);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
+        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if ( four > 0 )
+    {
+        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x4_t V = vld4q_f32( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += sizeof(XMFLOAT4)*4;
+
+                float32x2_t r = vget_low_f32( row0 );
+                XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
+
+                __prefetch( pInputVector );
+
+                r = vget_high_f32( row0 );
+                XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
+                XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx
+
+                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+
+                r = vget_low_f32( row1 );
+                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
+                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+
+                r = vget_high_f32( row1 );
+                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
+                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+
+                r = vget_low_f32( row2 );
+                vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
+                vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+
+                r = vget_high_f32( row2 );
+                vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
+                vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+
+                r = vget_low_f32( row3 );
+                vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw
+                vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) );
+
+                r = vget_high_f32( row3 );
+                vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow
+                vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) );
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+                V.val[2] = vResult2;
+                V.val[3] = vResult3;
+
+                vst4q_f32( reinterpret_cast<float*>(pOutputVector), V );
+                pOutputVector += sizeof(XMFLOAT4)*4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        XMVECTOR V = vld1q_f32( reinterpret_cast<const float*>(pInputVector) );
+        pInputVector += InputStride;
+
+        float32x2_t VL = vget_low_f32( V );
+        XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X
+        vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y
+        float32x2_t VH = vget_high_f32( V );
+        vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
+        vResult = vmlaq_lane_f32( vResult, row3, VH, 1 ); // W
+
+        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
+    {
+        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
+        {
+            // Aligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
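+                // (the movups load above tolerates any input address; the
+                // streaming store below still relies on the 16-byte output
+                // alignment this branch verified)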
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+    else
+    {
+        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
+        {
+            // Aligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V)
+{
+    return V;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V)
+{
+    return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator+=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorAdd(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator-=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorSubtract(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator*=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorMultiply(V1, V2);
+    return V1;
+}
+
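+// Example usage (illustrative sketch only): the overloads in this section let
+// XMVECTOR expressions read like scalar math.
+//
+//     XMVECTOR a = XMVectorSet(1.f, 2.f, 3.f, 4.f);
+//     XMVECTOR b = XMVectorSet(4.f, 3.f, 2.f, 1.f);
+//     XMVECTOR c = (a + b) * 0.5f;   // XMVectorAdd, then XMVectorScale
+//     c += b;                        // XMVectorAdd through operator+=
+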
+//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorDivide(V1,V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) +{ + XMVECTOR vS = XMVectorReplicate( S ); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorDivide(V1,V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S +) +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S +) +{ + XMVECTOR vS = XMVectorReplicate( S ); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V +) +{ + return XMVectorScale(V, S); +} + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif diff --git a/Inc/DirectXPackedVector.h b/Inc/DirectXPackedVector.h index 635dd8a..cc092fb 100644 --- a/Inc/DirectXPackedVector.h +++ b/Inc/DirectXPackedVector.h @@ -1,1003 +1,1003 @@ -//------------------------------------------------------------------------------------- -// DirectXPackedVector.h -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#include "DirectXMath.h" - -namespace DirectX -{ - -namespace PackedVector -{ - -#pragma warning(push) -#pragma warning(disable:4201 4365 4324) -// C4201: nonstandard extension used -// C4365: Off by default noise -// C4324: alignment padding warnings - -//------------------------------------------------------------------------------ -// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into -// a 32 bit integer. 
The normalized color is packed into 32 bits using 8 bit -// unsigned, normalized integers for the alpha, red, green, and blue components. -// The alpha component is stored in the most significant bits and the blue -// component in the least significant bits (A8R8G8B8): -// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] -struct XMCOLOR -{ - union - { - struct - { - uint8_t b; // Blue: 0/255 to 255/255 - uint8_t g; // Green: 0/255 to 255/255 - uint8_t r; // Red: 0/255 to 255/255 - uint8_t a; // Alpha: 0/255 to 255/255 - }; - uint32_t c; - }; - - XMCOLOR() XM_CTOR_DEFAULT - XM_CONSTEXPR XMCOLOR(uint32_t Color) : c(Color) {} - XMCOLOR(float _r, float _g, float _b, float _a); - explicit XMCOLOR(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return c; } - - XMCOLOR& operator= (const XMCOLOR& Color) { c = Color.c; return *this; } - XMCOLOR& operator= (const uint32_t Color) { c = Color; return *this; } -}; - -//------------------------------------------------------------------------------ -// 16 bit floating point number consisting of a sign bit, a 5 bit biased -// exponent, and a 10 bit mantissa -typedef uint16_t HALF; - -//------------------------------------------------------------------------------ -// 2D Vector; 16 bit floating point components -struct XMHALF2 -{ - union - { - struct - { - HALF x; - HALF y; - }; - uint32_t v; - }; - - XMHALF2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMHALF2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {} - explicit XMHALF2(_In_reads_(2) const HALF *pArray) : x(pArray[0]), y(pArray[1]) {} - XMHALF2(float _x, float _y); - explicit XMHALF2(_In_reads_(2) const float *pArray); - - XMHALF2& operator= (const XMHALF2& Half2) { x = Half2.x; y = Half2.y; return *this; } - XMHALF2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 16 bit signed normalized integer components -struct XMSHORTN2 -{ - union - { - struct - { - int16_t x; - int16_t y; - }; - uint32_t v; - }; - - XMSHORTN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORTN2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORTN2(int16_t _x, int16_t _y) : x(_x), y(_y) {} - explicit XMSHORTN2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMSHORTN2(float _x, float _y); - explicit XMSHORTN2(_In_reads_(2) const float *pArray); - - XMSHORTN2& operator= (const XMSHORTN2& ShortN2) { x = ShortN2.x; y = ShortN2.y; return *this; } - XMSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 16 bit signed integer components -struct XMSHORT2 -{ - union - { - struct - { - int16_t x; - int16_t y; - }; - uint32_t v; - }; - - XMSHORT2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORT2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORT2(int16_t _x, int16_t _y) : x(_x), y(_y) {} - explicit XMSHORT2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMSHORT2(float _x, float _y); - explicit XMSHORT2(_In_reads_(2) const float *pArray); - - XMSHORT2& operator= (const XMSHORT2& Short2) { x = Short2.x; y = Short2.y; return *this; } - XMSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 16 bit unsigned normalized integer components -struct XMUSHORTN2 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - }; - uint32_t v; - }; - - XMUSHORTN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORTN2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR 
XMUSHORTN2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} - explicit XMUSHORTN2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMUSHORTN2(float _x, float _y); - explicit XMUSHORTN2(_In_reads_(2) const float *pArray); - - XMUSHORTN2& operator= (const XMUSHORTN2& UShortN2) { x = UShortN2.x; y = UShortN2.y; return *this; } - XMUSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 16 bit unsigned integer components -struct XMUSHORT2 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - }; - uint32_t v; - }; - - XMUSHORT2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORT2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUSHORT2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} - explicit XMUSHORT2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMUSHORT2(float _x, float _y); - explicit XMUSHORT2(_In_reads_(2) const float *pArray); - - XMUSHORT2& operator= (const XMUSHORT2& UShort2) { x = UShort2.x; y = UShort2.y; return *this; } - XMUSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 8 bit signed normalized integer components -struct XMBYTEN2 -{ - union - { - struct - { - int8_t x; - int8_t y; - }; - uint16_t v; - }; - - XMBYTEN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMBYTEN2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMBYTEN2(int8_t _x, int8_t _y) : x(_x), y(_y) {} - explicit XMBYTEN2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMBYTEN2(float _x, float _y); - explicit XMBYTEN2(_In_reads_(2) const float *pArray); - - XMBYTEN2& operator= (const XMBYTEN2& ByteN2) { x = ByteN2.x; y = ByteN2.y; return *this; } - XMBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 8 bit signed integer components -struct XMBYTE2 -{ - union - { - struct - { - int8_t x; - int8_t y; - }; - uint16_t v; - }; - - XMBYTE2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMBYTE2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMBYTE2(int8_t _x, int8_t _y) : x(_x), y(_y) {} - explicit XMBYTE2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMBYTE2(float _x, float _y); - explicit XMBYTE2(_In_reads_(2) const float *pArray); - - XMBYTE2& operator= (const XMBYTE2& Byte2) { x = Byte2.x; y = Byte2.y; return *this; } - XMBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 8 bit unsigned normalized integer components -struct XMUBYTEN2 -{ - union - { - struct - { - uint8_t x; - uint8_t y; - }; - uint16_t v; - }; - - XMUBYTEN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUBYTEN2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUBYTEN2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} - explicit XMUBYTEN2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMUBYTEN2(float _x, float _y); - explicit XMUBYTEN2(_In_reads_(2) const float *pArray); - - XMUBYTEN2& operator= (const XMUBYTEN2& UByteN2) { x = UByteN2.x; y = UByteN2.y; return *this; } - XMUBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 8 bit unsigned integer components -struct XMUBYTE2 -{ - union - { - struct - { - uint8_t x; - uint8_t y; - }; - uint16_t v; - }; - - XMUBYTE2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUBYTE2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUBYTE2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} - explicit XMUBYTE2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - 
XMUBYTE2(float _x, float _y); - explicit XMUBYTE2(_In_reads_(2) const float *pArray); - - XMUBYTE2& operator= (const XMUBYTE2& UByte2) { x = UByte2.x; y = UByte2.y; return *this; } - XMUBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D vector: 5/6/5 unsigned integer components -struct XMU565 -{ - union - { - struct - { - uint16_t x : 5; // 0 to 31 - uint16_t y : 6; // 0 to 63 - uint16_t z : 5; // 0 to 31 - }; - uint16_t v; - }; - - XMU565() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMU565(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMU565(uint8_t _x, uint8_t _y, uint8_t _z) : x(_x), y(_y), z(_z) {} - explicit XMU565(_In_reads_(3) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - XMU565(float _x, float _y, float _z); - explicit XMU565(_In_reads_(3) const float *pArray); - - operator uint16_t () const { return v; } - - XMU565& operator= (const XMU565& U565) { v = U565.v; return *this; } - XMU565& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D vector: 11/11/10 floating-point components -// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent -// and 6-bit mantissa for x component, a 5-bit biased exponent and -// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit -// mantissa for z. The z component is stored in the most significant bits -// and the x component in the least significant bits. No sign bits so -// all partial-precision numbers are positive. -// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] -struct XMFLOAT3PK -{ - union - { - struct - { - uint32_t xm : 6; // x-mantissa - uint32_t xe : 5; // x-exponent - uint32_t ym : 6; // y-mantissa - uint32_t ye : 5; // y-exponent - uint32_t zm : 5; // z-mantissa - uint32_t ze : 5; // z-exponent - }; - uint32_t v; - }; - - XMFLOAT3PK() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMFLOAT3PK(uint32_t Packed) : v(Packed) {} - XMFLOAT3PK(float _x, float _y, float _z); - explicit XMFLOAT3PK(_In_reads_(3) const float *pArray); - - operator uint32_t () const { return v; } - - XMFLOAT3PK& operator= (const XMFLOAT3PK& float3pk) { v = float3pk.v; return *this; } - XMFLOAT3PK& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent -// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent -// with 9-bit mantissa for the x, y, and z component. The shared exponent -// is stored in the most significant bits and the x component mantissa is in -// the least significant bits. No sign bits so all partial-precision numbers -// are positive. 
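The 11/11/10 format defined above trades precision for storage: each component keeps only a 5- or 6-bit mantissa and there is no sign bit. A small sketch of the lossy round trip (function name and values are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR Float3PKRoundTrip()
{
    XMFLOAT3PK pk;
    XMStoreFloat3PK(&pk, XMVectorSet(1.5f, 0.25f, 8.0f, 0.f)); // w is not stored
    return XMLoadFloat3PK(&pk); // inputs needing more mantissa bits come back rounded
}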
-// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] -struct XMFLOAT3SE -{ - union - { - struct - { - uint32_t xm : 9; // x-mantissa - uint32_t ym : 9; // y-mantissa - uint32_t zm : 9; // z-mantissa - uint32_t e : 5; // shared exponent - }; - uint32_t v; - }; - - XMFLOAT3SE() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMFLOAT3SE(uint32_t Packed) : v(Packed) {} - XMFLOAT3SE(float _x, float _y, float _z); - explicit XMFLOAT3SE(_In_reads_(3) const float *pArray); - - operator uint32_t () const { return v; } - - XMFLOAT3SE& operator= (const XMFLOAT3SE& float3se) { v = float3se.v; return *this; } - XMFLOAT3SE& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 16 bit floating point components -struct XMHALF4 -{ - union - { - struct - { - HALF x; - HALF y; - HALF z; - HALF w; - }; - uint64_t v; - }; - - XMHALF4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMHALF4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMHALF4(_In_reads_(4) const HALF *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMHALF4(float _x, float _y, float _z, float _w); - explicit XMHALF4(_In_reads_(4) const float *pArray); - - XMHALF4& operator= (const XMHALF4& Half4) { x = Half4.x; y = Half4.y; z = Half4.z; w = Half4.w; return *this; } - XMHALF4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 16 bit signed normalized integer components -struct XMSHORTN4 -{ - union - { - struct - { - int16_t x; - int16_t y; - int16_t z; - int16_t w; - }; - uint64_t v; - }; - - XMSHORTN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORTN4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMSHORTN4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMSHORTN4(float _x, float _y, float _z, float _w); - explicit XMSHORTN4(_In_reads_(4) const float *pArray); - - XMSHORTN4& operator= (const XMSHORTN4& ShortN4) { x = ShortN4.x; y = ShortN4.y; z = ShortN4.z; w = ShortN4.w; return *this; } - XMSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 16 bit signed integer components -struct XMSHORT4 -{ - union - { - struct - { - int16_t x; - int16_t y; - int16_t z; - int16_t w; - }; - uint64_t v; - }; - - XMSHORT4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORT4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMSHORT4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMSHORT4(float _x, float _y, float _z, float _w); - explicit XMSHORT4(_In_reads_(4) const float *pArray); - - XMSHORT4& operator= (const XMSHORT4& Short4) { x = Short4.x; y = Short4.y; z = Short4.z; w = Short4.w; return *this; } - XMSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 16 bit unsigned normalized integer components -struct XMUSHORTN4 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - uint16_t z; - uint16_t w; - }; - uint64_t v; - }; - - XMUSHORTN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORTN4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUSHORTN4(uint16_t _x, uint16_t _y, 
uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUSHORTN4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUSHORTN4(float _x, float _y, float _z, float _w); - explicit XMUSHORTN4(_In_reads_(4) const float *pArray); - - XMUSHORTN4& operator= (const XMUSHORTN4& UShortN4) { x = UShortN4.x; y = UShortN4.y; z = UShortN4.z; w = UShortN4.w; return *this; } - XMUSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 16 bit unsigned integer components -struct XMUSHORT4 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - uint16_t z; - uint16_t w; - }; - uint64_t v; - }; - - XMUSHORT4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORT4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUSHORT4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUSHORT4(float _x, float _y, float _z, float _w); - explicit XMUSHORT4(_In_reads_(4) const float *pArray); - - XMUSHORT4& operator= (const XMUSHORT4& UShort4) { x = UShort4.x; y = UShort4.y; z = UShort4.z; w = UShort4.w; return *this; } - XMUSHORT4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, -// normalized integer for the w component and 10 bit signed, normalized -// integers for the z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XMXDECN4 -{ - union - { - struct - { - int32_t x : 10; // -511/511 to 511/511 - int32_t y : 10; // -511/511 to 511/511 - int32_t z : 10; // -511/511 to 511/511 - uint32_t w : 2; // 0/3 to 3/3 - }; - uint32_t v; - }; - - XMXDECN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMXDECN4(uint32_t Packed) : v(Packed) {} - XMXDECN4(float _x, float _y, float _z, float _w); - explicit XMXDECN4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMXDECN4& operator= (const XMXDECN4& XDecN4) { v = XDecN4.v; return *this; } - XMXDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned -// integer for the w component and 10 bit signed integers for the -// z, y, and x components. 
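XMXDECN4 above (10-bit signed-normalized x, y, z plus a 2-bit unsigned w) is a common choice for quantized tangent-space data; a sketch of packing a unit vector and a flag through the helpers declared below (function name and values are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR XDecN4RoundTrip()
{
    XMXDECN4 n;
    // xyz in [-1,1] quantize to the 10-bit SNORM fields; w in [0,1] to the 2-bit field.
    XMStoreXDecN4(&n, XMVectorSet(0.f, 0.707f, 0.707f, 1.f));
    return XMLoadXDecN4(&n);
}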
The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XM_DEPRECATED XMXDEC4 -{ - union - { - struct - { - int32_t x : 10; // -511 to 511 - int32_t y : 10; // -511 to 511 - int32_t z : 10; // -511 to 511 - uint32_t w : 2; // 0 to 3 - }; - uint32_t v; - }; - - XMXDEC4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMXDEC4(uint32_t Packed) : v(Packed) {} - XMXDEC4(float _x, float _y, float _z, float _w); - explicit XMXDEC4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMXDEC4& operator= (const XMXDEC4& XDec4) { v = XDec4.v; return *this; } - XMXDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, -// normalized integer for the w component and 10 bit signed, normalized -// integers for the z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XM_DEPRECATED XMDECN4 -{ - union - { - struct - { - int32_t x : 10; // -511/511 to 511/511 - int32_t y : 10; // -511/511 to 511/511 - int32_t z : 10; // -511/511 to 511/511 - int32_t w : 2; // -1/1 to 1/1 - }; - uint32_t v; - }; - - XMDECN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMDECN4(uint32_t Packed) : v(Packed) {} - XMDECN4(float _x, float _y, float _z, float _w); - explicit XMDECN4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMDECN4& operator= (const XMDECN4& DecN4) { v = DecN4.v; return *this; } - XMDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer -// The 4D Vector is packed into 32 bits as follows: a 2 bit signed, -// integer for the w component and 10 bit signed integers for the -// z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XM_DEPRECATED XMDEC4 -{ - union - { - struct - { - int32_t x : 10; // -511 to 511 - int32_t y : 10; // -511 to 511 - int32_t z : 10; // -511 to 511 - int32_t w : 2; // -1 to 1 - }; - uint32_t v; - }; - - XMDEC4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMDEC4(uint32_t Packed) : v(Packed) {} - XMDEC4(float _x, float _y, float _z, float _w); - explicit XMDEC4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMDEC4& operator= (const XMDEC4& Dec4) { v = Dec4.v; return *this; } - XMDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, -// normalized integer for the w component and 10 bit unsigned, normalized -// integers for the z, y, and x components. 
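The XM_DEPRECATED 10:10:10:2 signed types are retained only for legacy vertex data; they appear to have no DXGI format equivalent, which is why their load/store declarations below sit inside C4996 suppression. Code that still has to read such data can borrow the same pragma pattern (sketch; the function name and packed value are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR ReadLegacyDecN4()
{
#pragma warning(push)
#pragma warning(disable : 4996) // deliberately consuming a deprecated type
    XMDECN4 legacy(0x3FFFFFFFu);
    XMVECTOR v = XMLoadDecN4(&legacy);
#pragma warning(pop)
    return v;
}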
The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XMUDECN4 -{ - union - { - struct - { - uint32_t x : 10; // 0/1023 to 1023/1023 - uint32_t y : 10; // 0/1023 to 1023/1023 - uint32_t z : 10; // 0/1023 to 1023/1023 - uint32_t w : 2; // 0/3 to 3/3 - }; - uint32_t v; - }; - - XMUDECN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUDECN4(uint32_t Packed) : v(Packed) {} - XMUDECN4(float _x, float _y, float _z, float _w); - explicit XMUDECN4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMUDECN4& operator= (const XMUDECN4& UDecN4) { v = UDecN4.v; return *this; } - XMUDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer -// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, -// integer for the w component and 10 bit unsigned integers -// for the z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XMUDEC4 -{ - union - { - struct - { - uint32_t x : 10; // 0 to 1023 - uint32_t y : 10; // 0 to 1023 - uint32_t z : 10; // 0 to 1023 - uint32_t w : 2; // 0 to 3 - }; - uint32_t v; - }; - - XMUDEC4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUDEC4(uint32_t Packed) : v(Packed) {} - XMUDEC4(float _x, float _y, float _z, float _w); - explicit XMUDEC4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMUDEC4& operator= (const XMUDEC4& UDec4) { v = UDec4.v; return *this; } - XMUDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 8 bit signed normalized integer components -struct XMBYTEN4 -{ - union - { - struct - { - int8_t x; - int8_t y; - int8_t z; - int8_t w; - }; - uint32_t v; - }; - - XMBYTEN4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMBYTEN4(uint32_t Packed) : v(Packed) {} - explicit XMBYTEN4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMBYTEN4(float _x, float _y, float _z, float _w); - explicit XMBYTEN4(_In_reads_(4) const float *pArray); - - XMBYTEN4& operator= (const XMBYTEN4& ByteN4) { x = ByteN4.x; y = ByteN4.y; z = ByteN4.z; w = ByteN4.w; return *this; } - XMBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 8 bit signed integer components -struct XMBYTE4 -{ - union - { - struct - { - int8_t x; - int8_t y; - int8_t z; - int8_t w; - }; - uint32_t v; - }; - - XMBYTE4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMBYTE4(uint32_t Packed) : v(Packed) {} - explicit XMBYTE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMBYTE4(float _x, float _y, float _z, float _w); - explicit XMBYTE4(_In_reads_(4) const float *pArray); - - XMBYTE4& operator= (const XMBYTE4& Byte4) { x = Byte4.x; y = Byte4.y; z = Byte4.z; w = Byte4.w; return *this; } - XMBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 8 bit unsigned normalized integer components -struct XMUBYTEN4 -{ - union - { - struct - 
{ - uint8_t x; - uint8_t y; - uint8_t z; - uint8_t w; - }; - uint32_t v; - }; - - XMUBYTEN4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMUBYTEN4(uint32_t Packed) : v(Packed) {} - explicit XMUBYTEN4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUBYTEN4(float _x, float _y, float _z, float _w); - explicit XMUBYTEN4(_In_reads_(4) const float *pArray); - - XMUBYTEN4& operator= (const XMUBYTEN4& UByteN4) { x = UByteN4.x; y = UByteN4.y; z = UByteN4.z; w = UByteN4.w; return *this; } - XMUBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 8 bit unsigned integer components -struct XMUBYTE4 -{ - union - { - struct - { - uint8_t x; - uint8_t y; - uint8_t z; - uint8_t w; - }; - uint32_t v; - }; - - XMUBYTE4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMUBYTE4(uint32_t Packed) : v(Packed) {} - explicit XMUBYTE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUBYTE4(float _x, float _y, float _z, float _w); - explicit XMUBYTE4(_In_reads_(4) const float *pArray); - - XMUBYTE4& operator= (const XMUBYTE4& UByte4) { x = UByte4.x; y = UByte4.y; z = UByte4.z; w = UByte4.w; return *this; } - XMUBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D vector; 4 bit unsigned integer components -struct XMUNIBBLE4 -{ - union - { - struct - { - uint16_t x : 4; // 0 to 15 - uint16_t y : 4; // 0 to 15 - uint16_t z : 4; // 0 to 15 - uint16_t w : 4; // 0 to 15 - }; - uint16_t v; - }; - - XMUNIBBLE4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUNIBBLE4(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUNIBBLE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUNIBBLE4(float _x, float _y, float _z, float _w); - explicit XMUNIBBLE4(_In_reads_(4) const float *pArray); - - operator uint16_t () const { return v; } - - XMUNIBBLE4& operator= (const XMUNIBBLE4& UNibble4) { v = UNibble4.v; return *this; } - XMUNIBBLE4& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D vector: 5/5/5/1 unsigned integer components -struct XMU555 -{ - union - { - struct - { - uint16_t x : 5; // 0 to 31 - uint16_t y : 5; // 0 to 31 - uint16_t z : 5; // 0 to 31 - uint16_t w : 1; // 0 or 1 - }; - uint16_t v; - }; - - XMU555() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMU555(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} - XMU555(_In_reads_(3) const uint8_t *pArray, _In_ bool _w) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} - XMU555(float _x, float _y, float _z, bool _w); - XMU555(_In_reads_(3) const float *pArray, _In_ bool _w); - - operator uint16_t () const { return v; } - - XMU555& operator= (const XMU555& U555) { v = U555.v; return *this; } - XMU555& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -#pragma warning(pop) - - -/**************************************************************************** - * - * Data conversion operations - * - ****************************************************************************/ - -float XMConvertHalfToFloat(HALF Value); -float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(sizeof(HALF)+InputStride*(HalfCount-1)) const HALF* pInputStream, - _In_ size_t InputStride, _In_ size_t HalfCount); -HALF XMConvertFloatToHalf(float Value); -HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, - _In_ size_t InputStride, _In_ size_t FloatCount); - -/**************************************************************************** - * - * Load operations - * - ****************************************************************************/ - -XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource); - -XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource); -XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource); -XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource); -XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource); -XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource); -XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource); -XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource); -XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource); -XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource); - -XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource); -XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource); -XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource); - -XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource); -XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource); -XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource); -XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource); -XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource); -XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource); -XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource); -XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource); -XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource); -XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource); - -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource); -XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource); -XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ 
const XMXDEC4* pSource); -#pragma warning(pop) - -/**************************************************************************** - * - * Store operations - * - ****************************************************************************/ - -void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V); - -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V); -void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V); -void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V); -#pragma warning(pop) - -/**************************************************************************** - * - * Implementation - * - ****************************************************************************/ - -#pragma warning(push) -#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) -// C4068/4616: ignore unknown pragmas -// C4214/4204: nonstandard extension used -// C4365: Off by default noise -// C6001/6101: False positives - -#pragma prefast(push) -#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") - -#include "DirectXPackedVector.inl" - -#pragma prefast(pop) -#pragma warning(pop) - -}; // namespace PackedVector - -}; // 
namespace DirectX - +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + +namespace PackedVector +{ + +#pragma warning(push) +#pragma warning(disable:4201 4365 4324) +// C4201: nonstandard extension used +// C4365: Off by default noise +// C4324: alignment padding warnings + +//------------------------------------------------------------------------------ +// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into +// a 32 bit integer. The normalized color is packed into 32 bits using 8 bit +// unsigned, normalized integers for the alpha, red, green, and blue components. +// The alpha component is stored in the most significant bits and the blue +// component in the least significant bits (A8R8G8B8): +// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] +struct XMCOLOR +{ + union + { + struct + { + uint8_t b; // Blue: 0/255 to 255/255 + uint8_t g; // Green: 0/255 to 255/255 + uint8_t r; // Red: 0/255 to 255/255 + uint8_t a; // Alpha: 0/255 to 255/255 + }; + uint32_t c; + }; + + XMCOLOR() XM_CTOR_DEFAULT + XM_CONSTEXPR XMCOLOR(uint32_t Color) : c(Color) {} + XMCOLOR(float _r, float _g, float _b, float _a); + explicit XMCOLOR(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return c; } + + XMCOLOR& operator= (const XMCOLOR& Color) { c = Color.c; return *this; } + XMCOLOR& operator= (const uint32_t Color) { c = Color; return *this; } +}; + +//------------------------------------------------------------------------------ +// 16 bit floating point number consisting of a sign bit, a 5 bit biased +// exponent, and a 10 bit mantissa +typedef uint16_t HALF; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit floating point components +struct XMHALF2 +{ + union + { + struct + { + HALF x; + HALF y; + }; + uint32_t v; + }; + + XMHALF2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMHALF2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {} + explicit XMHALF2(_In_reads_(2) const HALF *pArray) : x(pArray[0]), y(pArray[1]) {} + XMHALF2(float _x, float _y); + explicit XMHALF2(_In_reads_(2) const float *pArray); + + XMHALF2& operator= (const XMHALF2& Half2) { x = Half2.x; y = Half2.y; return *this; } + XMHALF2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit signed normalized integer components +struct XMSHORTN2 +{ + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORTN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORTN2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORTN2(int16_t _x, int16_t _y) : x(_x), y(_y) {} + explicit XMSHORTN2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMSHORTN2(float _x, float _y); + explicit XMSHORTN2(_In_reads_(2) const float *pArray); + + XMSHORTN2& operator= (const XMSHORTN2& ShortN2) 
{ x = ShortN2.x; y = ShortN2.y; return *this; } + XMSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit signed integer components +struct XMSHORT2 +{ + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORT2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORT2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORT2(int16_t _x, int16_t _y) : x(_x), y(_y) {} + explicit XMSHORT2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMSHORT2(float _x, float _y); + explicit XMSHORT2(_In_reads_(2) const float *pArray); + + XMSHORT2& operator= (const XMSHORT2& Short2) { x = Short2.x; y = Short2.y; return *this; } + XMSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit unsigned normalized integer components +struct XMUSHORTN2 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORTN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORTN2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORTN2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} + explicit XMUSHORTN2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUSHORTN2(float _x, float _y); + explicit XMUSHORTN2(_In_reads_(2) const float *pArray); + + XMUSHORTN2& operator= (const XMUSHORTN2& UShortN2) { x = UShortN2.x; y = UShortN2.y; return *this; } + XMUSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit unsigned integer components +struct XMUSHORT2 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORT2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORT2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORT2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} + explicit XMUSHORT2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUSHORT2(float _x, float _y); + explicit XMUSHORT2(_In_reads_(2) const float *pArray); + + XMUSHORT2& operator= (const XMUSHORT2& UShort2) { x = UShort2.x; y = UShort2.y; return *this; } + XMUSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 8 bit signed normalized integer components +struct XMBYTEN2 +{ + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTEN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMBYTEN2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMBYTEN2(int8_t _x, int8_t _y) : x(_x), y(_y) {} + explicit XMBYTEN2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMBYTEN2(float _x, float _y); + explicit XMBYTEN2(_In_reads_(2) const float *pArray); + + XMBYTEN2& operator= (const XMBYTEN2& ByteN2) { x = ByteN2.x; y = ByteN2.y; return *this; } + XMBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit signed integer components +struct XMBYTE2 +{ + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTE2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMBYTE2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMBYTE2(int8_t _x, int8_t _y) : x(_x), y(_y) {} + explicit XMBYTE2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMBYTE2(float _x, float _y); + explicit XMBYTE2(_In_reads_(2) const float *pArray); + + XMBYTE2& operator= (const XMBYTE2& Byte2) { x = Byte2.x; y = Byte2.y; return *this; } + XMBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit unsigned 
normalized integer components +struct XMUBYTEN2 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTEN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUBYTEN2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUBYTEN2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} + explicit XMUBYTEN2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUBYTEN2(float _x, float _y); + explicit XMUBYTEN2(_In_reads_(2) const float *pArray); + + XMUBYTEN2& operator= (const XMUBYTEN2& UByteN2) { x = UByteN2.x; y = UByteN2.y; return *this; } + XMUBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit unsigned integer components +struct XMUBYTE2 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTE2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUBYTE2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUBYTE2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} + explicit XMUBYTE2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUBYTE2(float _x, float _y); + explicit XMUBYTE2(_In_reads_(2) const float *pArray); + + XMUBYTE2& operator= (const XMUBYTE2& UByte2) { x = UByte2.x; y = UByte2.y; return *this; } + XMUBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 5/6/5 unsigned integer components +struct XMU565 +{ + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 6; // 0 to 63 + uint16_t z : 5; // 0 to 31 + }; + uint16_t v; + }; + + XMU565() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMU565(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMU565(uint8_t _x, uint8_t _y, uint8_t _z) : x(_x), y(_y), z(_z) {} + explicit XMU565(_In_reads_(3) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + XMU565(float _x, float _y, float _z); + explicit XMU565(_In_reads_(3) const float *pArray); + + operator uint16_t () const { return v; } + + XMU565& operator= (const XMU565& U565) { v = U565.v; return *this; } + XMU565& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 11/11/10 floating-point components +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// and 6-bit mantissa for x component, a 5-bit biased exponent and +// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit +// mantissa for z. The z component is stored in the most significant bits +// and the x component in the least significant bits. No sign bits so +// all partial-precision numbers are positive. 
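With MSVC's low-to-high bit-field allocation (which this header relies on), XMU565 places x in the least significant five bits and z in the top five, matching a 16-bit 5:6:5 color word. A quick check with illustrative values:

#include "DirectXPackedVector.h"
#include <cassert>
using namespace DirectX::PackedVector;

void U565Layout()
{
    XMU565 c(31, 63, 31);  // x:5 = 0x1F, y:6 = 0x3F, z:5 = 0x1F
    assert(c.v == 0xFFFF); // 0x1F | (0x3F << 5) | (0x1F << 11)
}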
+// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] +struct XMFLOAT3PK +{ + union + { + struct + { + uint32_t xm : 6; // x-mantissa + uint32_t xe : 5; // x-exponent + uint32_t ym : 6; // y-mantissa + uint32_t ye : 5; // y-exponent + uint32_t zm : 5; // z-mantissa + uint32_t ze : 5; // z-exponent + }; + uint32_t v; + }; + + XMFLOAT3PK() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMFLOAT3PK(uint32_t Packed) : v(Packed) {} + XMFLOAT3PK(float _x, float _y, float _z); + explicit XMFLOAT3PK(_In_reads_(3) const float *pArray); + + operator uint32_t () const { return v; } + + XMFLOAT3PK& operator= (const XMFLOAT3PK& float3pk) { v = float3pk.v; return *this; } + XMFLOAT3PK& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// with 9-bit mantissa for the x, y, and z component. The shared exponent +// is stored in the most significant bits and the x component mantissa is in +// the least significant bits. No sign bits so all partial-precision numbers +// are positive. +// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] +struct XMFLOAT3SE +{ + union + { + struct + { + uint32_t xm : 9; // x-mantissa + uint32_t ym : 9; // y-mantissa + uint32_t zm : 9; // z-mantissa + uint32_t e : 5; // shared exponent + }; + uint32_t v; + }; + + XMFLOAT3SE() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMFLOAT3SE(uint32_t Packed) : v(Packed) {} + XMFLOAT3SE(float _x, float _y, float _z); + explicit XMFLOAT3SE(_In_reads_(3) const float *pArray); + + operator uint32_t () const { return v; } + + XMFLOAT3SE& operator= (const XMFLOAT3SE& float3se) { v = float3se.v; return *this; } + XMFLOAT3SE& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit floating point components +struct XMHALF4 +{ + union + { + struct + { + HALF x; + HALF y; + HALF z; + HALF w; + }; + uint64_t v; + }; + + XMHALF4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMHALF4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMHALF4(_In_reads_(4) const HALF *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMHALF4(float _x, float _y, float _z, float _w); + explicit XMHALF4(_In_reads_(4) const float *pArray); + + XMHALF4& operator= (const XMHALF4& Half4) { x = Half4.x; y = Half4.y; z = Half4.z; w = Half4.w; return *this; } + XMHALF4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit signed normalized integer components +struct XMSHORTN4 +{ + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORTN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORTN4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORTN4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORTN4(float _x, float _y, float _z, float _w); + explicit XMSHORTN4(_In_reads_(4) const float *pArray); + + XMSHORTN4& operator= (const XMSHORTN4& ShortN4) { x = ShortN4.x; y = ShortN4.y; z = ShortN4.z; w = ShortN4.w; 
return *this; } + XMSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit signed integer components +struct XMSHORT4 +{ + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORT4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORT4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORT4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORT4(float _x, float _y, float _z, float _w); + explicit XMSHORT4(_In_reads_(4) const float *pArray); + + XMSHORT4& operator= (const XMSHORT4& Short4) { x = Short4.x; y = Short4.y; z = Short4.z; w = Short4.w; return *this; } + XMSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit unsigned normalized integer components +struct XMUSHORTN4 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORTN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORTN4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORTN4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORTN4(float _x, float _y, float _z, float _w); + explicit XMUSHORTN4(_In_reads_(4) const float *pArray); + + XMUSHORTN4& operator= (const XMUSHORTN4& UShortN4) { x = UShortN4.x; y = UShortN4.y; z = UShortN4.z; w = UShortN4.w; return *this; } + XMUSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit unsigned integer components +struct XMUSHORT4 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORT4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORT4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORT4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORT4(float _x, float _y, float _z, float _w); + explicit XMUSHORT4(_In_reads_(4) const float *pArray); + + XMUSHORT4& operator= (const XMUSHORT4& UShort4) { x = UShort4.x; y = UShort4.y; z = UShort4.z; w = UShort4.w; return *this; } + XMUSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. 
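Like their 2D counterparts, these 16-bit 4D types round-trip through XMVECTOR with the loads and stores declared later in this header; for the UNORM flavor, [0,1] maps onto the full 16-bit range per lane (sketch; function name and values are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR UShortN4RoundTrip()
{
    XMUSHORTN4 q;
    XMStoreUShortN4(&q, XMVectorSet(0.f, 0.25f, 0.5f, 1.f));
    return XMLoadUShortN4(&q); // lanes come back quantized to 1/65535 steps
}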
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMXDECN4 +{ + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMXDECN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMXDECN4(uint32_t Packed) : v(Packed) {} + XMXDECN4(float _x, float _y, float _z, float _w); + explicit XMXDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMXDECN4& operator= (const XMXDECN4& XDecN4) { v = XDecN4.v; return *this; } + XMXDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XM_DEPRECATED XMXDEC4 +{ + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMXDEC4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMXDEC4(uint32_t Packed) : v(Packed) {} + XMXDEC4(float _x, float _y, float _z, float _w); + explicit XMXDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMXDEC4& operator= (const XMXDEC4& XDec4) { v = XDec4.v; return *this; } + XMXDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XM_DEPRECATED XMDECN4 +{ + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + int32_t w : 2; // -1/1 to 1/1 + }; + uint32_t v; + }; + + XMDECN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMDECN4(uint32_t Packed) : v(Packed) {} + XMDECN4(float _x, float _y, float _z, float _w); + explicit XMDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMDECN4& operator= (const XMDECN4& DecN4) { v = DecN4.v; return *this; } + XMDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XM_DEPRECATED XMDEC4 +{ + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + int32_t w : 2; // -1 to 1 + }; + uint32_t v; + }; + + XMDEC4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMDEC4(uint32_t Packed) : v(Packed) {} + XMDEC4(float _x, float _y, float _z, float _w); + explicit XMDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMDEC4& operator= (const XMDEC4& Dec4) { v = Dec4.v; return *this; } + XMDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit unsigned, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMUDECN4 +{ + union + { + struct + { + uint32_t x : 10; // 0/1023 to 1023/1023 + uint32_t y : 10; // 0/1023 to 1023/1023 + uint32_t z : 10; // 0/1023 to 1023/1023 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMUDECN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUDECN4(uint32_t Packed) : v(Packed) {} + XMUDECN4(float _x, float _y, float _z, float _w); + explicit XMUDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMUDECN4& operator= (const XMUDECN4& UDecN4) { v = UDecN4.v; return *this; } + XMUDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// integer for the w component and 10 bit unsigned integers +// for the z, y, and x components. 
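For XMUDECN4's 10-bit UNORM fields the quantization is q = round(f * 1023), assuming round-to-nearest, so f = 0.5 stores 512 and reloads as 512/1023 ≈ 0.50049. A sketch with illustrative values (function name is hypothetical):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR UDecN4RoundTrip()
{
    XMUDECN4 q;
    XMStoreUDecN4(&q, XMVectorSet(0.5f, 0.f, 1.f, 1.f)); // w uses the 2-bit field
    return XMLoadUDecN4(&q); // x reloads as 512/1023, not exactly 0.5
}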
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMUDEC4 +{ + union + { + struct + { + uint32_t x : 10; // 0 to 1023 + uint32_t y : 10; // 0 to 1023 + uint32_t z : 10; // 0 to 1023 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMUDEC4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUDEC4(uint32_t Packed) : v(Packed) {} + XMUDEC4(float _x, float _y, float _z, float _w); + explicit XMUDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMUDEC4& operator= (const XMUDEC4& UDec4) { v = UDec4.v; return *this; } + XMUDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 8 bit signed normalized integer components +struct XMBYTEN4 +{ + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTEN4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR XMBYTEN4(uint32_t Packed) : v(Packed) {} + explicit XMBYTEN4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTEN4(float _x, float _y, float _z, float _w); + explicit XMBYTEN4(_In_reads_(4) const float *pArray); + + XMBYTEN4& operator= (const XMBYTEN4& ByteN4) { x = ByteN4.x; y = ByteN4.y; z = ByteN4.z; w = ByteN4.w; return *this; } + XMBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit signed integer components +struct XMBYTE4 +{ + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTE4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR XMBYTE4(uint32_t Packed) : v(Packed) {} + explicit XMBYTE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTE4(float _x, float _y, float _z, float _w); + explicit XMBYTE4(_In_reads_(4) const float *pArray); + + XMBYTE4& operator= (const XMBYTE4& Byte4) { x = Byte4.x; y = Byte4.y; z = Byte4.z; w = Byte4.w; return *this; } + XMBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit unsigned normalized integer components +struct XMUBYTEN4 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTEN4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR XMUBYTEN4(uint32_t Packed) : v(Packed) {} + explicit XMUBYTEN4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTEN4(float _x, float _y, float _z, float _w); + explicit XMUBYTEN4(_In_reads_(4) const float *pArray); + + XMUBYTEN4& operator= (const XMUBYTEN4& UByteN4) { x = UByteN4.x; y = UByteN4.y; z = UByteN4.z; w = UByteN4.w; return *this; } + XMUBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit unsigned integer components +struct XMUBYTE4 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTE4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR 
XMUBYTE4(uint32_t Packed) : v(Packed) {} + explicit XMUBYTE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTE4(float _x, float _y, float _z, float _w); + explicit XMUBYTE4(_In_reads_(4) const float *pArray); + + XMUBYTE4& operator= (const XMUBYTE4& UByte4) { x = UByte4.x; y = UByte4.y; z = UByte4.z; w = UByte4.w; return *this; } + XMUBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D vector; 4 bit unsigned integer components +struct XMUNIBBLE4 +{ + union + { + struct + { + uint16_t x : 4; // 0 to 15 + uint16_t y : 4; // 0 to 15 + uint16_t z : 4; // 0 to 15 + uint16_t w : 4; // 0 to 15 + }; + uint16_t v; + }; + + XMUNIBBLE4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUNIBBLE4(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUNIBBLE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUNIBBLE4(float _x, float _y, float _z, float _w); + explicit XMUNIBBLE4(_In_reads_(4) const float *pArray); + + operator uint16_t () const { return v; } + + XMUNIBBLE4& operator= (const XMUNIBBLE4& UNibble4) { v = UNibble4.v; return *this; } + XMUNIBBLE4& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D vector: 5/5/5/1 unsigned integer components +struct XMU555 +{ + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 5; // 0 to 31 + uint16_t z : 5; // 0 to 31 + uint16_t w : 1; // 0 or 1 + }; + uint16_t v; + }; + + XMU555() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMU555(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} + XMU555(_In_reads_(3) const uint8_t *pArray, _In_ bool _w) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} + XMU555(float _x, float _y, float _z, bool _w); + XMU555(_In_reads_(3) const float *pArray, _In_ bool _w); + + operator uint16_t () const { return v; } + + XMU555& operator= (const XMU555& U555) { v = U555.v; return *this; } + XMU555& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +#pragma warning(pop) + + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +float XMConvertHalfToFloat(HALF Value); +float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(HALF)+InputStride*(HalfCount-1)) const HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount); +HALF XMConvertFloatToHalf(float Value); +HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount); + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource); + +XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource); +XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource); +XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource); +XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource); +XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource); +XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource); +XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource); +XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource); +XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource); + +XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource); + +XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource); +XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource); +XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource); +XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource); +XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource); +XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource); +XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource); +XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource); +XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource); +XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource); + +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource); +XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource); +XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ 
const XMXDEC4* pSource); +#pragma warning(pop) + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V); + +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V); +void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V); +void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V); +#pragma warning(pop) + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) +// C4068/4616: ignore unknown pragmas +// C4214/4204: nonstandard extension used +// C4365: Off by default noise +// C6001/6101: False positives + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#include "DirectXPackedVector.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace PackedVector + +}; // 
namespace DirectX + diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl index 4713db8..b60eafd 100644 --- a/Inc/DirectXPackedVector.inl +++ b/Inc/DirectXPackedVector.inl @@ -1,4368 +1,4368 @@ -//------------------------------------------------------------------------------------- -// DirectXPackedVector.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Data conversion - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline float PackedVector::XMConvertHalfToFloat -( - HALF Value -) -{ -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); - __m128 V2 = _mm_cvtph_ps( V1 ); - return _mm_cvtss_f32( V2 ); -#else - uint32_t Mantissa = (uint32_t)(Value & 0x03FF); - - uint32_t Exponent = (Value & 0x7C00); - if ( Exponent == 0x7C00 ) // INF/NAN - { - Exponent = (uint32_t)0x8f; - } - else if (Exponent != 0) // The value is normalized - { - Exponent = (uint32_t)((Value >> 10) & 0x1F); - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x0400) == 0); - - Mantissa &= 0x03FF; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - uint32_t Result = ((Value & 0x8000) << 16) | // Sign - ((Exponent + 112) << 23) | // Exponent - (Mantissa << 13); // Mantissa - - return reinterpret_cast(&Result)[0]; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -#pragma prefast(push) -#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) - -_Use_decl_annotations_ -inline float* PackedVector::XMConvertHalfToFloatStream -( - float* pOutputStream, - size_t OutputStride, - const HALF* pInputStream, - size_t InputStride, - size_t HalfCount -) -{ - assert(pOutputStream); - assert(pInputStream); - - assert(InputStride >= sizeof(HALF)); - _Analysis_assume_(InputStride >= sizeof(HALF)); - - assert(OutputStride >= sizeof(float)); - _Analysis_assume_(OutputStride >= sizeof(float)); - -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - const uint8_t* pHalf = reinterpret_cast(pInputStream); - uint8_t* pFloat = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = HalfCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(HALF)) - { - if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - XM_STREAM_PS( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( 
reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_store_ss( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); - pFloat += OutputStride; - i += 4; - } - } - } - else if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Scattered input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - XM_STREAM_PS( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - else - { - // Scattered input, scattered output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16(HV, H1, 0); - HV = _mm_insert_epi16(HV, H2, 1); - HV = _mm_insert_epi16(HV, H3, 2); - HV = _mm_insert_epi16(HV, H4, 3); - __m128 FV = _mm_cvtph_ps(HV); - - _mm_store_ss(reinterpret_cast(pFloat), FV); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); - pFloat += OutputStride; - i += 4; - } - } - } - - for (; i < HalfCount; ++i) - { - *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); - pHalf += InputStride; - pFloat += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#else - const uint8_t* pHalf = reinterpret_cast(pInputStream); - uint8_t* pFloat = reinterpret_cast(pOutputStream); - - for (size_t i = 0; i < HalfCount; i++) - { - *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); - pHalf += 
InputStride; - pFloat += OutputStride; - } - - return pOutputStream; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ - -inline PackedVector::HALF PackedVector::XMConvertFloatToHalf -( - float Value -) -{ -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128 V1 = _mm_set_ss( Value ); - __m128i V2 = _mm_cvtps_ph( V1, 0 ); - return static_cast( _mm_cvtsi128_si32(V2) ); -#else - uint32_t Result; - - uint32_t IValue = reinterpret_cast(&Value)[0]; - uint32_t Sign = (IValue & 0x80000000U) >> 16U; - IValue = IValue & 0x7FFFFFFFU; // Hack off the sign - - if (IValue > 0x477FE000U) - { - // The number is too large to be represented as a half. Saturate to infinity. - if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF ) != 0)) - { - Result = 0x7FFF; // NAN - } - else - { - Result = 0x7C00U; // INF - } - } - else - { - if (IValue < 0x38800000U) - { - // The number is too small to be represented as a normalized half. - // Convert it to a denormalized value. - uint32_t Shift = 113U - (IValue >> 23U); - IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized half. - IValue += 0xC8000000U; - } - - Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; - } - return (HALF)(Result|Sign); -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream -( - HALF* pOutputStream, - size_t OutputStride, - const float* pInputStream, - size_t InputStride, - size_t FloatCount -) -{ - assert(pOutputStream); - assert(pInputStream); - - assert(InputStride >= sizeof(float)); - _Analysis_assume_(InputStride >= sizeof(float)); - - assert(OutputStride >= sizeof(HALF)); - _Analysis_assume_(OutputStride >= sizeof(HALF)); - -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - const uint8_t* pFloat = reinterpret_cast(pInputStream); - uint8_t* pHalf = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = FloatCount >> 2; - if (four > 0) - { - if (InputStride == sizeof(float)) - { - if (OutputStride == sizeof(HALF)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned and packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - } - else - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned & packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = 
static_cast<HALF>( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - } - } - else if (OutputStride == sizeof(HALF)) - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); - __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); - FV = _mm_blend_ps( FV, FT, 0xC ); - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - else - { - // Scattered input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); - __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); - FV = _mm_blend_ps(FV, FT, 0xC); - - __m128i HV = _mm_cvtps_ph(FV, 0); - - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0)); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1)); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2)); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3)); - pHalf += OutputStride; - i += 4; - } - } - } - - for (; i < FloatCount; ++i) - { - *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]); - pFloat += InputStride; - pHalf += OutputStride; - } - - return pOutputStream; -#else - const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream); - uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream); - - for (size_t i = 0; i < FloatCount; i++) - { - *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]); - pFloat += InputStride; - pHalf += OutputStride; - } - return pOutputStream; -#endif // !_XM_F16C_INTRINSICS_ -} - -#pragma prefast(pop)
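A usage sketch of the two stream converters above (illustrative, not part of the library): the byte strides let them walk interleaved data in place, so compressing an array of vertex positions takes one call per component. The Vertex layout and the CompressPositions name are hypothetical.

#include <DirectXPackedVector.h>

struct Vertex { float x, y, z; uint32_t color; };   // hypothetical interleaved layout

inline void CompressPositions(_In_reads_(count) const Vertex* verts,
                              _Out_writes_(count*3) DirectX::PackedVector::HALF* out,
                              size_t count)
{
    using namespace DirectX::PackedVector;
    // Output is packed x,y,z triples (6-byte stride); the input pointer
    // advances by the full Vertex stride within each component stream.
    XMConvertFloatToHalfStream(out,     sizeof(HALF)*3, &verts[0].x, sizeof(Vertex), count);
    XMConvertFloatToHalfStream(out + 1, sizeof(HALF)*3, &verts[0].y, sizeof(Vertex), count);
    XMConvertFloatToHalfStream(out + 2, sizeof(HALF)*3, &verts[0].z, sizeof(Vertex), count);
}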
- -/**************************************************************************** - * - * Vector and matrix load operations - * - ****************************************************************************/ -#pragma prefast(push) -#pragma prefast(disable:28931, "PREfast noise: Esp:1266") - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadColor -( - const XMCOLOR* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - // int32_t -> Float conversions are done in one instruction. - // uint32_t -> Float calls a runtime function. Keep in int32_t - int32_t iColor = (int32_t)(pSource->c); - XMVECTORF32 vColor = { - (float)((iColor >> 16) & 0xFF) * (1.0f/255.0f), - (float)((iColor >> 8) & 0xFF) * (1.0f/255.0f), - (float)(iColor & 0xFF) * (1.0f/255.0f), - (float)((iColor >> 24) & 0xFF) * (1.0f/255.0f) - }; - return vColor.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32_t bgra = pSource->c; - uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); - uint32x2_t vInt8 = vdup_n_u32(rgba); - uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); - uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); - float32x4_t R = vcvtq_f32_u32(vInt); - return vmulq_n_f32( R, 1.0f/255.0f ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - __m128i vInt = _mm_set1_epi32(pSource->c); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); - // a is unsigned! Flip the bit to convert the order to signed - vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); - // Convert to floating point numbers - XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); - // Convert 0-255 to 0.0f-1.0f - return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2 -( - const XMHALF2* pSource -) -{ - assert(pSource); -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); - return _mm_cvtph_ps( _mm_castps_si128( V ) ); -#else - XMVECTORF32 vResult = { - XMConvertHalfToFloat(pSource->x), - XMConvertHalfToFloat(pSource->y), - 0.0f, - 0.0f - }; - return vResult.v; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN2 -( - const XMSHORTN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), - (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) ); - int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_s32(vInt); - R = vmulq_n_f32( R, 1.0f/32767.0f ); - return vmaxq_f32( R, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // x needs to be sign extended - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x - 0x8000 to undo the signed order.
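// A one-lane scalar sketch of the XOR/add trick used here (illustration
// only, not library code): flipping the sign bit reorders a signed 16-bit
// field as an unsigned 0..65535 value, so a single signed int->float
// conversion plus a bias subtraction recovers it:
//
//     uint32_t flipped = lane16 ^ 0x8000u;    // reorder as 0..65535
//     float f = (float)(int32_t)flipped;      // one signed conversion
//     f -= 32768.0f;                          // undo the bias
//
// e.g. lane16 = 0xFFFF (-1 as int16): flipped = 0x7FFF = 32767, and
// 32767.0f - 32768.0f = -1.0f. The add below applies the same bias fix.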
- vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); - // Convert -1.0f - 1.0f - vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); - // Clamp result (for case of -32768) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort2 -( - const XMSHORT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.f, - 0.f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); - int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - return vcvtq_f32_s32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // x needs to be sign extended - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x - 0x8000 to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); - // Y is 65536 too large - return _mm_mul_ps(vTemp,g_XMFixupY16); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN2 -( - const XMUSHORTN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x / 65535.0f, - (float)pSource->y / 65535.0f, - 0.f, - 0.f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); - uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_u32(vInt); - R = vmulq_n_f32( R, 1.0f/65535.0f ); - return vmaxq_f32( R, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; - static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // y needs to be sign flipped - vTemp = _mm_xor_ps(vTemp,g_XMFlipY); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // y + 0x8000 to undo the signed order. 
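// Note on the fix-up constants that follow (illustrative): the y field was
// masked in place in bits 16..31, so after conversion the float holds
// y*65536. The FixupY16 multiply folds that implicit shift into the
// normalization in a single operation:
//
//     (float)(y << 16) * (1.0f/(65535.0f*65536.0f))  ->  y / 65535.0f
//
// up to float rounding, e.g. y = 65535 maps back to 1.0f with no separate
// shift instruction needed.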
- vTemp = _mm_add_ps(vTemp,FixaddY16); - // Y is 65536 times too large - vTemp = _mm_mul_ps(vTemp,FixupY16); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort2 -( - const XMUSHORT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.f, - 0.f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); - uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - return vcvtq_f32_u32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // y needs to be sign flipped - vTemp = _mm_xor_ps(vTemp,g_XMFlipY); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // Y is 65536 times too large - vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); - // y + 0x8000 to undo the signed order. - vTemp = _mm_add_ps(vTemp,FixaddY16); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN2 -( - const XMBYTEN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (pSource->x == -128) ? -1.f : ((float)pSource->x * (1.0f/127.0f)), - (pSource->y == -128) ? -1.f : ((float)pSource->y * (1.0f/127.0f)), - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); - int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); - int32x4_t vInt = vmovl_s16( vget_low_s16( vInt16 ) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_s32(vInt); - R = vmulq_n_f32( R, 1.0f/127.0f ); - return vmaxq_f32( R, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f/127.0f,1.0f/(127.0f*256.0f),0,0}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // x,y and z are unsigned! 
Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x, y and z - 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddByte4); - // Fix y, z and w because they are too large - vTemp = _mm_mul_ps(vTemp,Scale); - // Clamp result (for case of -128) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte2 -( - const XMBYTE2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); - int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); - int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - return vcvtq_f32_s32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // x,y and z are unsigned! Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x, y and z - 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddByte4); - // Fix y, z and w because they are too large - return _mm_mul_ps(vTemp,Scale); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN2 -( - const XMUBYTEN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x * (1.0f/255.0f), - (float)pSource->y * (1.0f/255.0f), - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); - uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); - uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_u32(vInt); - return vmulq_n_f32( R, 1.0f/255.0f ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f/255.0f,1.0f/(255.0f*256.0f),0,0}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // w is signed! 
Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // w + 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Fix y because it is too large - return _mm_mul_ps(vTemp,Scale); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte2 -( - const XMUBYTE2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) ); - uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); - uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - return vcvtq_f32_u32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,0,0}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // w is signed! Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // w + 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Fix y because it is too large - return _mm_mul_ps(vTemp,Scale); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU565 -( - const XMU565* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - float(pSource->v & 0x1F), - float((pSource->v >> 5) & 0x3F), - float((pSource->v >> 11) & 0x1F), - 0.f, - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; - static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; - uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) ); - uint32x4_t vInt = vmovl_u16( vInt16 ); - vInt = vandq_u32(vInt,U565And); - float32x4_t R = vcvtq_f32_u32(vInt); - return vmulq_f32(R,U565Mul); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; - static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; - // Get the 32 bit value and splat it - XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v)); - // Mask off x, y and z - vResult = _mm_and_ps(vResult,U565And); - // Convert to float - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Normalize x, y, and z - vResult = _mm_mul_ps(vResult,U565Mul); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3PK -( - const XMFLOAT3PK* pSource -) -{ - assert(pSource); - - __declspec(align(16)) uint32_t Result[4]; - uint32_t Mantissa; - uint32_t Exponent; - - // X Channel (6-bit mantissa) - Mantissa = pSource->xm; - - if ( pSource->xe == 0x1f ) // INF or NAN - { - Result[0] = 0x7f800000 | (pSource->xm << 17); - } - else - { - if ( pSource->xe != 0 ) // The value is normalized - { - Exponent = pSource->xe;
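// Illustrative note (not library code): XMFLOAT3PK is the 11:11:10
// packed-float layout; x and y carry a 6-bit mantissa with a 5-bit biased
// exponent, z a 5-bit mantissa with a 5-bit exponent. Each field becomes a
// full float32 by rebasing the exponent from bias 15 to bias 127 (the +112
// below) and left-justifying the mantissa in float32's 23-bit field
// (<<17 for a 6-bit mantissa, <<18 for 5 bits). For example, xe = 15 and
// xm = 0x20 give:
//
//     ((15 + 112) << 23) | (0x20 << 17) = 0x3FC00000 = 1.5f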
- } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x40) == 0); - - Mantissa &= 0x3F; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); - } - - // Y Channel (6-bit mantissa) - Mantissa = pSource->ym; - - if ( pSource->ye == 0x1f ) // INF or NAN - { - Result[1] = 0x7f800000 | (pSource->ym << 17); - } - else - { - if ( pSource->ye != 0 ) // The value is normalized - { - Exponent = pSource->ye; - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x40) == 0); - - Mantissa &= 0x3F; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); - } - - // Z Channel (5-bit mantissa) - Mantissa = pSource->zm; - - if ( pSource->ze == 0x1f ) // INF or NAN - { - Result[2] = 0x7f800000 | (pSource->zm << 17); - } - else - { - if ( pSource->ze != 0 ) // The value is normalized - { - Exponent = pSource->ze; - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x20) == 0); - - Mantissa &= 0x1F; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); - } - - return XMLoadFloat3A( reinterpret_cast(&Result) ); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3SE -( - const XMFLOAT3SE* pSource -) -{ - assert(pSource); - - union { float f; int32_t i; } fi; - fi.i = 0x33800000 + (pSource->e << 23); - float Scale = fi.f; - - XMVECTORF32 v = { - Scale * float( pSource->xm ), - Scale * float( pSource->ym ), - Scale * float( pSource->zm ), - 1.0f }; - return v; -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4 -( - const XMHALF4* pSource -) -{ - assert(pSource); -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); - return _mm_cvtph_ps( V ); -#else - XMVECTORF32 vResult = { - XMConvertHalfToFloat(pSource->x), - XMConvertHalfToFloat(pSource->y), - XMConvertHalfToFloat(pSource->z), - XMConvertHalfToFloat(pSource->w) - }; - return vResult.v; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN4 -( - const XMSHORTN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), - (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), - (pSource->z == -32768) ? -1.f : ((float)pSource->z * (1.0f/32767.0f)), - (pSource->w == -32768) ? 
-1.f : ((float)pSource->w * (1.0f/32767.0f)) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int16x4_t vInt = vld1_s16( (const int16_t*)pSource ); - int32x4_t V = vmovl_s16( vInt ); - V = vcvtq_f32_s32( V ); - V = vmulq_n_f32( V, 1.0f/32767.0f ); - return vmaxq_f32( V, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // x and z are unsigned! Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x and z - 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); - // Convert to -1.0f - 1.0f - vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16); - // Very important! The entries are x,z,y,w, flip it to x,y,z,w - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); - // Clamp result (for case of -32768) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort4 -( - const XMSHORT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - (float)pSource->z, - (float)pSource->w - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int16x4_t vInt = vld1_s16( (const int16_t*)pSource ); - int32x4_t V = vmovl_s16( vInt ); - return vcvtq_f32_s32( V ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // x and z are unsigned! Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x and z - 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); - // Fix y and w because they are 65536 too large - vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); - // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN4 -( - const XMUSHORTN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x / 65535.0f, - (float)pSource->y / 65535.0f, - (float)pSource->z / 65535.0f, - (float)pSource->w / 65535.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource ); - uint32x4_t V = vmovl_u16( vInt ); - V = vcvtq_f32_u32( V ); - return vmulq_n_f32( V, 1.0f/65535.0f ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)}; - static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f}; - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // y and w are signed! Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // y and w + 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,FixaddY16W16); - // Fix y and w because they are 65536 too large - vTemp = _mm_mul_ps(vTemp,FixupY16W16); - // Very important! The entries are x,z,y,w, flip it to x,y,z,w - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort4 -( - const XMUSHORT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - (float)pSource->z, - (float)pSource->w - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource ); - uint32x4_t V = vmovl_u16( vInt ); - return vcvtq_f32_u32( V ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f}; - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // y and w are signed! Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // Fix y and w because they are 65536 too large - vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); - // y and w + 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,FixaddY16W16); - // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDecN4 -( - const XMXDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), - (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), - (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), - (float)(pSource->v >> 30) / 3.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskA2B10G10R10); - vInt = veorq_u32(vInt,g_XMFlipA2B10G10R10); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R,g_XMFixAA2B10G10R10); - R = vmulq_f32(R,g_XMNormalizeA2B10G10R10); - return vmaxq_f32( R, vdupq_n_f32(-1.0f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. 
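// The SignExtend table in the scalar path above is branch-free
// two's-complement sign extension of a 10-bit field; a standalone sketch
// (illustration only):
//
//     int32_t SignExtend10(uint32_t field)    // field in 0..0x3FF
//     {
//         static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 };
//         return (int32_t)(field | SignExtend[field >> 9]);
//     }
//
// e.g. 0x3FF -> 0xFFFFFFFF (-1) and 0x200 -> 0xFFFFFE00 (-512). The add
// below completes the equivalent bias fix-up on the SIMD path.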
- vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10); - // Normalize x, y and z to -1.0f..1.0f, and w to 0.0f..1.0f - vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10); - // Clamp result (for case of -512) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDec4 -( - const XMXDEC4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), - (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), - (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), - (float)(pSource->v >> 30) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; - static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - vInt = veorq_u32(vInt,XDec4Xor); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R, XDec4Add); - return vmulq_f32(R,g_XMMulDec4); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; - static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v)); - // Mask off x, y and z (10 bits each) and w (2 bits) - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // The fields are unsigned! Flip each sign bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,XDec4Xor); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x, y and z - 512 (scaled per lane), w + 0x80000000.f to undo the signed order.
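// Why XMLoadXDecN4 above ends with _mm_max_ps(vTemp, g_XMNegativeOne): a
// signed 10-bit field spans -512..511 while the normalizing scale divides
// by 511, so the lone value -512 would otherwise decode to -512/511, about
// -1.002. The clamp pins it to -1.0f, matching the scalar path's explicit
// (ElementX == 0x200) ? -1.f test, so both -512 and -511 decode to exactly
// -1.0f. XMLoadXDec4 here returns raw (unnormalized) values, so no clamp
// is needed; the add that follows just completes its sign bias fix-up.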
- vTemp = _mm_add_ps(vTemp,XDec4Add); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); - return vTemp; -#endif -} - -#pragma warning(pop) - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4 -( - const XMUDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)ElementX / 1023.0f, - (float)ElementY / 1023.0f, - (float)ElementZ / 1023.0f, - (float)(pSource->v >> 30) / 3.0f - }; - return vResult.v; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - float32x4_t R = vcvtq_f32_u32( vInt ); - return vmulq_f32(R,UDecN4Mul); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,UDecN4Mul); - return vTemp; -#endif -} - - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4_XR -( - const XMUDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - int32_t ElementX = pSource->v & 0x3FF; - int32_t ElementY = (pSource->v >> 10) & 0x3FF; - int32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)(ElementX - 0x180) / 510.0f, - (float)(ElementY - 0x180) / 510.0f, - (float)(ElementZ - 0x180) / 510.0f, - (float)(pSource->v >> 30) / 3.0f - }; - - return vResult.v; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 }; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - int32x4_t vTemp = vsubq_s32( vreinterpretq_s32_u32(vInt), XRBias ); - vTemp = veorq_u32( vTemp, g_XMFlipW ); - float32x4_t R = vcvtq_f32_s32( vTemp ); - R = vaddq_f32(R,g_XMAddUDec4); - return vmulq_f32(R,XRMul); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 }; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Mask channels - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // Subtract bias - vTemp = _mm_castsi128_ps( _mm_sub_epi32( 
_mm_castps_si128(vTemp), XRBias ) ); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Convert to 0.0f-1.0f - return _mm_mul_ps(vTemp,XRMul); -#endif -} - - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDec4 -( - const XMUDEC4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)ElementX, - (float)ElementY, - (float)ElementZ, - (float)(pSource->v >> 30) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - float32x4_t R = vcvtq_f32_u32( vInt ); - return vmulq_f32(R,g_XMMulDec4); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDecN4 -( - const XMDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - uint32_t ElementW = pSource->v >> 30; - - XMVECTORF32 vResult = { - (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), - (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), - (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), - (ElementW == 0x2) ? 
-1.f : ((float)(int16_t)(ElementW | SignExtendW[(ElementW >> 1) & 1])) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - vInt = veorq_u32(vInt,g_XMXorDec4); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R,g_XMAddDec4); - R = vmulq_f32(R,DecN4Mul); - return vmaxq_f32( R, vdupq_n_f32(-1.0f) ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddDec4); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,DecN4Mul); - // Clamp result (for case of -512/-1) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDec4 -( - const XMDEC4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - uint32_t ElementW = pSource->v >> 30; - - XMVECTORF32 vResult = { - (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), - (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), - (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), - (float)(int16_t)(ElementW | SignExtendW[ElementW >> 1]) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - vInt = veorq_u32(vInt,g_XMXorDec4); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R,g_XMAddDec4); - return vmulq_f32(R,g_XMMulDec4); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. 
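// Worked example for the DecN4 decode rules above (illustration only):
// take v = 0x400805FF, with x = v & 0x3FF, y = (v >> 10) & 0x3FF,
// z = (v >> 20) & 0x3FF, w = v >> 30:
//
//     x = 0x1FF = 511  ->  511/511                       =  1.0f
//     y = 0x201        ->  sign-extends to -511, -511/511 = -1.0f
//     z = 0x000        ->  0/511                          =  0.0f
//     w = 0x1          ->  1 (2-bit signed)               =  1.0f
//
// Note w is scaled by 1, not 3: the signed 2-bit field spans -2..1, as the
// scalar path's (ElementW == 0x2) ? -1.f clamp shows.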
-    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
-    return vTemp;
-#endif
-}
-
-#pragma warning(pop)
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN4
-(
-    const XMUBYTEN4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (float)pSource->x / 255.0f,
-        (float)pSource->y / 255.0f,
-        (float)pSource->z / 255.0f,
-        (float)pSource->w / 255.0f
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
-    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_n_f32( R, 1.0f/255.0f );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte4
-(
-    const XMUBYTE4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (float)pSource->x,
-        (float)pSource->y,
-        (float)pSource->z,
-        (float)pSource->w
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
-    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
-    return vcvtq_f32_u32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN4
-(
-    const XMBYTEN4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (pSource->x == -128) ? -1.f : ((float)pSource->x / 127.0f),
-        (pSource->y == -128) ? -1.f : ((float)pSource->y / 127.0f),
-        (pSource->z == -128) ? -1.f : ((float)pSource->z / 127.0f),
-        (pSource->w == -128) ? -1.f : ((float)pSource->w / 127.0f)
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
-    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
-    float32x4_t R = vcvtq_f32_s32(vInt);
-    R = vmulq_n_f32( R, 1.0f/127.0f );
-    return vmaxq_f32( R, vdupq_n_f32(-1.f) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
-    // Clamp result (for case of -128)
-    return _mm_max_ps( vTemp, g_XMNegativeOne );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte4
-(
-    const XMBYTE4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (float)pSource->x,
-        (float)pSource->y,
-        (float)pSource->z,
-        (float)pSource->w
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
-    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
-    return vcvtq_f32_s32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUNibble4
-(
-    const XMUNIBBLE4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        float(pSource->v & 0xF),
-        float((pSource->v >> 4) & 0xF),
-        float((pSource->v >> 8) & 0xF),
-        float((pSource->v >> 12) & 0xF)
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
-    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
-    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
-    uint32x4_t vInt = vmovl_u16( vInt16 );
-    vInt = vandq_u32(vInt,UNibble4And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R,UNibble4Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
-    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult,UNibble4And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult,UNibble4Mul);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU555
-(
-    const XMU555* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        float(pSource->v & 0x1F),
-        float((pSource->v >> 5) & 0x1F),
-        float((pSource->v >> 10) & 0x1F),
-        float((pSource->v >> 15) & 0x1)
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
-    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
-    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
-    uint32x4_t vInt = vmovl_u16( vInt16 );
-    vInt = vandq_u32(vInt,U555And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R,U555Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
-    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult,U555And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult,U555Mul);
-    return vResult;
-#endif
-}
-
-#pragma prefast(pop)
-
-/****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreColor
-(
-    XMCOLOR* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
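// XMCOLOR packs to 0xAARRGGBB: the scalar path below saturates each channel
// to [0, 1], scales to [0, 255] with rounding, then shifts w (alpha) into the
// top byte ahead of x, y, z (red, green, blue).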
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiply(N, g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->c = ((uint32_t)tmp.w << 24) |
-                      ((uint32_t)tmp.x << 16) |
-                      ((uint32_t)tmp.y << 8) |
-                      ((uint32_t)tmp.z);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 255.0f );
-    R = XMVectorRound(R);
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
-    uint32_t rgba = vget_lane_u32( vreinterpret_u32_u8(vInt8), 0 );
-    pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Set <0 to 0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Set >1 to 1
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    // Convert to 0-255
-    vResult = _mm_mul_ps(vResult,g_UByteMax);
-    // Shuffle RGBA to ARGB
-    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
-    // Convert to int
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Mash to shorts
-    vInt = _mm_packs_epi32(vInt,vInt);
-    // Mash to bytes
-    vInt = _mm_packus_epi16(vInt,vInt);
-    // Store the color
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->c),_mm_castsi128_ps(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreHalf2
-(
-    XMHALF2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
-#else
-    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
-    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
-#endif // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShortN2
-(
-    XMSHORTN2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 32767.0f );
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_ShortMax);
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    vResulti = _mm_packs_epi32(vResulti,vResulti);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShort2
-(
-    XMSHORT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f) );
-    R = vminq_f32(R, vdupq_n_f32(32767.0f));
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
-    vResult = _mm_min_ps(vResult,g_ShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Pack the ints into shorts
-    vInt = _mm_packs_epi32(vInt,vInt);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShortN2
-(
-    XMUSHORTN2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 65535.0f );
-    R = vaddq_f32( R, g_XMOneHalf );
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_UShortMax);
-    vResult = _mm_add_ps(vResult,g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShort2
-(
-    XMUSHORT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
-    R = vminq_f32(R, vdupq_n_f32(65535.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_UShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreByteN2
-(
-    XMBYTEN2* pDestination,
FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); - N = XMVectorMultiply(N, g_ByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (int8_t)tmp.x; - pDestination->y = (int8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) ); - R = vminq_f32(R, vdupq_n_f32(1.0f)); - R = vmulq_n_f32( R, 127.0f ); - int32x4_t vInt32 = vcvtq_s32_f32(R); - int16x4_t vInt16 = vqmovn_s32( vInt32 ); - int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_s8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,g_ByteMax); - // Convert to int by rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreByte2 -( - XMBYTE2* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (int8_t)tmp.x; - pDestination->y = (int8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) ); - R = vminq_f32(R, vdupq_n_f32(127.0f)); - int32x4_t vInt32 = vcvtq_s32_f32(R); - int16x4_t vInt16 = vqmovn_s32( vInt32 ); - int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_s8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_ByteMin); - vResult = _mm_min_ps(vResult,g_ByteMax); - // Convert to int by rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUByteN2 -( - XMUBYTEN2* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorSaturate(V); - N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v); - N = XMVectorTruncate(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (uint8_t)tmp.x; - pDestination->y = (uint8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); - R = vminq_f32(R, vdupq_n_f32(1.0f)); - R = vmulq_n_f32( R, 255.0f ); - R = vaddq_f32( R, g_XMOneHalf ); - uint32x4_t vInt32 = vcvtq_u32_f32(R); - uint16x4_t vInt16 = vqmovn_u32( vInt32 ); - uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_u8(vInt8), 0 ); -#elif 
defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,g_UByteMax); - vResult = _mm_add_ps(vResult,g_XMOneHalf); - // Convert to int - __m128i vInt = _mm_cvttps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUByte2 -( - XMUBYTE2* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (uint8_t)tmp.x; - pDestination->y = (uint8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); - R = vminq_f32(R, vdupq_n_f32(255.0f)); - uint32x4_t vInt32 = vcvtq_u32_f32(R); - uint16x4_t vInt16 = vqmovn_u32( vInt32 ); - uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_u8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_UByteMax); - // Convert to int by rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreU565 -( - XMU565* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) | - (((uint16_t)tmp.y & 0x3F) << 5) | - (((uint16_t)tmp.x & 0x1F)); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; - static const XMVECTORF32 Scale = {1.0f,32.f,32.f*64.f, 0.f }; - static const XMVECTORU32 Mask = {0x1F,0x3F<<5,0x1F<<11,0}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); - vResult = vminq_f32(vResult,Max); - vResult = vmulq_f32(vResult,Scale); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,Mask); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vhi = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; - // Bounds check - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,Max); - // Convert to int with rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually 
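// After _mm_cvtps_epi32 each component occupies a full 32-bit lane, so its
// low 16 bits land in the even 16-bit lanes; that is why the extracts below
// read lanes 0, 2, and 4 rather than 0, 1, and 2.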
- uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - uint16_t z = static_cast(_mm_extract_epi16(vInt,4)); - pDestination->v = ((z & 0x1F) << 11) | - ((y & 0x3F) << 5) | - ((x & 0x1F)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreFloat3PK -( - XMFLOAT3PK* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); - - __declspec(align(16)) uint32_t IValue[4]; - XMStoreFloat3A( reinterpret_cast(&IValue), V ); - - uint32_t Result[3]; - - // X & Y Channels (5-bit exponent, 6-bit mantissa) - for(uint32_t j=0; j < 2; ++j) - { - uint32_t Sign = IValue[j] & 0x80000000; - uint32_t I = IValue[j] & 0x7FFFFFFF; - - if ((I & 0x7F800000) == 0x7F800000) - { - // INF or NAN - Result[j] = 0x7c0; - if (( I & 0x7FFFFF ) != 0) - { - Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f); - } - else if ( Sign ) - { - // -INF is clamped to 0 since 3PK is positive only - Result[j] = 0; - } - } - else if ( Sign ) - { - // 3PK is positive only, so clamp to zero - Result[j] = 0; - } - else if (I > 0x477E0000U) - { - // The number is too large to be represented as a float11, set to max - Result[j] = 0x7BF; - } - else - { - if (I < 0x38800000U) - { - // The number is too small to be represented as a normalized float11 - // Convert it to a denormalized value. - uint32_t Shift = 113U - (I >> 23U); - I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized float11 - I += 0xC8000000U; - } - - Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU; - } - } - - // Z Channel (5-bit exponent, 5-bit mantissa) - uint32_t Sign = IValue[2] & 0x80000000; - uint32_t I = IValue[2] & 0x7FFFFFFF; - - if ((I & 0x7F800000) == 0x7F800000) - { - // INF or NAN - Result[2] = 0x3e0; - if ( I & 0x7FFFFF ) - { - Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f); - } - else if ( Sign ) - { - // -INF is clamped to 0 since 3PK is positive only - Result[2] = 0; - } - } - else if ( Sign ) - { - // 3PK is positive only, so clamp to zero - Result[2] = 0; - } - else if (I > 0x477C0000U) - { - // The number is too large to be represented as a float10, set to max - Result[2] = 0x3df; - } - else - { - if (I < 0x38800000U) - { - // The number is too small to be represented as a normalized float10 - // Convert it to a denormalized value. - uint32_t Shift = 113U - (I >> 23U); - I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized float10 - I += 0xC8000000U; - } - - Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU; - } - - // Pack Result into memory - pDestination->v = (Result[0] & 0x7ff) - | ( (Result[1] & 0x7ff) << 11 ) - | ( (Result[2] & 0x3ff) << 22 ); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreFloat3SE -( - XMFLOAT3SE* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); - - XMFLOAT3A tmp; - XMStoreFloat3A( &tmp, V ); - - static const float maxf9 = float(0x1FF << 7); - static const float minf9 = float(1.f / (1 << 16)); - - float x = (tmp.x >= 0.f) ? ( (tmp.x > maxf9) ? maxf9 : tmp.x ) : 0.f; - float y = (tmp.y >= 0.f) ? ( (tmp.y > maxf9) ? maxf9 : tmp.y ) : 0.f; - float z = (tmp.z >= 0.f) ? ( (tmp.z > maxf9) ? maxf9 : tmp.z ) : 0.f; - - const float max_xy = (x > y) ? 
x : y;
-    const float max_xyz = (max_xy > z) ? max_xy : z;
-
-    const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
-
-    union { float f; int32_t i; } fi;
-    fi.f = maxColor;
-    fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
-
-    uint32_t exp = fi.i >> 23;
-    pDestination->e = exp - 0x6f;
-
-    fi.i = 0x83000000 - (exp << 23);
-    float ScaleR = fi.f;
-
-#ifdef _XM_NO_ROUNDF_
-    pDestination->xm = static_cast<uint32_t>( Internal::round_to_nearest(x * ScaleR) );
-    pDestination->ym = static_cast<uint32_t>( Internal::round_to_nearest(y * ScaleR) );
-    pDestination->zm = static_cast<uint32_t>( Internal::round_to_nearest(z * ScaleR) );
-#else
-    pDestination->xm = static_cast<uint32_t>( lroundf(x * ScaleR) );
-    pDestination->ym = static_cast<uint32_t>( lroundf(y * ScaleR) );
-    pDestination->zm = static_cast<uint32_t>( lroundf(z * ScaleR) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreHalf4
-(
-    XMHALF4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
-#else
-    XMFLOAT4A t;
-    XMStoreFloat4A(&t, V );
-
-    pDestination->x = XMConvertFloatToHalf(t.x);
-    pDestination->y = XMConvertFloatToHalf(t.y);
-    pDestination->z = XMConvertFloatToHalf(t.z);
-    pDestination->w = XMConvertFloatToHalf(t.w);
-#endif // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShortN4
-(
-    XMSHORTN4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-    pDestination->z = (int16_t)tmp.z;
-    pDestination->w = (int16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(-1.f) );
-    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
-    vResult = vmulq_n_f32( vResult, 32767.0f );
-    int32x4_t vResulti = vcvtq_s32_f32( vResult );
-    int16x4_t vInt = vmovn_s32( vResulti );
-    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_ShortMax);
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    vResulti = _mm_packs_epi32(vResulti,vResulti);
-    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShort4
-(
-    XMSHORT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-    pDestination->z = (int16_t)tmp.z;
-    pDestination->w = (int16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, g_ShortMin );
-    vResult = vminq_f32( vResult, g_ShortMax );
-    int32x4_t vResulti = vcvtq_s32_f32( vResult );
-    int16x4_t vInt = vmovn_s32( vResulti );
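// vmovn_s32 narrows by truncating each 32-bit lane to its low 16 bits; that
// is safe here only because the input was clamped to [-32767, 32767] above,
// so every lane already fits in an int16_t.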
-    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
-    vResult = _mm_min_ps(vResult,g_ShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Pack the ints into shorts
-    vInt = _mm_packs_epi32(vInt,vInt);
-    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShortN4
-(
-    XMUSHORTN4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-    pDestination->z = (uint16_t)tmp.z;
-    pDestination->w = (uint16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
-    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
-    vResult = vmulq_n_f32( vResult, 65535.0f );
-    vResult = vaddq_f32( vResult, g_XMOneHalf );
-    uint32x4_t vResulti = vcvtq_u32_f32( vResult );
-    uint16x4_t vInt = vmovn_u32( vResulti );
-    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_UShortMax);
-    vResult = _mm_add_ps(vResult,g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
-    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShort4
-(
-    XMUSHORT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-    pDestination->z = (uint16_t)tmp.z;
-    pDestination->w = (uint16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
-    vResult = vminq_f32( vResult, g_UShortMax );
-    uint32x4_t vResulti = vcvtq_u32_f32( vResult );
-    uint16x4_t vInt = vmovn_u32( vResulti );
-    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_UShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
-    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
-#endif
-}
-
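// A minimal round-trip sketch for the 16-bit stores above (illustrative
// values; assumes the matching XMLoadUShort4 declared earlier in this
// header):
//
//     using namespace DirectX;
//     using namespace DirectX::PackedVector;
//
//     XMUSHORT4 packed;
//     XMStoreUShort4(&packed, XMVectorSet(12.f, 345.f, 6789.f, 65535.f));
//     XMVECTOR v = XMLoadUShort4(&packed);  // (12, 345, 6789, 65535)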
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreXDecN4 -( - XMXDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; - static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f}; - - XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); - N = XMVectorMultiply(N, Scale.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) << 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; - static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; - static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; - float32x4_t vResult = vmaxq_f32(V,Min); - vResult = vminq_f32(vResult,vdupq_n_f32(1.0f)); - vResult = vmulq_f32(vResult,Scale); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = vandq_s32(vResulti,ScaleMask); - int32x4_t vResultw = vandq_s32(vResulti,g_XMMaskW); - vResulti = vaddq_s32(vResulti,vResultw); - // Do a horizontal or of all 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; - static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; - static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; - XMVECTOR vResult = _mm_max_ps(V,Min); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,Scale); - // Convert to int (W is unsigned) - __m128i vResulti = _mm_cvtps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,ScaleMask); - // To fix W, add itself to shift it up to <<30 instead of <<29 - __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); - vResulti = _mm_add_epi32(vResulti,vResultw); - // Do a horizontal or of all 4 entries - vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); - vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); - vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); - vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreXDec4 -( - XMXDEC4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, 0.0f}; - static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 3.0f}; - - XMVECTOR N = XMVectorClamp(V, Min, Max); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) 
<< 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; - static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; - static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - float32x4_t vResult = vmaxq_f32(V,MinXDec4); - vResult = vminq_f32(vResult,MaxXDec4); - vResult = vmulq_f32(vResult,ScaleXDec4); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = vandq_s32(vResulti,MaskXDec4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vTemp2 = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_s32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; - static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; - static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,MinXDec4); - vResult = _mm_min_ps(vResult,MaxXDec4); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleXDec4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskXDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a single bit left shift on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -#pragma warning(pop) - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUDecN4 -( - XMUDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; - - XMVECTOR N = XMVectorSaturate(V); - N = XMVectorMultiply(N, Scale.v); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((uint32_t)tmp.z & 0x3FF) << 20) | - (((uint32_t)tmp.y & 0x3FF) << 10) | - (((uint32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); - vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); - vResult = vmulq_f32(vResult,ScaleUDecN4); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, 
vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_u32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleUDecN4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a left shift by one bit on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUDecN4_XR -( - XMUDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; - static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; - static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; - - XMVECTOR N = XMVectorMultiplyAdd( V, Scale, Bias ); - N = XMVectorClamp( N, g_XMZero, C ); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) - | (((uint32_t)tmp.z & 0x3FF) << 20) - | (((uint32_t)tmp.y & 0x3FF) << 10) - | (((uint32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; - static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; - static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; - float32x4_t vResult = vmlaq_f32( Bias, V, Scale ); - vResult = vmaxq_f32(vResult,vdupq_n_f32(0.f)); - vResult = vminq_f32(vResult,C); - vResult = vmulq_f32(vResult,Shift); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_u32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; - static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; - static const XMVECTORF32 C = { 1023.f, 
1023.f, 1023.f, 3.f }; - // Scale & bias - XMVECTOR vResult = _mm_mul_ps( V, Scale ); - vResult = _mm_add_ps( vResult, Bias ); - // Clamp to bounds - vResult = _mm_max_ps(vResult,g_XMZero); - vResult = _mm_min_ps(vResult,C); - // Scale by shift values - vResult = _mm_mul_ps(vResult,Shift); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a left shift by one bit on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUDec4 -( - XMUDEC4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Max = {1023.0f, 1023.0f, 1023.0f, 3.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((uint32_t)tmp.z & 0x3FF) << 20) | - (((uint32_t)tmp.y & 0x3FF) << 10) | - (((uint32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; - static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); - vResult = vminq_f32(vResult,MaxUDec4); - vResult = vmulq_f32(vResult,ScaleUDec4); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,MaskUDec4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_u32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; - static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,MaxUDec4); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleUDec4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a left shift by one bit on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - 
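// Note that y and w were scaled and masked one bit low on purpose (see Shift
// and MaskUDecN4 above); the _mm_add_epi32 doubles that lane to move them
// into their final bit positions before the OR collects x|y|z|w in lane 0.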
_mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreDecN4 -( - XMDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f}; - - XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); - N = XMVectorMultiply(N, Scale.v); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((int32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) << 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(-1.f)); - vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); - vResult = vmulq_f32(vResult,ScaleDecN4); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = vandq_s32(vResulti,g_XMMaskDec4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleDecN4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,g_XMMaskDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreDec4 -( - XMDEC4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, -1.0f}; - static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 1.0f}; - - XMVECTOR N = XMVectorClamp(V, Min, Max); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((int32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) << 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; - static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; - static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; - float32x4_t vResult = vmaxq_f32(V,MinDec4); - vResult = vminq_f32(vResult,MaxDec4); - vResult = vmulq_f32(vResult,ScaleDec4); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = 
vandq_s32(vResulti,g_XMMaskDec4); - // Do a horizontal or of all 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; - static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; - static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,MinDec4); - vResult = _mm_min_ps(vResult,MaxDec4); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleDec4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,g_XMMaskDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -#pragma warning(pop) - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUByteN4 -( - XMUBYTEN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorSaturate(V); - N = XMVectorMultiply(N, g_UByteMax); - N = XMVectorTruncate(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->x = (uint8_t)tmp.x; - pDestination->y = (uint8_t)tmp.y; - pDestination->z = (uint8_t)tmp.z; - pDestination->w = (uint8_t)tmp.w; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) ); - R = vminq_f32(R, vdupq_n_f32(1.0f)); - R = vmulq_n_f32( R, 255.0f ); - uint32x4_t vInt32 = vcvtq_u32_f32(R); - uint16x4_t vInt16 = vqmovn_u32( vInt32 ); - uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); - vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f}; - static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleUByteN4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUByteN4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a single bit left shift to fix y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV 
PackedVector::XMStoreUByte4
-(
-    XMUBYTE4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (uint8_t)tmp.x;
-    pDestination->y = (uint8_t)tmp.y;
-    pDestination->z = (uint8_t)tmp.z;
-    pDestination->w = (uint8_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
-    R = vminq_f32(R, vdupq_n_f32(255.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
-    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_UByteMax);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult,ScaleUByte4);
-    // Convert to int by rounding
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti,MaskUByte4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti,vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
-    // Perform a single bit left shift to fix y|w
-    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti,vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreByteN4
-(
-    XMBYTEN4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ByteMax);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (int8_t)tmp.x;
-    pDestination->y = (int8_t)tmp.y;
-    pDestination->z = (int8_t)tmp.z;
-    pDestination->w = (int8_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 127.0f );
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32( vInt32 );
-    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
-    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult,ScaleByteN4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti,MaskByteN4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti,vResulti2);
-    // Move Z
to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreByte4 -( - XMBYTE4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->x = (int8_t)tmp.x; - pDestination->y = (int8_t)tmp.y; - pDestination->z = (int8_t)tmp.z; - pDestination->w = (int8_t)tmp.w; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) ); - R = vminq_f32(R, vdupq_n_f32(127.f)); - int32x4_t vInt32 = vcvtq_s32_f32(R); - int16x4_t vInt16 = vqmovn_s32( vInt32 ); - int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); - vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f}; - static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_ByteMin); - vResult = _mm_min_ps(vResult,g_ByteMax); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleByte4); - // Convert to int by rounding - __m128i vResulti = _mm_cvtps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskByte4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUNibble4 -( - XMUNIBBLE4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) | - (((uint16_t)tmp.z & 0xF) << 8) | - (((uint16_t)tmp.y & 0xF) << 4) | - (((uint16_t)tmp.x & 0xF)); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; - static const XMVECTORF32 Scale = {1.0f,16.f,16.f*16.f,16.f*16.f*16.f}; - static const XMVECTORU32 Mask = {0xF,0xF<<4,0xF<<8,0xF<<12}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); - vResult = vminq_f32(vResult,Max); - vResult = vmulq_f32(vResult,Scale); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,Mask); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vhi = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; - // Bounds check - XMVECTOR vResult = 
_mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,Max); - // Convert to int with rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - uint16_t z = static_cast(_mm_extract_epi16(vInt,4)); - uint16_t w = static_cast(_mm_extract_epi16(vInt,6)); - pDestination->v = ((w & 0xF) << 12) | - ((z & 0xF) << 8) | - ((y & 0xF) << 4) | - ((x & 0xF)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreU555 -( - XMU555* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((tmp.w > 0.f) ? 0x8000 : 0) | - (((uint16_t)tmp.z & 0x1F) << 10) | - (((uint16_t)tmp.y & 0x1F) << 5) | - (((uint16_t)tmp.x & 0x1F)); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; - static const XMVECTORF32 Scale = {1.0f,32.f/2.f,32.f*32.f,32.f*32.f*32.f/2.f}; - static const XMVECTORU32 Mask = {0x1F,0x1F<<(5-1),0x1F<<10,0x1<<(15-1)}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); - vResult = vminq_f32(vResult,Max); - vResult = vmulq_f32(vResult,Scale); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,Mask); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_s32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; - // Bounds check - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,Max); - // Convert to int with rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - uint16_t z = static_cast(_mm_extract_epi16(vInt,4)); - uint16_t w = static_cast(_mm_extract_epi16(vInt,6)); - pDestination->v = ((w) ? 
0x8000 : 0) | - ((z & 0x1F) << 10) | - ((y & 0x1F) << 5) | - ((x & 0x1F)); -#endif -} - - -/**************************************************************************** - * - * XMCOLOR operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMCOLOR::XMCOLOR -( - float _r, - float _g, - float _b, - float _a -) -{ - XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMCOLOR::XMCOLOR -( - const float* pArray -) -{ - XMStoreColor(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMHALF2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMHALF2::XMHALF2 -( - float _x, - float _y -) -{ - x = XMConvertFloatToHalf(_x); - y = XMConvertFloatToHalf(_y); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMHALF2::XMHALF2 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - x = XMConvertFloatToHalf(pArray[0]); - y = XMConvertFloatToHalf(pArray[1]); -} - -/**************************************************************************** - * - * XMSHORTN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORTN2::XMSHORTN2 -( - float _x, - float _y -) -{ - XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORTN2::XMSHORTN2 -( - const float* pArray -) -{ - XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMSHORT2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORT2::XMSHORT2 -( - float _x, - float _y -) -{ - XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORT2::XMSHORT2 -( - const float* pArray -) -{ - XMStoreShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORTN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUSHORTN2::XMUSHORTN2 -( - float _x, - float _y -) -{ - XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORTN2::XMUSHORTN2 -( - const float* pArray -) -{ - XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORT2 
operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUSHORT2::XMUSHORT2 -( - float _x, - float _y -) -{ - XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORT2::XMUSHORT2 -( - const float* pArray -) -{ - XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTEN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTEN2::XMBYTEN2 -( - float _x, - float _y -) -{ - XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTEN2::XMBYTEN2 -( - const float* pArray -) -{ - XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTE2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTE2::XMBYTE2 -( - float _x, - float _y -) -{ - XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTE2::XMBYTE2 -( - const float* pArray -) -{ - XMStoreByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTEN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTEN2::XMUBYTEN2 -( - float _x, - float _y -) -{ - XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUBYTEN2::XMUBYTEN2 -( - const float* pArray -) -{ - XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTE2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTE2::XMUBYTE2 -( - float _x, - float _y -) -{ - XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUBYTE2::XMUBYTE2 -( - const float* pArray -) -{ - XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMU565 operators - * - ****************************************************************************/ - -inline PackedVector::XMU565::XMU565 -( - float _x, - float _y, - float _z -) -{ - XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f )); -} - -_Use_decl_annotations_ -inline 
PackedVector::XMU565::XMU565 -( - const float *pArray -) -{ - XMStoreU565(this, XMLoadFloat3(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMFLOAT3PK operators - * - ****************************************************************************/ - -inline PackedVector::XMFLOAT3PK::XMFLOAT3PK -( - float _x, - float _y, - float _z -) -{ - XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f )); -} - -_Use_decl_annotations_ -inline PackedVector::XMFLOAT3PK::XMFLOAT3PK -( - const float *pArray -) -{ - XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMFLOAT3SE operators - * - ****************************************************************************/ - -inline PackedVector::XMFLOAT3SE::XMFLOAT3SE -( - float _x, - float _y, - float _z -) -{ - XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f )); -} - -_Use_decl_annotations_ -inline PackedVector::XMFLOAT3SE::XMFLOAT3SE -( - const float *pArray -) -{ - XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMHALF4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMHALF4::XMHALF4 -( - float _x, - float _y, - float _z, - float _w -) -{ - x = XMConvertFloatToHalf(_x); - y = XMConvertFloatToHalf(_y); - z = XMConvertFloatToHalf(_z); - w = XMConvertFloatToHalf(_w); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline PackedVector::XMHALF4::XMHALF4 -( - const float* pArray -) -{ - XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); -} - -/**************************************************************************** - * - * XMSHORTN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORTN4::XMSHORTN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORTN4::XMSHORTN4 -( - const float* pArray -) -{ - XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMSHORT4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORT4::XMSHORT4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORT4::XMSHORT4 -( - const float* pArray -) -{ - XMStoreShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORTN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - 
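These packed-type constructors simply forward to the corresponding XMStore* routine, so construction applies the same clamp, scale, and round as the store path. A minimal round-trip sketch (an illustration, not part of the source; assumes DirectXPackedVector.h is included, and the reloaded values shown are approximate):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    XMUSHORTN4 packed(0.5f, 0.25f, 1.0f, 0.0f); // each component stored as round(v * 65535)
    XMVECTOR v = XMLoadUShortN4(&packed);       // x reloads as 32768/65535 ~= 0.500008f
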
-inline PackedVector::XMUSHORTN4::XMUSHORTN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORTN4::XMUSHORTN4 -( - const float* pArray -) -{ - XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORT4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUSHORT4::XMUSHORT4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORT4::XMUSHORT4 -( - const float* pArray -) -{ - XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMXDECN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMXDECN4::XMXDECN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMXDECN4::XMXDECN4 -( - const float* pArray -) -{ - XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMXDEC4 operators - * - ****************************************************************************/ - -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -//------------------------------------------------------------------------------ - -inline PackedVector::XMXDEC4::XMXDEC4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMXDEC4::XMXDEC4 -( - const float* pArray -) -{ - XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMDECN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMDECN4::XMDECN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMDECN4::XMDECN4 -( - const float* pArray -) -{ - XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMDEC4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMDEC4::XMDEC4 -( - float _x, - float _y, - float _z, 
- float _w -) -{ - XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMDEC4::XMDEC4 -( - const float* pArray -) -{ - XMStoreDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -#pragma warning(pop) - -/**************************************************************************** - * - * XMUDECN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUDECN4::XMUDECN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUDECN4::XMUDECN4 -( - const float* pArray -) -{ - XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUDEC4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUDEC4::XMUDEC4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUDEC4::XMUDEC4 -( - const float* pArray -) -{ - XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTEN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTEN4::XMBYTEN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTEN4::XMBYTEN4 -( - const float* pArray -) -{ - XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTE4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTE4::XMBYTE4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTE4::XMBYTE4 -( - const float* pArray -) -{ - XMStoreByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTEN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTEN4::XMUBYTEN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ 
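For the signed-normalized byte formats the scale is 127 rather than 255, and the matching loads map both -127 and -128 back to -1.0f, so the endpoints survive a round trip exactly. A minimal sketch (illustrative only; same header assumption as above, values approximate):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    XMBYTEN4 sn(-1.0f, 1.0f, 0.5f, 0.0f);  // stored as -127, 127, 64, 0
    XMVECTOR a = XMLoadByteN4(&sn);        // reloads as -1.0f, 1.0f, ~0.50394f, 0.0f

    XMUBYTEN4 un(1.0f, 0.5f, 0.25f, 0.0f); // stored as 255, 128, 64, 0
    XMVECTOR b = XMLoadUByteN4(&un);       // reloads as 1.0f, ~0.50196f, ~0.25098f, 0.0f
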
-_Use_decl_annotations_ -inline PackedVector::XMUBYTEN4::XMUBYTEN4 -( - const float* pArray -) -{ - XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTE4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTE4::XMUBYTE4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUBYTE4::XMUBYTE4 -( - const float* pArray -) -{ - XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUNIBBLE4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w )); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 -( - const float *pArray -) -{ - XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMU555 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMU555::XMU555 -( - float _x, - float _y, - float _z, - bool _w -) -{ - XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) )); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMU555::XMU555 -( - const float *pArray, - bool _w -) -{ - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pArray)); - XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) )); -} - - +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline float PackedVector::XMConvertHalfToFloat
+(
+    HALF Value
+)
+{
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
+    __m128 V2 = _mm_cvtph_ps( V1 );
+    return _mm_cvtss_f32( V2 );
+#else
+    uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
+
+    uint32_t Exponent = (Value & 0x7C00);
+    if ( Exponent == 0x7C00 ) // INF/NAN
+    {
+        Exponent = (uint32_t)0x8f;
+    }
+    else if (Exponent != 0) // The value is normalized
+    {
+        Exponent = (uint32_t)((Value >> 10) & 0x1F);
+    }
+    else if (Mantissa != 0) // The value is denormalized
+    {
+        // Normalize the value in the resulting float
+        Exponent = 1;
+
+        do
+        {
+            Exponent--;
+            Mantissa <<= 1;
+        } while ((Mantissa & 0x0400) == 0);
+
+        Mantissa &= 0x03FF;
+    }
+    else // The value is zero
+    {
+        Exponent = (uint32_t)-112;
+    }
+
+    uint32_t Result = ((Value & 0x8000) << 16) | // Sign
+                      ((Exponent + 112) << 23) | // Exponent
+                      (Mantissa << 13);          // Mantissa
+
+    return reinterpret_cast<float*>(&Result)[0];
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+
+_Use_decl_annotations_
+inline float* PackedVector::XMConvertHalfToFloatStream
+(
+    float* pOutputStream,
+    size_t OutputStride,
+    const HALF* pInputStream,
+    size_t InputStride,
+    size_t HalfCount
+)
+{
+    assert(pOutputStream);
+    assert(pInputStream);
+
+    assert(InputStride >= sizeof(HALF));
+    _Analysis_assume_(InputStride >= sizeof(HALF));
+
+    assert(OutputStride >= sizeof(float));
+    _Analysis_assume_(OutputStride >= sizeof(float));
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = HalfCount >> 2;
+    if ( four > 0 )
+    {
+        if (InputStride == sizeof(HALF))
+        {
+            if (OutputStride == sizeof(float))
+            {
+                if ( ((uintptr_t)pFloat & 0xF) == 0)
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
+                        pHalf += InputStride*4;
+
+                        __m128 FV = _mm_cvtph_ps( HV );
+
+                        XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
+                        pFloat += OutputStride*4;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
+                        pHalf += InputStride*4;
+
+                        __m128 FV = _mm_cvtph_ps( HV );
+
+                        _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
+                        pFloat += OutputStride*4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, scattered output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
+                    pHalf += InputStride*4;
+
+                    __m128 FV = _mm_cvtph_ps( HV );
+
+                    _mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
+                    pFloat += OutputStride;
+                    i += 4;
+                }
+            }
+        }
+        else if (OutputStride == sizeof(float))
+        {
+            if ( ((uintptr_t)pFloat & 0xF) == 0)
+            {
+                // Scattered input, aligned & packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+
+                    __m128i HV = _mm_setzero_si128();
+                    HV = _mm_insert_epi16( HV, H1, 0 );
+                    HV = _mm_insert_epi16( HV, H2, 1 );
+                    HV = _mm_insert_epi16( HV, H3, 2 );
+                    HV = _mm_insert_epi16( HV, H4, 3 );
+                    __m128 FV = _mm_cvtph_ps( HV );
+
+                    XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
+                    pFloat += OutputStride*4;
+                    i += 4;
+                }
+            }
+            else
+            {
+                // Scattered input, packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+
+                    __m128i HV = _mm_setzero_si128();
+                    HV = _mm_insert_epi16( HV, H1, 0 );
+                    HV = _mm_insert_epi16( HV, H2, 1 );
+                    HV = _mm_insert_epi16( HV, H3, 2 );
+                    HV = _mm_insert_epi16( HV, H4, 3 );
+                    __m128 FV = _mm_cvtph_ps( HV );
+
+                    _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
+                    pFloat += OutputStride*4;
+                    i += 4;
+                }
+            }
+        }
+        else
+        {
+            // Scattered input, scattered output
+            for (size_t j = 0; j < four; ++j)
+            {
+                uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+
+                __m128i HV = _mm_setzero_si128();
+                HV = _mm_insert_epi16(HV, H1, 0);
+                HV = _mm_insert_epi16(HV, H2, 1);
+                HV = _mm_insert_epi16(HV, H3, 2);
+                HV = _mm_insert_epi16(HV, H4, 3);
+                __m128 FV = _mm_cvtph_ps(HV);
+
+                _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
+                pFloat += OutputStride;
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < HalfCount; ++i)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#else
+    const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    for (size_t i = 0; i < HalfCount; i++)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    return pOutputStream;
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
+(
+    float Value
+)
+{
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128 V1 = _mm_set_ss( Value );
+    __m128i V2 = _mm_cvtps_ph( V1, 0 );
+    return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
+#else
+    uint32_t Result;
+
+    uint32_t IValue = reinterpret_cast<uint32_t*>(&Value)[0];
+    uint32_t Sign = (IValue & 0x80000000U) >> 16U;
+    IValue = IValue & 0x7FFFFFFFU; // Hack off the sign
+
+    if (IValue > 0x477FE000U)
+    {
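+        // 0x477FE000 is the bit pattern of 65504.0f, the largest finite half-precision
+        // value; anything strictly greater is saturated to INF (or mapped to NAN below).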
+        // The number is too large to be represented as a half. Saturate to infinity.
+        if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF ) != 0))
+        {
+            Result = 0x7FFF; // NAN
+        }
+        else
+        {
+            Result = 0x7C00U; // INF
+        }
+    }
+    else
+    {
+        if (IValue < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized half.
+            // Convert it to a denormalized value.
+            uint32_t Shift = 113U - (IValue >> 23U);
+            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized half.
+            IValue += 0xC8000000U;
+        }
+
+        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
+    }
+    return (HALF)(Result|Sign);
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
+(
+    HALF* pOutputStream,
+    size_t OutputStride,
+    const float* pInputStream,
+    size_t InputStride,
+    size_t FloatCount
+)
+{
+    assert(pOutputStream);
+    assert(pInputStream);
+
+    assert(InputStride >= sizeof(float));
+    _Analysis_assume_(InputStride >= sizeof(float));
+
+    assert(OutputStride >= sizeof(HALF));
+    _Analysis_assume_(OutputStride >= sizeof(HALF));
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
+    uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = FloatCount >> 2;
+    if (four > 0)
+    {
+        if (InputStride == sizeof(float))
+        {
+            if (OutputStride == sizeof(HALF))
+            {
+                if ( ((uintptr_t)pFloat & 0xF) == 0)
+                {
+                    // Aligned and packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
+                        pHalf += OutputStride*4;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
+                        pHalf += OutputStride*4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                if ( ((uintptr_t)pFloat & 0xF) == 0)
+                {
+                    // Aligned & packed input, scattered output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
+                        pHalf += OutputStride;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, scattered output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
+                        pHalf +=
OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#else + const uint8_t* pFloat = reinterpret_cast(pInputStream); + uint8_t* pHalf = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +#pragma prefast(pop) + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadColor +( + const XMCOLOR* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. 
Keep in int32_t + int32_t iColor = (int32_t)(pSource->c); + XMVECTORF32 vColor = { + (float)((iColor >> 16) & 0xFF) * (1.0f/255.0f), + (float)((iColor >> 8) & 0xFF) * (1.0f/255.0f), + (float)(iColor & 0xFF) * (1.0f/255.0f), + (float)((iColor >> 24) & 0xFF) * (1.0f/255.0f) + }; + return vColor.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t bgra = pSource->c; + uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); + uint32x2_t vInt8 = vdup_n_u32(rgba); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(pSource->c); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2 +( + const XMHALF2* pSource +) +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +#else + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN2 +( + const XMSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/32767.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort2 +( + const XMSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp,g_XMFixupY16); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN2 +( + const XMUSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x / 65535.0f, + (float)pSource->y / 65535.0f, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_u32(vInt); + R = vmulq_n_f32( R, 1.0f/65535.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; + static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,FixupY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort2 +( + const XMUSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,FixaddY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN2 +( + const XMBYTEN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -128) ? -1.f : ((float)pSource->x * (1.0f/127.0f)), + (pSource->y == -128) ? -1.f : ((float)pSource->y * (1.0f/127.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16( vInt16 ) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/127.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f/127.0f,1.0f/(127.0f*256.0f),0,0}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,Scale); + // Clamp result (for case of -128) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte2 +( + const XMBYTE2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN2 +( + const XMUBYTEN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x * (1.0f/255.0f), + (float)pSource->y * (1.0f/255.0f), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f/255.0f,1.0f/(255.0f*256.0f),0,0}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte2 +( + const XMUBYTE2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,0,0}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU565 +( + const XMU565* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; + static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; + uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vInt16 ); + vInt = vandq_u32(vInt,U565And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R,U565Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; + static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U565Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3PK +( + const XMFLOAT3PK* pSource +) +{ + assert(pSource); + + __declspec(align(16)) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit mantissa) + Mantissa = pSource->xm; + + if ( pSource->xe == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 17); + } + else + { + if ( pSource->xe != 0 ) // The value is normalized + { + Exponent = pSource->xe; 
+ } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if ( pSource->ye == 0x1f ) // INF or NAN + { + Result[1] = 0x7f800000 | (pSource->ym << 17); + } + else + { + if ( pSource->ye != 0 ) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if ( pSource->ze == 0x1f ) // INF or NAN + { + Result[2] = 0x7f800000 | (pSource->zm << 17); + } + else + { + if ( pSource->ze != 0 ) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A( reinterpret_cast(&Result) ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3SE +( + const XMFLOAT3SE* pSource +) +{ + assert(pSource); + + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (pSource->e << 23); + float Scale = fi.f; + + XMVECTORF32 v = { + Scale * float( pSource->xm ), + Scale * float( pSource->ym ), + Scale * float( pSource->zm ), + 1.0f }; + return v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4 +( + const XMHALF4* pSource +) +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +#else + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN4 +( + const XMSHORTN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), + (pSource->z == -32768) ? -1.f : ((float)pSource->z * (1.0f/32767.0f)), + (pSource->w == -32768) ? 
-1.f : ((float)pSource->w * (1.0f/32767.0f))
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int16x4_t vInt = vld1_s16( (const int16_t*)pSource );
+    int32x4_t V = vmovl_s16( vInt );
+    V = vcvtq_f32_s32( V );
+    V = vmulq_n_f32( V, 1.0f/32767.0f );
+    return vmaxq_f32( V, vdupq_n_f32(-1.f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // x and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x and z - 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
+    // Convert to -1.0f - 1.0f
+    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+    // Clamp result (for case of -32768)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort4
+(
+    const XMSHORT4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int16x4_t vInt = vld1_s16( (const int16_t*)pSource );
+    int32x4_t V = vmovl_s16( vInt );
+    return vcvtq_f32_s32( V );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // x and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x and z - 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
+    // Fix y and w because they are 65536 too large
+    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN4
+(
+    const XMUSHORTN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x / 65535.0f,
+        (float)pSource->y / 65535.0f,
+        (float)pSource->z / 65535.0f,
+        (float)pSource->w / 65535.0f
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource );
+    uint32x4_t V = vmovl_u16( vInt );
+    V = vcvtq_f32_u32( V );
+    return vmulq_n_f32( V, 1.0f/65535.0f );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
+    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // y and w are signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // y and w + 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
+    // Fix y and w because they are 65536 too large
+    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort4
+(
+    const XMUSHORT4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource );
+    uint32x4_t V = vmovl_u16( vInt );
+    return vcvtq_f32_u32( V );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f};
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // y and w are signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // Fix y and w because they are 65536 too large
+    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
+    // y and w + 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDecN4
+(
+    const XMXDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f),
+        (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f),
+        (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
+        (float)(pSource->v >> 30) / 3.0f
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskA2B10G10R10);
+    vInt = veorq_u32(vInt,g_XMFlipA2B10G10R10);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,g_XMFixAA2B10G10R10);
+    R = vmulq_f32(R,g_XMNormalizeA2B10G10R10);
+    return vmaxq_f32( R, vdupq_n_f32(-1.0f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries
+    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
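+    // [Editorial note, not part of the original source: a worked example of
+    //  this flip/fix pair, assuming the usual DirectXMath constant values.
+    //  For the unsigned 2-bit w field, w = 3 splats as 0xC0000000, which the
+    //  signed conversion above would read as negative; the XOR flipped it to
+    //  0x40000000 (2^30), the add below contributes 2^31 to restore 3*2^30,
+    //  and the final multiply by 1/(3*2^30) yields w = 1.0f.]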
+    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
+    // Normalize x,y,z to -1.0f..1.0f and w to 0.0f..1.0f
+    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
+    // Clamp result (for case of -512)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDec4
+(
+    const XMXDEC4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]),
+        (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]),
+        (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]),
+        (float)(pSource->v >> 30)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
+    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    vInt = veorq_u32(vInt,XDec4Xor);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,XDec4Add);
+    return vmulq_f32(R,g_XMMulDec4);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
+    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
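+    // [Editorial note, not part of the original source: worked example for
+    //  the signed x field. x = -1 is stored as the 10-bit pattern 0x3FF; the
+    //  XOR with 0x200 gives 0x1FF = 511, and the add below contributes -512,
+    //  recovering -1.0f after the integer-to-float conversion.]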
+    vTemp = _mm_add_ps(vTemp,XDec4Add);
+    // Undo the implicit scale of the upper fields
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4
+(
+    const XMUDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)ElementX / 1023.0f,
+        (float)ElementY / 1023.0f,
+        (float)ElementZ / 1023.0f,
+        (float)(pSource->v >> 30) / 3.0f
+    };
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    float32x4_t R = vcvtq_f32_u32( vInt );
+    return vmulq_f32(R,UDecN4Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Normalize to 0.0f..1.0f
+    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
+    return vTemp;
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4_XR
+(
+    const XMUDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    int32_t ElementX = pSource->v & 0x3FF;
+    int32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    int32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)(ElementX - 0x180) / 510.0f,
+        (float)(ElementY - 0x180) / 510.0f,
+        (float)(ElementZ - 0x180) / 510.0f,
+        (float)(pSource->v >> 30) / 3.0f
+    };
+
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 };
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    int32x4_t vTemp = vsubq_s32( vreinterpretq_s32_u32(vInt), XRBias );
+    vTemp = veorq_u32( vTemp, g_XMFlipW );
+    float32x4_t R = vcvtq_f32_s32( vTemp );
+    R = vaddq_f32(R,g_XMAddUDec4);
+    return vmulq_f32(R,XRMul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 };
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask channels
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Subtract bias
+    vTemp = _mm_castsi128_ps( _mm_sub_epi32( _mm_castps_si128(vTemp), XRBias ) );
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Convert to 0.0f-1.0f
+    return _mm_mul_ps(vTemp,XRMul);
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDec4
+(
+    const XMUDEC4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)ElementX,
+        (float)ElementY,
+        (float)ElementZ,
+        (float)(pSource->v >> 30)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    float32x4_t R = vcvtq_f32_u32( vInt );
+    return vmulq_f32(R,g_XMMulDec4);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Undo the implicit scale of the upper fields
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDecN4
+(
+    const XMDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+    uint32_t ElementW = pSource->v >> 30;
+
+    XMVECTORF32 vResult = {
+        (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f),
+        (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f),
+        (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
+        (ElementW == 0x2) ? -1.f : ((float)(int16_t)(ElementW | SignExtendW[(ElementW >> 1) & 1]))
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    vInt = veorq_u32(vInt,g_XMXorDec4);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,g_XMAddDec4);
+    R = vmulq_f32(R,DecN4Mul);
+    return vmaxq_f32( R, vdupq_n_f32(-1.0f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the sign bits of the 10-bit fields to biased form (w needs no flip)
+    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // Subtract the field biases to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
+    // Normalize to -1.0f..1.0f
+    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
+    // Clamp result (for case of -512/-1)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDec4
+(
+    const XMDEC4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+    uint32_t ElementW = pSource->v >> 30;
+
+    XMVECTORF32 vResult = {
+        (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]),
+        (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]),
+        (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]),
+        (float)(int16_t)(ElementW | SignExtendW[ElementW >> 1])
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    vInt = veorq_u32(vInt,g_XMXorDec4);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,g_XMAddDec4);
+    return vmulq_f32(R,g_XMMulDec4);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the sign bits of the 10-bit fields to biased form (w needs no flip)
+    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // Subtract the field biases to complete the conversion
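+    // [Editorial note, not part of the original source: worked example for
+    //  the y field, which sits at bits 10-19 and is therefore read 1024x too
+    //  large, assuming the usual DirectXMath constant values. y = 2 masks to
+    //  2048; the XOR with 0x200<<10 gives 526336, the add below contributes
+    //  -512*1024 = -524288 leaving 2048, and the final multiply by 1/1024
+    //  recovers 2.0f.]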
+    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
+    // Undo the implicit scale of the upper fields
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN4
+(
+    const XMUBYTEN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x / 255.0f,
+        (float)pSource->y / 255.0f,
+        (float)pSource->z / 255.0f,
+        (float)pSource->w / 255.0f
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
+    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_n_f32( R, 1.0f/255.0f );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // w is signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // w + 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte4
+(
+    const XMUBYTE4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
+    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
+    return vcvtq_f32_u32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // w is signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // w + 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN4
+(
+    const XMBYTEN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (pSource->x == -128) ? -1.f : ((float)pSource->x / 127.0f),
+        (pSource->y == -128) ? -1.f : ((float)pSource->y / 127.0f),
+        (pSource->z == -128) ? -1.f : ((float)pSource->z / 127.0f),
+        (pSource->w == -128) ? -1.f : ((float)pSource->w / 127.0f)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
+    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
+    float32x4_t R = vcvtq_f32_s32(vInt);
+    R = vmulq_n_f32( R, 1.0f/127.0f );
+    return vmaxq_f32( R, vdupq_n_f32(-1.f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // x,y and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x, y and z - 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
+    // Clamp result (for case of -128)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte4
+(
+    const XMBYTE4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
+    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
+    return vcvtq_f32_s32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // x,y and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x, y and z - 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUNibble4
+(
+    const XMUNIBBLE4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        float(pSource->v & 0xF),
+        float((pSource->v >> 4) & 0xF),
+        float((pSource->v >> 8) & 0xF),
+        float((pSource->v >> 12) & 0xF)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
+    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
+    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
+    uint32x4_t vInt = vmovl_u16( vInt16 );
+    vInt = vandq_u32(vInt,UNibble4And);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R,UNibble4Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
+    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
+    // Get the 32 bit value and splat it
+    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask off x, y, z and w
+    vResult = _mm_and_ps(vResult,UNibble4And);
+    // Convert to float
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Normalize x, y, z and w
+    vResult = _mm_mul_ps(vResult,UNibble4Mul);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU555
+(
+    const XMU555* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        float(pSource->v & 0x1F),
+        float((pSource->v >> 5) & 0x1F),
+        float((pSource->v >> 10) & 0x1F),
+        float((pSource->v >> 15) & 0x1)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
+    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
+    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
+    uint32x4_t vInt = vmovl_u16( vInt16 );
+    vInt = vandq_u32(vInt,U555And);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R,U555Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
+    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
+    // Get the 32 bit value and splat it
+    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask off x, y, z and w
+    vResult = _mm_and_ps(vResult,U555And);
+    // Convert to float
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Normalize x, y, z and w
+    vResult = _mm_mul_ps(vResult,U555Mul);
+    return vResult;
+#endif
+}
+
+#pragma prefast(pop)
+
+/****************************************************************************
+ *
+ * Vector and matrix store operations
+ *
+ ****************************************************************************/
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreColor
+(
+    XMCOLOR* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
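+    // [Editorial worked example, not part of the original source: for
+    //  V = (1.0f, 0.5f, 0.0f, 1.0f) the saturate/scale/round below produces
+    //  the bytes (255, 128, 0, 255), which pack into c = 0xFFFF8000 in ARGB
+    //  order (w in bits 24-31, x in 16-23, y in 8-15, z in 0-7).]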
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->c = ((uint32_t)tmp.w << 24) |
+                      ((uint32_t)tmp.x << 16) |
+                      ((uint32_t)tmp.y << 8) |
+                      ((uint32_t)tmp.z);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 255.0f );
+    R = XMVectorRound(R);
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    uint32_t rgba = vget_lane_u32( vreinterpret_u32_u8(vInt8), 0 );
+    pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Set <0 to 0
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    // Set >1 to 1
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Convert to 0-255
+    vResult = _mm_mul_ps(vResult,g_UByteMax);
+    // Shuffle RGBA to ARGB
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
+    // Convert to int
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Mash to shorts
+    vInt = _mm_packs_epi32(vInt,vInt);
+    // Mash to bytes
+    vInt = _mm_packus_epi16(vInt,vInt);
+    // Store the color
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->c),_mm_castsi128_ps(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreHalf2
+(
+    XMHALF2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
+#else
+    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
+    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShortN2
+(
+    XMSHORTN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 32767.0f );
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_ShortMax);
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    vResulti = _mm_packs_epi32(vResulti,vResulti);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShort2
+(
+    XMSHORT2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f) );
+    R = vminq_f32(R, vdupq_n_f32(32767.0f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
+    vResult = _mm_min_ps(vResult,g_ShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Pack the ints into shorts
+    vInt = _mm_packs_epi32(vInt,vInt);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShortN2
+(
+    XMUSHORTN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 65535.0f );
+    R = vaddq_f32( R, g_XMOneHalf );
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_UShortMax);
+    vResult = _mm_add_ps(vResult,g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShort2
+(
+    XMUSHORT2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(65535.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByteN2
+(
+    XMBYTEN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 127.0f );
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,g_ByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByte2
+(
+    XMBYTE2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) );
+    R = vminq_f32(R, vdupq_n_f32(127.0f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_ByteMin);
+    vResult = _mm_min_ps(vResult,g_ByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByteN2
+(
+    XMUBYTEN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 255.0f );
+    R = vaddq_f32( R, g_XMOneHalf );
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,g_UByteMax);
+    vResult = _mm_add_ps(vResult,g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByte2
+(
+    XMUBYTE2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(255.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreU565
+(
+    XMU565* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) |
+                      (((uint16_t)tmp.y & 0x3F) << 5) |
+                      (((uint16_t)tmp.x & 0x1F));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+    static const XMVECTORF32 Scale = {1.0f,32.f,32.f*64.f, 0.f };
+    static const XMVECTORU32 Mask = {0x1F,0x3F<<5,0x1F<<11,0};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
+    vResult = vminq_f32(vResult,Max);
+    vResult = vmulq_f32(vResult,Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vhi = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
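+    // [Editorial note, not part of the original source: _mm_extract_epi16
+    //  indexes 16-bit lanes, so the even indices 0/2/4 below pick the low
+    //  half of the 32-bit x/y/z lanes; the values always fit in 16 bits
+    //  because they were clamped to 31/63/31 above.]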
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    pDestination->v = ((z & 0x1F) << 11) |
+                      ((y & 0x3F) << 5) |
+                      ((x & 0x1F));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreFloat3PK
+(
+    XMFLOAT3PK* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+
+    __declspec(align(16)) uint32_t IValue[4];
+    XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V );
+
+    uint32_t Result[3];
+
+    // X & Y Channels (5-bit exponent, 6-bit mantissa)
+    for(uint32_t j=0; j < 2; ++j)
+    {
+        uint32_t Sign = IValue[j] & 0x80000000;
+        uint32_t I = IValue[j] & 0x7FFFFFFF;
+
+        if ((I & 0x7F800000) == 0x7F800000)
+        {
+            // INF or NAN
+            Result[j] = 0x7c0;
+            if (( I & 0x7FFFFF ) != 0)
+            {
+                Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
+            }
+            else if ( Sign )
+            {
+                // -INF is clamped to 0 since 3PK is positive only
+                Result[j] = 0;
+            }
+        }
+        else if ( Sign )
+        {
+            // 3PK is positive only, so clamp to zero
+            Result[j] = 0;
+        }
+        else if (I > 0x477E0000U)
+        {
+            // The number is too large to be represented as a float11, set to max
+            Result[j] = 0x7BF;
+        }
+        else
+        {
+            if (I < 0x38800000U)
+            {
+                // The number is too small to be represented as a normalized float11
+                // Convert it to a denormalized value.
+                uint32_t Shift = 113U - (I >> 23U);
+                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+            }
+            else
+            {
+                // Rebias the exponent to represent the value as a normalized float11
+                I += 0xC8000000U;
+            }
+
+            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
+        }
+    }
+
+    // Z Channel (5-bit exponent, 5-bit mantissa)
+    uint32_t Sign = IValue[2] & 0x80000000;
+    uint32_t I = IValue[2] & 0x7FFFFFFF;
+
+    if ((I & 0x7F800000) == 0x7F800000)
+    {
+        // INF or NAN
+        Result[2] = 0x3e0;
+        if ( I & 0x7FFFFF )
+        {
+            Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
+        }
+        else if ( Sign )
+        {
+            // -INF is clamped to 0 since 3PK is positive only
+            Result[2] = 0;
+        }
+    }
+    else if ( Sign )
+    {
+        // 3PK is positive only, so clamp to zero
+        Result[2] = 0;
+    }
+    else if (I > 0x477C0000U)
+    {
+        // The number is too large to be represented as a float10, set to max
+        Result[2] = 0x3df;
+    }
+    else
+    {
+        if (I < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized float10
+            // Convert it to a denormalized value.
+            uint32_t Shift = 113U - (I >> 23U);
+            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized float10
+            I += 0xC8000000U;
+        }
+
+        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
+    }
+
+    // Pack Result into memory
+    pDestination->v = (Result[0] & 0x7ff)
+                      | ( (Result[1] & 0x7ff) << 11 )
+                      | ( (Result[2] & 0x3ff) << 22 );
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreFloat3SE
+(
+    XMFLOAT3SE* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+
+    XMFLOAT3A tmp;
+    XMStoreFloat3A( &tmp, V );
+
+    static const float maxf9 = float(0x1FF << 7);
+    static const float minf9 = float(1.f / (1 << 16));
+
+    float x = (tmp.x >= 0.f) ? ( (tmp.x > maxf9) ? maxf9 : tmp.x ) : 0.f;
+    float y = (tmp.y >= 0.f) ? ( (tmp.y > maxf9) ? maxf9 : tmp.y ) : 0.f;
+    float z = (tmp.z >= 0.f) ? ( (tmp.z > maxf9) ? maxf9 : tmp.z ) : 0.f;
+
+    const float max_xy = (x > y) ? x : y;
+    const float max_xyz = (max_xy > z) ? max_xy : z;
+
+    const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
+
+    union { float f; int32_t i; } fi;
+    fi.f = maxColor;
+    fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
+
+    uint32_t exp = fi.i >> 23;
+    pDestination->e = exp - 0x6f;
+
+    fi.i = 0x83000000 - (exp << 23);
+    float ScaleR = fi.f;
+
+#ifdef _XM_NO_ROUNDF_
+    pDestination->xm = static_cast<uint32_t>( Internal::round_to_nearest(x * ScaleR) );
+    pDestination->ym = static_cast<uint32_t>( Internal::round_to_nearest(y * ScaleR) );
+    pDestination->zm = static_cast<uint32_t>( Internal::round_to_nearest(z * ScaleR) );
+#else
+    pDestination->xm = static_cast<uint32_t>( lroundf(x * ScaleR) );
+    pDestination->ym = static_cast<uint32_t>( lroundf(y * ScaleR) );
+    pDestination->zm = static_cast<uint32_t>( lroundf(z * ScaleR) );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreHalf4
+(
+    XMHALF4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
+#else
+    XMFLOAT4A t;
+    XMStoreFloat4A(&t, V );
+
+    pDestination->x = XMConvertFloatToHalf(t.x);
+    pDestination->y = XMConvertFloatToHalf(t.y);
+    pDestination->z = XMConvertFloatToHalf(t.z);
+    pDestination->w = XMConvertFloatToHalf(t.w);
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShortN4
+(
+    XMSHORTN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+    pDestination->z = (int16_t)tmp.z;
+    pDestination->w = (int16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(-1.f) );
+    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
+    vResult = vmulq_n_f32( vResult, 32767.0f );
+    vResult = vcvtq_s32_f32( vResult );
+    int16x4_t vInt = vmovn_s32( vResult );
+    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_ShortMax);
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    vResulti = _mm_packs_epi32(vResulti,vResulti);
+    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShort4
+(
+    XMSHORT4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+    pDestination->z = (int16_t)tmp.z;
+    pDestination->w = (int16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, g_ShortMin );
+    vResult = vminq_f32( vResult, g_ShortMax );
+    vResult = vcvtq_s32_f32( vResult );
+    int16x4_t vInt = vmovn_s32( vResult );
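+    // [Editorial note, not part of the original source: vmovn_s32 narrows by
+    //  truncating each 32-bit lane to its low 16 bits without saturating;
+    //  that is only safe here because of the preceding clamp to the
+    //  [-32767, 32767] range.]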
+    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
+    vResult = _mm_min_ps(vResult,g_ShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Pack the ints into shorts
+    vInt = _mm_packs_epi32(vInt,vInt);
+    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShortN4
+(
+    XMUSHORTN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+    pDestination->z = (uint16_t)tmp.z;
+    pDestination->w = (uint16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
+    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
+    vResult = vmulq_n_f32( vResult, 65535.0f );
+    vResult = vaddq_f32( vResult, g_XMOneHalf );
+    vResult = vcvtq_u32_f32( vResult );
+    uint16x4_t vInt = vmovn_u32( vResult );
+    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_UShortMax);
+    vResult = _mm_add_ps(vResult,g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShort4
+(
+    XMUSHORT4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+    pDestination->z = (uint16_t)tmp.z;
+    pDestination->w = (uint16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
+    vResult = vminq_f32( vResult, g_UShortMax );
+    vResult = vcvtq_u32_f32( vResult );
+    uint16x4_t vInt = vmovn_u32( vResult );
+    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+#endif
+}
+
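+// [Editorial usage sketch, not part of the original source: round-tripping a
+//  normalized vector through XMUSHORTN4. The store scales by 65535 and adds
+//  0.5 before truncating, so 0.5f packs as 32768 and reloads as 32768/65535,
+//  roughly 0.500008f:
+//
+//      using namespace DirectX;
+//      using namespace DirectX::PackedVector;
+//
+//      XMUSHORTN4 packed;
+//      XMStoreUShortN4(&packed, XMVectorSet(0.f, 0.25f, 0.5f, 1.f));
+//      XMVECTOR v = XMLoadUShortN4(&packed); // ~(0.f, 0.250004f, 0.500008f, 1.f)
+// ]
+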
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) << 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; + static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; + float32x4_t vResult = vmaxq_f32(V,Min); + vResult = vminq_f32(vResult,vdupq_n_f32(1.0f)); + vResult = vmulq_f32(vResult,Scale); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,ScaleMask); + int32x4_t vResultw = vandq_s32(vResulti,g_XMMaskW); + vResulti = vaddq_s32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; + static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); + vResulti = _mm_add_epi32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreXDec4 +( + XMXDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, 0.0f}; + static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) 
<< 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; + static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + float32x4_t vResult = vmaxq_f32(V,MinXDec4); + vResult = vminq_f32(vResult,MaxXDec4); + vResult = vmulq_f32(vResult,ScaleXDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vTemp2 = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_s32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; + static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinXDec4); + vResult = _mm_min_ps(vResult,MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +#pragma warning(pop) + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((uint32_t)tmp.z & 0x3FF) << 20) | + (((uint32_t)tmp.y & 0x3FF) << 10) | + (((uint32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult,ScaleUDecN4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, 
vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreUDecN4_XR +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; + + XMVECTOR N = XMVectorMultiplyAdd( V, Scale, Bias ); + N = XMVectorClamp( N, g_XMZero, C ); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) + | (((uint32_t)tmp.z & 0x3FF) << 20) + | (((uint32_t)tmp.y & 0x3FF) << 10) + | (((uint32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; + float32x4_t vResult = vmlaq_f32( Bias, V, Scale ); + vResult = vmaxq_f32(vResult,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,C); + vResult = vmulq_f32(vResult,Shift); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUDec4
+(
+    XMUDEC4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((uint32_t)tmp.w << 30) |
+                      (((uint32_t)tmp.z & 0x3FF) << 20) |
+                      (((uint32_t)tmp.y & 0x3FF) << 10) |
+                      (((uint32_t)tmp.x & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
+    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
+    static const XMVECTORI32 MaskUDec4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f));
+    vResult = vminq_f32(vResult,MaxUDec4);
+    vResult = vmulq_f32(vResult,ScaleUDec4);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,MaskUDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32( vTemp, 1 );
+    vTemp2 = vadd_u32( vTemp2, vTemp2 );
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    vst1_lane_u32( &pDestination->v, vTemp, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
+    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
+    static const XMVECTORI32 MaskUDec4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,MaxUDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleUDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskUDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // Perform a single bit left shift on y|w
+    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreDecN4
+(
+    XMDECN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f};
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, Scale.v);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((int32_t)tmp.w << 30) |
+                      (((int32_t)tmp.z & 0x3FF) << 20) |
+                      (((int32_t)tmp.y & 0x3FF) << 10) |
+                      (((int32_t)tmp.x & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(-1.f));
+    vResult = vminq_f32(vResult,vdupq_n_f32(1.f));
+    vResult = vmulq_f32(vResult,ScaleDecN4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u32( &pDestination->v, vTemp, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleDecN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreDec4
+(
+    XMDEC4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, -1.0f};
+    static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 1.0f};
+
+    XMVECTOR N = XMVectorClamp(V, Min, Max);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((int32_t)tmp.w << 30) |
+                      (((int32_t)tmp.z & 0x3FF) << 20) |
+                      (((int32_t)tmp.y & 0x3FF) << 10) |
+                      (((int32_t)tmp.x & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
+    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
+    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
+    float32x4_t vResult = vmaxq_f32(V,MinDec4);
+    vResult = vminq_f32(vResult,MaxDec4);
+    vResult = vmulq_f32(vResult,ScaleDec4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of all 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u32( &pDestination->v, vTemp, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
+    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
+    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
+    vResult = _mm_min_ps(vResult,MaxDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+#pragma warning(pop)
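+
+// Signed-field example (illustrative): XMStoreDecN4 maps -1.0f to -511, and
+// masking with 0x3FF keeps the low ten bits of the two's-complement value, so
+// the x field holds 0x201; consumers must sign-extend each 10-bit field when
+// unpacking.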
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByteN4
+(
+    XMUBYTEN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, g_UByteMax);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+    pDestination->z = (uint8_t)tmp.z;
+    pDestination->w = (uint8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 255.0f );
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
+    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // Perform a single bit left shift to fix y|w
+    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
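+
+// UNORM byte example (illustrative): 0.5f scales to 0.5 * 255 = 127.5 and the
+// truncating conversion stores 127 (0x7F); the NEON path narrows with
+// saturating vqmovn steps instead, which clamps out-of-range lanes for free.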
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByte4
+(
+    XMUBYTE4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+    pDestination->z = (uint8_t)tmp.z;
+    pDestination->w = (uint8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
+    R = vminq_f32(R, vdupq_n_f32(255.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
+    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UByteMax);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleUByte4);
+    // Convert to int by rounding
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskUByte4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // Perform a single bit left shift to fix y|w
+    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByteN4
+(
+    XMBYTEN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ByteMax);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+    pDestination->z = (int8_t)tmp.z;
+    pDestination->w = (int8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 127.0f );
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
+    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleByteN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskByteN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByte4
+(
+    XMBYTE4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+    pDestination->z = (int8_t)tmp.z;
+    pDestination->w = (int8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) );
+    R = vminq_f32(R, vdupq_n_f32(127.f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
+    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_ByteMin);
+    vResult = _mm_min_ps(vResult,g_ByteMax);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleByte4);
+    // Convert to int by rounding
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskByte4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
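+
+// Rounding note (illustrative): the normalized stores truncate after scaling
+// (_mm_cvttps_epi32) while the integer stores round to nearest
+// (_mm_cvtps_epi32); for example XMStoreUByte4 maps 126.6f to 127, but
+// XMStoreUByteN4 maps 126.6f / 255 to 126 because 126.6 truncates.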
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUNibble4
+(
+    XMUNIBBLE4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) |
+                      (((uint16_t)tmp.z & 0xF) << 8) |
+                      (((uint16_t)tmp.y & 0xF) << 4) |
+                      (((uint16_t)tmp.x & 0xF));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+    static const XMVECTORF32 Scale = {1.0f,16.f,16.f*16.f,16.f*16.f*16.f};
+    static const XMVECTORU32 Mask = {0xF,0xF<<4,0xF<<8,0xF<<12};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
+    vResult = vminq_f32(vResult,Max);
+    vResult = vmulq_f32(vResult,Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vhi = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+    pDestination->v = ((w & 0xF) << 12) |
+                      ((z & 0xF) << 8) |
+                      ((y & 0xF) << 4) |
+                      ((x & 0xF));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreU555
+(
+    XMU555* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((tmp.w > 0.f) ? 0x8000 : 0) |
+                      (((uint16_t)tmp.z & 0x1F) << 10) |
+                      (((uint16_t)tmp.y & 0x1F) << 5) |
+                      (((uint16_t)tmp.x & 0x1F));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+    static const XMVECTORF32 Scale = {1.0f,32.f/2.f,32.f*32.f,32.f*32.f*32.f/2.f};
+    static const XMVECTORU32 Mask = {0x1F,0x1F<<(5-1),0x1F<<10,0x1<<(15-1)};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
+    vResult = vminq_f32(vResult,Max);
+    vResult = vmulq_f32(vResult,Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32( vTemp, 1 );
+    vTemp2 = vadd_u32( vTemp2, vTemp2 );
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+    pDestination->v = ((w) ? 0x8000 : 0) |
+                      ((z & 0x1F) << 10) |
+                      ((y & 0x1F) << 5) |
+                      ((x & 0x1F));
+#endif
+}
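+
+// Extraction note (illustrative): after _mm_cvtps_epi32 the four results are
+// small non-negative 32-bit values, so _mm_extract_epi16 with lane indices
+// 0, 2, 4 and 6 reads each value's low 16 bits directly; SSE2 has no packed
+// 16-bit store of this shape, hence the scalar repack.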
+
+
+/****************************************************************************
+ *
+ * XMCOLOR operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMCOLOR::XMCOLOR
+(
+    float _r,
+    float _g,
+    float _b,
+    float _a
+)
+{
+    XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMCOLOR::XMCOLOR
+(
+    const float* pArray
+)
+{
+    XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMHALF2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMHALF2::XMHALF2
+(
+    float _x,
+    float _y
+)
+{
+    x = XMConvertFloatToHalf(_x);
+    y = XMConvertFloatToHalf(_y);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMHALF2::XMHALF2
+(
+    const float* pArray
+)
+{
+    assert( pArray != nullptr );
+    x = XMConvertFloatToHalf(pArray[0]);
+    y = XMConvertFloatToHalf(pArray[1]);
+}
+
+/****************************************************************************
+ *
+ * XMSHORTN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORTN2::XMSHORTN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORTN2::XMSHORTN2
+(
+    const float* pArray
+)
+{
+    XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMSHORT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORT2::XMSHORT2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORT2::XMSHORT2
+(
+    const float* pArray
+)
+{
+    XMStoreShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORTN2::XMUSHORTN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORTN2::XMUSHORTN2
+(
+    const float* pArray
+)
+{
+    XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
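+
+// Usage sketch (illustrative): these constructors simply forward to the Store
+// functions; e.g. XMCOLOR(1.0f, 0.0f, 0.0f, 1.0f) packs opaque red as
+// 0xFFFF0000 (8:8:8:8 ARGB), and XMSHORTN2(1.0f, -1.0f) stores x = 32767,
+// y = -32767.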
+
+/****************************************************************************
+ *
+ * XMUSHORT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORT2::XMUSHORT2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORT2::XMUSHORT2
+(
+    const float* pArray
+)
+{
+    XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTEN2::XMBYTEN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTEN2::XMBYTEN2
+(
+    const float* pArray
+)
+{
+    XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTE2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTE2::XMBYTE2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTE2::XMBYTE2
+(
+    const float* pArray
+)
+{
+    XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTEN2::XMUBYTEN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTEN2::XMUBYTEN2
+(
+    const float* pArray
+)
+{
+    XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTE2::XMUBYTE2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTE2::XMUBYTE2
+(
+    const float* pArray
+)
+{
+    XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMU565 operators
+ *
+ ****************************************************************************/
+
+inline PackedVector::XMU565::XMU565
+(
+    float _x,
+    float _y,
+    float _z
+)
+{
+    XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+_Use_decl_annotations_
+inline PackedVector::XMU565::XMU565
+(
+    const float *pArray
+)
+{
+    XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3PK operators
+ *
+ ****************************************************************************/
+
+inline PackedVector::XMFLOAT3PK::XMFLOAT3PK
+(
+    float _x,
+    float _y,
+    float _z
+)
+{
+    XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+_Use_decl_annotations_
+inline PackedVector::XMFLOAT3PK::XMFLOAT3PK
+(
+    const float *pArray
+)
+{
+    XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3SE operators
+ *
+ ****************************************************************************/
+
+inline PackedVector::XMFLOAT3SE::XMFLOAT3SE
+(
+    float _x,
+    float _y,
+    float _z
+)
+{
+    XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+_Use_decl_annotations_
+inline PackedVector::XMFLOAT3SE::XMFLOAT3SE
+(
+    const float *pArray
+)
+{
+    XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMHALF4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMHALF4::XMHALF4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    x = XMConvertFloatToHalf(_x);
+    y = XMConvertFloatToHalf(_y);
+    z = XMConvertFloatToHalf(_z);
+    w = XMConvertFloatToHalf(_w);
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline PackedVector::XMHALF4::XMHALF4
+(
+    const float* pArray
+)
+{
+    XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4);
+}
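+
+// Half-precision note (illustrative): XMConvertFloatToHalf(1.0f) yields
+// 0x3C00, and the stream conversion above writes all four halves in one pass
+// using the given output (sizeof(HALF)) and input (sizeof(float)) strides.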
+
+/****************************************************************************
+ *
+ * XMSHORTN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORTN4::XMSHORTN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORTN4::XMSHORTN4
+(
+    const float* pArray
+)
+{
+    XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMSHORT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORT4::XMSHORT4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORT4::XMSHORT4
+(
+    const float* pArray
+)
+{
+    XMStoreShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORTN4::XMUSHORTN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORTN4::XMUSHORTN4
+(
+    const float* pArray
+)
+{
+    XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORT4::XMUSHORT4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORT4::XMUSHORT4
+(
+    const float* pArray
+)
+{
+    XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMXDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMXDECN4::XMXDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMXDECN4::XMXDECN4
+(
+    const float* pArray
+)
+{
+    XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMXDEC4 operators
+ *
+ ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMXDEC4::XMXDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMXDEC4::XMXDEC4
+(
+    const float* pArray
+)
+{
+    XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMDECN4::XMDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMDECN4::XMDECN4
+(
+    const float* pArray
+)
+{
+    XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMDEC4::XMDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMDEC4::XMDEC4
+(
+    const float* pArray
+)
+{
+    XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * XMUDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUDECN4::XMUDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUDECN4::XMUDECN4
+(
+    const float* pArray
+)
+{
+    XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUDEC4::XMUDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUDEC4::XMUDEC4
+(
+    const float* pArray
+)
+{
+    XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTEN4::XMBYTEN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTEN4::XMBYTEN4
+(
+    const float* pArray
+)
+{
+    XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTE4::XMBYTE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTE4::XMBYTE4
+(
+    const float* pArray
+)
+{
+    XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTEN4::XMUBYTEN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTEN4::XMUBYTEN4
+(
+    const float* pArray
+)
+{
+    XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTE4::XMUBYTE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTE4::XMUBYTE4
+(
+    const float* pArray
+)
+{
+    XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUNIBBLE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUNIBBLE4::XMUNIBBLE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w ));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUNIBBLE4::XMUNIBBLE4
+(
+    const float *pArray
+)
+{
+    XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMU555 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMU555::XMU555
+(
+    float _x,
+    float _y,
+    float _z,
+    bool _w
+)
+{
+    XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) ));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMU555::XMU555
+(
+    const float *pArray,
+    bool _w
+)
+{
+    XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray));
+    XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) ));
+}
+
+
diff --git a/MIT.txt b/MIT.txt
index 96e5e14..1abfa2b 100644
--- a/MIT.txt
+++ b/MIT.txt
@@ -1,21 +1,21 @@
- The MIT License (MIT)
-
-Copyright (c) 2016 Microsoft Corp
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies
-or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
-PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
-CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
-OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- + The MIT License (MIT) + +Copyright (c) 2016 Microsoft Corp + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies +or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/ReadMe.txt b/ReadMe.txt index efd995b..49a4730 100644 --- a/ReadMe.txt +++ b/ReadMe.txt @@ -1,131 +1,131 @@ ------------ -DirectXMath ------------ - -Copyright (c) Microsoft Corporation. All rights reserved. - -June 2016 - -This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library +----------- +DirectXMath +----------- + +Copyright (c) Microsoft Corporation. All rights reserved. + +June 2016 + +This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library for use in games and graphics apps - - -This code is designed to build with Visual Studio 2013 or 2015. It is recommended that you -make use of VS 2013 Update 5 or VS 2015 Update 2. - -These components are designed to work without requiring any content from the DirectX SDK. For details, -see "Where is the DirectX SDK?" . - -Inc\ - DirectXMath Files (in the DirectX C++ namespace) - DirectXMath.h - Core library - DirectXPackedVector.h - Load/Store functions and types for working with various compressed GPU formats - DirectXColors.h - .NET-style Color defines in sRGB color space - DirectXCollision.h - Bounding volume collision library - -Extentions\ - Advanced instruction set variants for guarded codepaths - DirectXMathSSE3.h - SSE3 - DirectXMathBE.h - Supplemental SSE3 (SSSE3) - DirectXMathSSE4.h - SSE4.1 - DirectXMathAVX.h - Advanced Vector Extensions (AVX) - DirectXMathAVX2.h - Advanced Vector Extensions 2 (AVX2) - DirectXMathF16C.h - Half-precision conversions (F16C) - DirectXMathFMA3.h - Fused multiply-accumulate (FMA3) - DirectXMathFMA4.h - Fused multiply-accumulate (FMA4) - -SHMath\ - Spherical Harmonics math functions - DirectXSH.h - Header for SHMath functions - DirectXSH.cpp, DirectXSHD3D11.cpp - Implementation - -XDSP\ - XDSP.h - Digital Signal Processing helper functions - -All content and source code for this package are subject to the terms of the MIT License. -. - -Documentation is available at . - -For the latest version of DirectXMath, bug reports, etc. please visit the project site. - - -This project has adopted the Microsoft Open Source Code of Conduct. For more information see the -Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments. 
- -https://opensource.microsoft.com/codeofconduct/ - - ---------------- -RELEASE HISTORY ---------------- - -June 2016 (3.09) - Includes support for additional optimizations when built with /arch:AVX or /arch:AVX2 - Added use of constexpr for type constructors, XMConvertToRadians, and XMConvertToDegrees - Marked __vector4i, XMXDEC4, XMDECN4, XMDEC4, and associated Load & Store functions as deprecated. - These are vestiges of Xbox 360 support and will be removed in a future release - Renamed parameter in XMMatrixPerspectiveFov* to reduce user confusion when relying on IntelliSense - XMU565, XMUNIBBLE4 constructors take uint8_t instead of int8_t - -May 2016 - DirectXMath 3.08 released under the MIT license - -November 2015 (3.08) - Added use of _mm_sfence for Stream methods - Fixed bug with non-uniform scaling transforms for BoundingOrientedBox - Added asserts for Near/FarZ in XMMatrix* methods - Added use of =default for PODs with VS 2013/2015 - Additional SSE and ARM-NEON optimizations for PackedVector functions - -April 2015 (3.07) - Fix customer reported bugs in BoundingBox methods - Fix customer reported bug in XMStoreFloat3SE - Fix customer reported bug in XMVectorATan2, XMVectorATan2Est - Fix customer reported bug in XMVectorRound - -October 2013 (3.06) - Fixed load/store of XMFLOAT3SE to properly match the DXGI_FORMAT_R9G9B9E5_SHAREDEXP - Added XMLoadUDecN4_XR and XMStoreUDecN4_XR to match DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM - Added XMColorRGBToSRGB and XMColorSRGBToRGB to convert linear RGB <-> sRGB - -July 2013 (3.05) - Use x86/x64 __vectorcall calling-convention when available (XM_CALLCONV, HXMVECTOR, FXMMATRIX introduced) - Fixed bug with XMVectorFloor and XMVectorCeiling when given whole odd numbers (i.e. 105.0) - Improved XMVectorRound algorithm - ARM-NEON optimizations for XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE - ARM-NEON code paths use multiply-by-scalar intrinsics when supported - Additional optimizations for ARM-NEON Stream functions - Fixed potential warning C4723 using operator/ or operator/= - -March 2013 (3.04) - XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE functions added to provide base-e support in addition to the existing base-2 support - XMVectorExp and XMVectorLog are now aliases for XMVectorExp2 and XMVectorLog2 - Additional optimizations for Stream functions - XMVector3Cross now ensures w component is zero on ARM - XMConvertHalfToFloat and XMConvertFloatToHalf now use IEEE 754 standard float16 behavior for INF/QNAN - Updated matrix version Transform for BoundingOrientedBox and BoundingFrustum to handle scaling - -March 2012 (3.03) - Breaking change: Removed union members from XMMATRIX type to make it a fully 'opaque' type - Marked single-parameter C++ constructors for XMFLOAT2, XMFLOAT2A, XMFLOAT3, XMFLOAT3A, XMFLOAT4, and XMFLOAT4A explicit - -February 2012 (3.02) - ARM-NEON intrinsics (selected by default for the ARM platform) - reworked XMVectorPermute, change of XM_PERMUTE_ defines, removal of XMVectorPermuteControl - Addition of XM_SWIZZLE_ defines - Optimizations for transcendental functions - Template forms for permute, swizzle, shift-left, rotate-left, rotation-right, and insert - Removal of deprecated types and functions - (XM_CACHE_LINE_SIZE define, XMVectorExpEst, XMVectorLogEst, XMVectorPowEst, XMVectorSinHEs, XMVectorCosHEst, XMVectorTanHEst, - XMVector2InBoundsR, XMVector3InBoundsR, XMVector4InBoundsR) - Removed XM_STRICT_VECTOR4; XMVECTOR in NO-INTRINSICS always defined without .x, .y, .z, .w, .v, or .u - 
Additional bounding types
-   SAL fixes and improvements
-
-September 2011 (3.00)
-   Renamed and reorganized the headers
-   Introduced C++ namespaces
-   Removed the Xbox 360-specific GPU types
-     (HENDN3, XMHEND3, XMUHENDN3, XMUHEND3, XMDHENN3, XMDHEN3,
-      XMUDHENN3, XMUDHEN3, XMXICON4, XMXICO4, XMICON4, XMICO4, XMUICON4, XMUICO4 )
+-----------
+DirectXMath
+-----------
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+June 2016
+
+This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library
+for use in games and graphics apps.
+
+
+This code is designed to build with Visual Studio 2013 or 2015. It is recommended that you
+make use of VS 2013 Update 5 or VS 2015 Update 2.
+
+These components are designed to work without requiring any content from the DirectX SDK. For details,
+see "Where is the DirectX SDK?" <https://msdn.microsoft.com/en-us/library/windows/desktop/ee663275.aspx>.
+
+Inc\
+    DirectXMath Files (in the DirectX C++ namespace)
+        DirectXMath.h - Core library
+        DirectXPackedVector.h - Load/Store functions and types for working with various compressed GPU formats
+        DirectXColors.h - .NET-style Color defines in sRGB color space
+        DirectXCollision.h - Bounding volume collision library
+
+Extensions\
+    Advanced instruction set variants for guarded codepaths
+        DirectXMathSSE3.h - SSE3
+        DirectXMathBE.h - Supplemental SSE3 (SSSE3)
+        DirectXMathSSE4.h - SSE4.1
+        DirectXMathAVX.h - Advanced Vector Extensions (AVX)
+        DirectXMathAVX2.h - Advanced Vector Extensions 2 (AVX2)
+        DirectXMathF16C.h - Half-precision conversions (F16C)
+        DirectXMathFMA3.h - Fused multiply-accumulate (FMA3)
+        DirectXMathFMA4.h - Fused multiply-accumulate (FMA4)
+
+SHMath\
+    Spherical Harmonics math functions
+        DirectXSH.h - Header for SHMath functions
+        DirectXSH.cpp, DirectXSHD3D11.cpp - Implementation
+
+XDSP\
+    XDSP.h - Digital Signal Processing helper functions
+
+All content and source code for this package are subject to the terms of the MIT License.
+<http://opensource.org/licenses/MIT>.
+
+Documentation is available at <https://msdn.microsoft.com/en-us/library/windows/desktop/hh437833.aspx>.
+
+For the latest version of DirectXMath, bug reports, etc. please visit the project site.
+
+
+This project has adopted the Microsoft Open Source Code of Conduct. For more information see the
+Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.
+
+https://opensource.microsoft.com/codeofconduct/
+
+
+---------------
+RELEASE HISTORY
+---------------
+
+June 2016 (3.09)
+    Includes support for additional optimizations when built with /arch:AVX or /arch:AVX2
+    Added use of constexpr for type constructors, XMConvertToRadians, and XMConvertToDegrees
+    Marked __vector4i, XMXDEC4, XMDECN4, XMDEC4, and associated Load & Store functions as deprecated.
+      These are vestiges of Xbox 360 support and will be removed in a future release
+    Renamed parameter in XMMatrixPerspectiveFov* to reduce user confusion when relying on IntelliSense
+    XMU565, XMUNIBBLE4 constructors take uint8_t instead of int8_t
+
+May 2016
+    DirectXMath 3.08 released under the MIT license
+
+November 2015 (3.08)
+    Added use of _mm_sfence for Stream methods
+    Fixed bug with non-uniform scaling transforms for BoundingOrientedBox
+    Added asserts for Near/FarZ in XMMatrix* methods
+    Added use of =default for PODs with VS 2013/2015
+    Additional SSE and ARM-NEON optimizations for PackedVector functions
+
+April 2015 (3.07)
+    Fix customer reported bugs in BoundingBox methods
+    Fix customer reported bug in XMStoreFloat3SE
+    Fix customer reported bug in XMVectorATan2, XMVectorATan2Est
+    Fix customer reported bug in XMVectorRound
+
+October 2013 (3.06)
+    Fixed load/store of XMFLOAT3SE to properly match the DXGI_FORMAT_R9G9B9E5_SHAREDEXP
+    Added XMLoadUDecN4_XR and XMStoreUDecN4_XR to match DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM
+    Added XMColorRGBToSRGB and XMColorSRGBToRGB to convert linear RGB <-> sRGB
+
+July 2013 (3.05)
+    Use x86/x64 __vectorcall calling-convention when available (XM_CALLCONV, HXMVECTOR, FXMMATRIX introduced)
+    Fixed bug with XMVectorFloor and XMVectorCeiling when given whole odd numbers (e.g. 105.0)
+    Improved XMVectorRound algorithm
+    ARM-NEON optimizations for XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE
+    ARM-NEON code paths use multiply-by-scalar intrinsics when supported
+    Additional optimizations for ARM-NEON Stream functions
+    Fixed potential warning C4723 using operator/ or operator/=
+
+March 2013 (3.04)
+    XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE functions added to provide base-e support in addition to the existing base-2 support
+    XMVectorExp and XMVectorLog are now aliases for XMVectorExp2 and XMVectorLog2
+    Additional optimizations for Stream functions
+    XMVector3Cross now ensures w component is zero on ARM
+    XMConvertHalfToFloat and XMConvertFloatToHalf now use IEEE 754 standard float16 behavior for INF/QNAN
+    Updated matrix version Transform for BoundingOrientedBox and BoundingFrustum to handle scaling
+
+March 2012 (3.03)
+    Breaking change: Removed union members from XMMATRIX type to make it a fully 'opaque' type
+    Marked single-parameter C++ constructors for XMFLOAT2, XMFLOAT2A, XMFLOAT3, XMFLOAT3A, XMFLOAT4, and XMFLOAT4A explicit
+
+February 2012 (3.02)
+    ARM-NEON intrinsics (selected by default for the ARM platform)
+    Reworked XMVectorPermute, change of XM_PERMUTE_ defines, removal of XMVectorPermuteControl
+    Addition of XM_SWIZZLE_ defines
+    Optimizations for transcendental functions
+    Template forms for permute, swizzle, shift-left, rotate-left, rotate-right, and insert
+    Removal of deprecated types and functions
+      (XM_CACHE_LINE_SIZE define, XMVectorExpEst, XMVectorLogEst, XMVectorPowEst, XMVectorSinHEst, XMVectorCosHEst, XMVectorTanHEst,
+       XMVector2InBoundsR, XMVector3InBoundsR, XMVector4InBoundsR)
+    Removed XM_STRICT_VECTOR4; XMVECTOR in NO-INTRINSICS always defined without .x, .y, .z, .w, .v, or .u
+    Additional bounding types
+    SAL fixes and improvements
+
+September 2011 (3.00)
+    Renamed and reorganized the headers
+    Introduced C++ namespaces
+    Removed the Xbox 360-specific GPU types
+      (HENDN3, XMHEND3, XMUHENDN3, XMUHEND3, XMDHENN3, XMDHEN3,
+       XMUDHENN3, XMUDHEN3, XMXICON4, XMXICO4, XMICON4, XMICO4, XMUICON4, XMUICO4 )
diff --git a/SHMath/DirectXSH.cpp
index d66a35a..c4191b7
100644 --- a/SHMath/DirectXSH.cpp +++ b/SHMath/DirectXSH.cpp @@ -1,4868 +1,4868 @@ -//------------------------------------------------------------------------------------- -// DirectXSH.cpp -- C++ Spherical Harmonics Math Library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/p/?LinkId=262885 -//------------------------------------------------------------------------------------- - -#include "DirectXSH.h" -#include - -using namespace DirectX; - -#pragma warning( disable : 4619 4456 ) - -namespace -{ - #pragma prefast(disable:246, "generated code by maple (nested const variable definitions)") - - static const float fExtraNormFac[XM_SH_MAXORDER] = { 2.0f*sqrtf(XM_PI), 2.0f/3.0f*sqrtf(3.0f*XM_PI), 2.0f/5.0f*sqrtf(5.0f*XM_PI), 2.0f/7.0f*sqrtf(7.0f*XM_PI), 2.0f/3.0f*sqrtf(XM_PI), 2.0f/11.0f*sqrtf(11.0f*XM_PI) }; - - // computes the integral of a constant function over a solid angular - // extent. No error checking - only used internaly. This function - // only returns the Yl0 coefficients, since the rest are zero for - // circularly symmetric functions. - static const float ComputeCapInt_t1 = sqrtf(0.3141593E1f); - static const float ComputeCapInt_t5 = sqrtf(3.0f); - static const float ComputeCapInt_t11 = sqrtf(5.0f); - static const float ComputeCapInt_t18 = sqrtf(7.0f); - static const float ComputeCapInt_t32 = sqrtf(11.0f); - - static inline void ComputeCapInt(const size_t order, float angle, float *pR) - { - const float t2 = cosf(angle); - const float t3 = ComputeCapInt_t1*t2; - const float t7 = sinf(angle); - const float t8 = t7*t7; - - - pR[0] = -t3+ComputeCapInt_t1; - pR[1] = ComputeCapInt_t5*ComputeCapInt_t1*t8/2.0f; - - if (order > 2) - { - const float t13 = t2*t2; - - pR[2] = -ComputeCapInt_t11*ComputeCapInt_t1*t2*(t13-1.0f)/2.0f; - if (order > 3) - { - const float t19 = ComputeCapInt_t18*ComputeCapInt_t1; - const float t20 = t13*t13; - - pR[3] = -5.0f/8.0f*t19*t20+3.0f/4.0f*t19*t13-t19/8.0f; - if (order > 4) - { - - - pR[4] = -3.0f/8.0f*t3*(7.0f*t20-10.0f*t13+3.0f); - if (order > 5) - { - const float t33 = ComputeCapInt_t32*ComputeCapInt_t1; - pR[5] = -21.0f/16.0f*t33*t20*t13+35.0f/16.0f*t33*t20-15.0f/16.0f*t33*t13+t33/16.0f; - } - } - } - } - } - - // input pF only consists of Yl0 values, normalizes coefficients for directional - // lights. - static inline float CosWtInt(const size_t order) - { - const float fCW0 = 0.25f; - const float fCW1 = 0.5f; - const float fCW2 = 5.0f/16.0f; - //const float fCW3 = 0.0f; - const float fCW4 = -3.0f/32.0f; - //const float fCW5 = 0.0f; - - // order has to be at least linear... - - float fRet = fCW0 + fCW1; - - if (order > 2) fRet += fCW2; - if (order > 4) fRet += fCW4; - - // odd degrees >= 3 evaluate to zero integrated against cosine... - - return fRet; - } - - static const float SHEvalHemisphereLight_fSqrtPi = sqrtf(XM_PI); - static const float SHEvalHemisphereLight_fSqrtPi3 = sqrtf(XM_PI/3.0f); - - typedef float REAL; - #define CONSTANT(x) (x ## f) - - // routine generated programmatically for evaluating SH basis for degree 1 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - inline static void sh_eval_basis_1(REAL x,REAL y,REAL z,REAL b[4]) - { - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - } - - // routine generated programmatically for evaluating SH basis for degree 2 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). - // - inline static void sh_eval_basis_2(REAL x,REAL y,REAL z,REAL b[9]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - } - - // routine generated programmatically for evaluating SH basis for degree 3 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - static void sh_eval_basis_3(REAL x,REAL y,REAL z,REAL b[16]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - // l=3 - const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); - b[ 12] = p_3_0; // l=3,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - // l=3 - const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); - b[ 11] = p_3_1*s1; // l=3,m=-1 - b[ 13] = p_3_1*c1; // l=3,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - // l=3 - const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; - b[ 10] = p_3_2*s2; // l=3,m=-2 - b[ 14] = p_3_2*c2; // l=3,m=+2 - - - /* m=3 */ - - const REAL s3 = x*s2 + y*c2; - const REAL c3 = x*c2 - y*s2; - - // l=3 - const REAL p_3_3 = CONSTANT(-0.590043589926643520); - b[ 9] = p_3_3*s3; // l=3,m=-3 - b[ 15] = p_3_3*c3; // l=3,m=+3 - } - - // routine generated programmatically for evaluating SH basis for degree 4 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - static void sh_eval_basis_4(REAL x,REAL y,REAL z,REAL b[25]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - // l=3 - const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); - b[ 12] = p_3_0; // l=3,m=0 - // l=4 - const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; - b[ 20] = p_4_0; // l=4,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - // l=3 - const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); - b[ 11] = p_3_1*s1; // l=3,m=-1 - b[ 13] = p_3_1*c1; // l=3,m=+1 - // l=4 - const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); - b[ 19] = p_4_1*s1; // l=4,m=-1 - b[ 21] = p_4_1*c1; // l=4,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - // l=3 - const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; - b[ 10] = p_3_2*s2; // l=3,m=-2 - b[ 14] = p_3_2*c2; // l=3,m=+2 - // l=4 - const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); - b[ 18] = p_4_2*s2; // l=4,m=-2 - b[ 22] = p_4_2*c2; // l=4,m=+2 - - - /* m=3 */ - - const REAL s3 = x*s2 + y*c2; - const REAL c3 = x*c2 - y*s2; - - // l=3 - const REAL p_3_3 = CONSTANT(-0.590043589926643520); - b[ 9] = p_3_3*s3; // l=3,m=-3 - b[ 15] = p_3_3*c3; // l=3,m=+3 - // l=4 - const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; - b[ 17] = p_4_3*s3; // l=4,m=-3 - b[ 23] = p_4_3*c3; // l=4,m=+3 - - - /* m=4 */ - - const REAL s4 = x*s3 + y*c3; - const REAL c4 = x*c3 - y*s3; - - // l=4 - const REAL p_4_4 = CONSTANT(0.625835735449176030); - b[ 16] = p_4_4*s4; // l=4,m=-4 - b[ 24] = p_4_4*c4; // l=4,m=+4 - } - - // routine generated programmatically for evaluating SH basis for degree 5 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - static void sh_eval_basis_5(REAL x,REAL y,REAL z,REAL b[36]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - // l=3 - const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); - b[ 12] = p_3_0; // l=3,m=0 - // l=4 - const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; - b[ 20] = p_4_0; // l=4,m=0 - // l=5 - const REAL p_5_0 = CONSTANT(1.989974874213239700)*z*p_4_0 + CONSTANT(-1.002853072844814000)*p_3_0; - b[ 30] = p_5_0; // l=5,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - // l=3 - const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); - b[ 11] = p_3_1*s1; // l=3,m=-1 - b[ 13] = p_3_1*c1; // l=3,m=+1 - // l=4 - const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); - b[ 19] = p_4_1*s1; // l=4,m=-1 - b[ 21] = p_4_1*c1; // l=4,m=+1 - // l=5 - const REAL p_5_1 = CONSTANT(2.031009601158990200)*z*p_4_1 + CONSTANT(-0.991031208965114650)*p_3_1; - b[ 29] = p_5_1*s1; // l=5,m=-1 - b[ 31] = p_5_1*c1; // l=5,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - // l=3 - const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; - b[ 10] = p_3_2*s2; // l=3,m=-2 - b[ 14] = p_3_2*c2; // l=3,m=+2 - // l=4 - const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); - b[ 18] = p_4_2*s2; // l=4,m=-2 - b[ 22] = p_4_2*c2; // l=4,m=+2 - // l=5 - const REAL p_5_2 = z*(CONSTANT(7.190305177459987500)*z2 + CONSTANT(-2.396768392486662100)); - b[ 28] = p_5_2*s2; // l=5,m=-2 - b[ 32] = p_5_2*c2; // l=5,m=+2 - - - /* m=3 */ - - const REAL s3 = x*s2 + y*c2; - const REAL c3 = x*c2 - y*s2; - - // l=3 - const REAL p_3_3 = CONSTANT(-0.590043589926643520); - b[ 9] = p_3_3*s3; // l=3,m=-3 - b[ 15] = p_3_3*c3; // l=3,m=+3 - // l=4 - const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; - b[ 17] = p_4_3*s3; // l=4,m=-3 - b[ 23] = p_4_3*c3; // l=4,m=+3 - // l=5 - const REAL p_5_3 = CONSTANT(-4.403144694917253700)*z2 + CONSTANT(0.489238299435250430); - b[ 27] = p_5_3*s3; // l=5,m=-3 - b[ 33] = p_5_3*c3; // l=5,m=+3 - - - /* m=4 */ - - const REAL s4 = x*s3 + y*c3; - const REAL c4 = x*c3 - y*s3; - - // l=4 - const REAL p_4_4 = CONSTANT(0.625835735449176030); - b[ 16] = p_4_4*s4; // l=4,m=-4 - b[ 24] = p_4_4*c4; // l=4,m=+4 - // l=5 - const REAL p_5_4 = CONSTANT(2.075662314881041100)*z; - b[ 26] = p_5_4*s4; // l=5,m=-4 - b[ 34] = p_5_4*c4; // l=5,m=+4 - - - /* m=5 */ - - const REAL s5 = x*s4 + y*c4; - const REAL c5 = x*c4 - y*s4; - - // l=5 - const REAL p_5_5 = CONSTANT(-0.656382056840170150); - b[ 25] = p_5_5*s5; // l=5,m=-5 - b[ 35] = p_5_5*c5; // l=5,m=+5 - } - - static const REAL M_PIjs = (REAL) (4.0*atan(1.0)); - static const REAL maxang = (REAL) (M_PIjs/2); - static const int NSH0 = 1; - static const int NSH1 = 4; 
- static const int NSH2 = 9; - static const int NSH3 = 16; - static const int NSH4 = 25; - static const int NSH5 = 36; - static const int NSH6 = 49; - static const int NSH7 = 64; - static const int NSH8 = 81; - static const int NSH9 = 100; - static const int NL0 = 1; - static const int NL1 = 3; - static const int NL2 = 5; - static const int NL3 = 7; - static const int NL4 = 9; - static const int NL5 = 11; - static const int NL6 = 13; - static const int NL7 = 15; - static const int NL8 = 17; - static const int NL9 = 19; - - static inline void rot(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) - { - xout = x*ct - y*st; - yout = y*ct + x*st; - } - - static inline void rot_inv(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) - { - xout = x*ct + y*st; - yout = y*ct - x*st; - } - - static inline void rot_1(REAL ct,REAL st,REAL ctm[1],REAL stm[1]) - { - ctm[0] = ct; - stm[0] = st; - } - - static inline void rot_2(REAL ct,REAL st,REAL ctm[2],REAL stm[2]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - } - - static inline void rot_3(REAL ct,REAL st,REAL ctm[3],REAL stm[3]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - ctm[2] = ct2*ctm[1] - ct; - stm[2] = ct2*stm[1] - st; - } - - static inline void rot_4(REAL ct,REAL st,REAL ctm[4],REAL stm[4]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - ctm[2] = ct2*ctm[1] - ct; - stm[2] = ct2*stm[1] - st; - ctm[3] = ct2*ctm[2] - ctm[1]; - stm[3] = ct2*stm[2] - stm[1]; - } - - static inline void rot_5(REAL ct,REAL st,REAL ctm[5],REAL stm[5]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - ctm[2] = ct2*ctm[1] - ct; - stm[2] = ct2*stm[1] - st; - ctm[3] = ct2*ctm[2] - ctm[1]; - stm[3] = ct2*stm[2] - stm[1]; - ctm[4] = ct2*ctm[3] - ctm[2]; - stm[4] = ct2*stm[3] - stm[2]; - } - - static inline void sh_rotz_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) - { - yr[1] = y[1]; - rot_inv(ctm[0],stm[0],y[0],y[2],yr[0],yr[2]); - } - - static inline void sh_rotz_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) - { - yr[2] = y[2]; - rot_inv(ctm[0],stm[0],y[1],y[3],yr[1],yr[3]); - rot_inv(ctm[1],stm[1],y[0],y[4],yr[0],yr[4]); - } - - static inline void sh_rotz_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) - { - yr[3] = y[3]; - rot_inv(ctm[0],stm[0],y[2],y[4],yr[2],yr[4]); - rot_inv(ctm[1],stm[1],y[1],y[5],yr[1],yr[5]); - rot_inv(ctm[2],stm[2],y[0],y[6],yr[0],yr[6]); - } - - static inline void sh_rotz_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) - { - yr[4] = y[4]; - rot_inv(ctm[0],stm[0],y[3],y[5],yr[3],yr[5]); - rot_inv(ctm[1],stm[1],y[2],y[6],yr[2],yr[6]); - rot_inv(ctm[2],stm[2],y[1],y[7],yr[1],yr[7]); - rot_inv(ctm[3],stm[3],y[0],y[8],yr[0],yr[8]); - } - - static inline void sh_rotz_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) - { - yr[5] = y[5]; - rot_inv(ctm[0],stm[0],y[4],y[6],yr[4],yr[6]); - rot_inv(ctm[1],stm[1],y[3],y[7],yr[3],yr[7]); - rot_inv(ctm[2],stm[2],y[2],y[8],yr[2],yr[8]); - rot_inv(ctm[3],stm[3],y[1],y[9],yr[1],yr[9]); - rot_inv(ctm[4],stm[4],y[0],y[10],yr[0],yr[10]); - } - - // rotation code generated programmatically by rotatex (2000x4000 samples, eps=1e-008) - - static REAL fx_1_001 = (REAL) ( sqrt(1.0)/1.0); // 1 - static REAL fx_1_002 = (REAL) (-sqrt(1.0)/1.0); // -1.00000030843 - - static inline void sh_rotx90_1(REAL y[],REAL yr[]) - 
{ - yr[ 0] = fx_1_001*y[ 1]; - yr[ 1] = fx_1_002*y[ 0]; - yr[ 2] = fx_1_001*y[ 2]; - }; - - static inline void sh_rotx90_inv_1(REAL y[],REAL yr[]) - { - yr[ 0] = fx_1_002*y[ 1]; - yr[ 1] = fx_1_001*y[ 0]; - yr[ 2] = fx_1_001*y[ 2]; - } - - static REAL fx_2_001 = (REAL) ( sqrt(4.0)/2.0); // 1 - static REAL fx_2_002 = (REAL) (-sqrt(4.0)/2.0); // -1 - static REAL fx_2_003 = (REAL) (-sqrt(1.0)/2.0); // -0.500000257021 - static REAL fx_2_004 = (REAL) (-sqrt(3.0)/2.0); // -0.866025848959 - static REAL fx_2_005 = (REAL) ( sqrt(1.0)/2.0); // 0.5 - - static inline void sh_rotx90_2(REAL y[],REAL yr[]) - { - yr[ 0] = fx_2_001*y[ 3]; - yr[ 1] = fx_2_002*y[ 1]; - yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; - yr[ 3] = fx_2_002*y[ 0]; - yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; - }; - - static inline void sh_rotx90_inv_2(REAL y[],REAL yr[]) - { - yr[ 0] = fx_2_002*y[ 3]; - yr[ 1] = fx_2_002*y[ 1]; - yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; - yr[ 3] = fx_2_001*y[ 0]; - yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; - } - - static REAL fx_3_001 = (REAL) (-sqrt(10.0)/4.0); // -0.790569415042 - static REAL fx_3_002 = (REAL) ( sqrt(6.0)/4.0); // 0.612372435696 - static REAL fx_3_003 = (REAL) (-sqrt(16.0)/4.0); // -1 - static REAL fx_3_004 = (REAL) (-sqrt(6.0)/4.0); // -0.612372435695 - static REAL fx_3_005 = (REAL) (-sqrt(1.0)/4.0); // -0.25 - static REAL fx_3_006 = (REAL) (-sqrt(15.0)/4.0); // -0.968245836551 - static REAL fx_3_007 = (REAL) ( sqrt(1.0)/4.0); // 0.25 - static REAL fx_3_008 = (REAL) ( sqrt(10.0)/4.0); // 0.790569983984 - - static inline void sh_rotx90_3(REAL y[],REAL yr[]) - { - yr[ 0] = fx_3_001*y[ 3]+fx_3_002*y[ 5]; - yr[ 1] = fx_3_003*y[ 1]; - yr[ 2] = fx_3_004*y[ 3]+fx_3_001*y[ 5]; - yr[ 3] = fx_3_008*y[ 0]+fx_3_002*y[ 2]; - yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; - yr[ 5] = fx_3_004*y[ 0]-fx_3_001*y[ 2]; - yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; - }; - - static inline void sh_rotx90_inv_3(REAL y[],REAL yr[]) - { - yr[ 0] = fx_3_008*y[ 3]+fx_3_004*y[ 5]; - yr[ 1] = fx_3_003*y[ 1]; - yr[ 2] = fx_3_002*y[ 3]-fx_3_001*y[ 5]; - yr[ 3] = fx_3_001*y[ 0]+fx_3_004*y[ 2]; - yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; - yr[ 5] = fx_3_002*y[ 0]+fx_3_001*y[ 2]; - yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; - } - - static REAL fx_4_001 = (REAL) (-sqrt(56.0)/8.0); // -0.935414346694 - static REAL fx_4_002 = (REAL) ( sqrt(8.0)/8.0); // 0.353553390593 - static REAL fx_4_003 = (REAL) (-sqrt(36.0)/8.0); // -0.75 - static REAL fx_4_004 = (REAL) ( sqrt(28.0)/8.0); // 0.661437827766 - static REAL fx_4_005 = (REAL) (-sqrt(8.0)/8.0); // -0.353553390593 - static REAL fx_4_006 = (REAL) ( sqrt(36.0)/8.0); // 0.749999999999 - static REAL fx_4_007 = (REAL) ( sqrt(9.0)/8.0); // 0.37500034698 - static REAL fx_4_008 = (REAL) ( sqrt(20.0)/8.0); // 0.559017511622 - static REAL fx_4_009 = (REAL) ( sqrt(35.0)/8.0); // 0.739510657141 - static REAL fx_4_010 = (REAL) ( sqrt(16.0)/8.0); // 0.5 - static REAL fx_4_011 = (REAL) (-sqrt(28.0)/8.0); // -0.661437827766 - static REAL fx_4_012 = (REAL) ( sqrt(1.0)/8.0); // 0.125 - static REAL fx_4_013 = (REAL) ( sqrt(56.0)/8.0); // 0.935414346692 - - static inline void sh_rotx90_4(REAL y[],REAL yr[]) - { - yr[ 0] = fx_4_001*y[ 5]+fx_4_002*y[ 7]; - yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; - yr[ 2] = fx_4_005*y[ 5]+fx_4_001*y[ 7]; - yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; - yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; - yr[ 5] = fx_4_013*y[ 0]+fx_4_002*y[ 2]; - yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; - yr[ 7] = fx_4_005*y[ 0]-fx_4_001*y[ 2]; - yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 
6]+fx_4_012*y[ 8]; - }; - - static inline void sh_rotx90_inv_4(REAL y[],REAL yr[]) - { - yr[ 0] = fx_4_013*y[ 5]+fx_4_005*y[ 7]; - yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; - yr[ 2] = fx_4_002*y[ 5]-fx_4_001*y[ 7]; - yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; - yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; - yr[ 5] = fx_4_001*y[ 0]+fx_4_005*y[ 2]; - yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; - yr[ 7] = fx_4_002*y[ 0]+fx_4_001*y[ 2]; - yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 6]+fx_4_012*y[ 8]; - } - - static REAL fx_5_001 = (REAL) ( sqrt(126.0)/16.0); // 0.70156076002 - static REAL fx_5_002 = (REAL) (-sqrt(120.0)/16.0); // -0.684653196882 - static REAL fx_5_003 = (REAL) ( sqrt(10.0)/16.0); // 0.197642353761 - static REAL fx_5_004 = (REAL) (-sqrt(64.0)/16.0); // -0.5 - static REAL fx_5_005 = (REAL) ( sqrt(192.0)/16.0); // 0.866025403784 - static REAL fx_5_006 = (REAL) ( sqrt(70.0)/16.0); // 0.522912516584 - static REAL fx_5_007 = (REAL) ( sqrt(24.0)/16.0); // 0.306186217848 - static REAL fx_5_008 = (REAL) (-sqrt(162.0)/16.0); // -0.795495128835 - static REAL fx_5_009 = (REAL) ( sqrt(64.0)/16.0); // 0.5 - static REAL fx_5_010 = (REAL) ( sqrt(60.0)/16.0); // 0.484122918274 - static REAL fx_5_011 = (REAL) ( sqrt(112.0)/16.0); // 0.661437827763 - static REAL fx_5_012 = (REAL) ( sqrt(84.0)/16.0); // 0.572821961867 - static REAL fx_5_013 = (REAL) ( sqrt(4.0)/16.0); // 0.125 - static REAL fx_5_014 = (REAL) ( sqrt(42.0)/16.0); // 0.405046293649 - static REAL fx_5_015 = (REAL) ( sqrt(210.0)/16.0); // 0.905711046633 - static REAL fx_5_016 = (REAL) ( sqrt(169.0)/16.0); // 0.8125 - static REAL fx_5_017 = (REAL) (-sqrt(45.0)/16.0); // -0.419262745781 - static REAL fx_5_018 = (REAL) ( sqrt(1.0)/16.0); // 0.0625 - static REAL fx_5_019 = (REAL) (-sqrt(126.0)/16.0); // -0.701561553415 - static REAL fx_5_020 = (REAL) ( sqrt(120.0)/16.0); // 0.684653196881 - static REAL fx_5_021 = (REAL) (-sqrt(10.0)/16.0); // -0.197642353761 - static REAL fx_5_022 = (REAL) (-sqrt(70.0)/16.0); // -0.522913107945 - static REAL fx_5_023 = (REAL) (-sqrt(60.0)/16.0); // -0.48412346577 - - static inline void sh_rotx90_5(REAL y[],REAL yr[]) - { - yr[ 0] = fx_5_001*y[ 5]+fx_5_002*y[ 7]+fx_5_003*y[ 9]; - yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; - yr[ 2] = fx_5_006*y[ 5]+fx_5_007*y[ 7]+fx_5_008*y[ 9]; - yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; - yr[ 4] = fx_5_010*y[ 5]+fx_5_011*y[ 7]+fx_5_012*y[ 9]; - yr[ 5] = fx_5_019*y[ 0]+fx_5_022*y[ 2]+fx_5_023*y[ 4]; - yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; - yr[ 7] = fx_5_020*y[ 0]-fx_5_007*y[ 2]-fx_5_011*y[ 4]; - yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; - yr[ 9] = fx_5_021*y[ 0]-fx_5_008*y[ 2]-fx_5_012*y[ 4]; - yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; - }; - - static inline void sh_rotx90_inv_5(REAL y[],REAL yr[]) - { - yr[ 0] = fx_5_019*y[ 5]+fx_5_020*y[ 7]+fx_5_021*y[ 9]; - yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; - yr[ 2] = fx_5_022*y[ 5]-fx_5_007*y[ 7]-fx_5_008*y[ 9]; - yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; - yr[ 4] = fx_5_023*y[ 5]-fx_5_011*y[ 7]-fx_5_012*y[ 9]; - yr[ 5] = fx_5_001*y[ 0]+fx_5_006*y[ 2]+fx_5_010*y[ 4]; - yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; - yr[ 7] = fx_5_002*y[ 0]+fx_5_007*y[ 2]+fx_5_011*y[ 4]; - yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; - yr[ 9] = fx_5_003*y[ 0]+fx_5_008*y[ 2]+fx_5_012*y[ 4]; - yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; - } - - static inline void sh_rot_1(REAL m[3*3],REAL y[NL1],REAL yr[NL1]) - { - REAL yr0 = m[4]*y[0] - m[5]*y[1] + m[3]*y[2]; - REAL 
yr1 = m[8]*y[1] - m[7]*y[0] - m[6]*y[2]; - REAL yr2 = m[1]*y[0] - m[2]*y[1] + m[0]*y[2]; - - yr[0] = yr0; - yr[1] = yr1; - yr[2] = yr2; - } - - static inline void sh_roty_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) - { - yr[0] = y[0]; - rot_inv(ctm[0],stm[0],y[1],y[2],yr[1],yr[2]); - } - - static inline void sh_roty_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) - { - REAL ytmp[NL2]; - sh_rotx90_2(y,yr); - sh_rotz_2(ctm,stm,yr,ytmp); - sh_rotx90_inv_2(ytmp,yr); - } - - static inline void sh_roty_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) - { - REAL ytmp[NL3]; - sh_rotx90_3(y,yr); - sh_rotz_3(ctm,stm,yr,ytmp); - sh_rotx90_inv_3(ytmp,yr); - } - - static inline void sh_roty_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) - { - REAL ytmp[NL4]; - sh_rotx90_4(y,yr); - sh_rotz_4(ctm,stm,yr,ytmp); - sh_rotx90_inv_4(ytmp,yr); - } - - static inline void sh_roty_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) - { - REAL ytmp[NL5]; - sh_rotx90_5(y,yr); - sh_rotz_5(ctm,stm,yr,ytmp); - sh_rotx90_inv_5(ytmp,yr); - } - - #define ROT_TOL CONSTANT(1e-4) - - /* - Finds cosine,sine pairs for zyz rotation (i.e. rotation R_z2 R_y R_z1 v). - The rotation is one which maps mx to (1,0,0) and mz to (0,0,1). - */ - static inline void zyz(REAL m[3*3],REAL &zc1,REAL &zs1,REAL &yc,REAL &ys,REAL &zc2,REAL &zs2) - { - REAL cz = m[8]; - - // rotate so that (cx,cy,0) aligns to (1,0,0) - REAL cxylen = (REAL) sqrtf(1.0f - cz*cz); - if (cxylen >= ROT_TOL) - { - // if above is a NaN, will do the correct thing - yc = cz; - ys = cxylen; - REAL len67inv = 1.0f/sqrtf(m[6]*m[6] + m[7]*m[7]); - zc1 = -m[6]*len67inv; - zs1 = m[7]*len67inv; - REAL len25inv = 1.0f/sqrtf(m[2]*m[2] + m[5]*m[5]); - zc2 = m[2]*len25inv; - zs2 = m[5]*len25inv; - } else { // m[6],m[7],m[8] already aligned to (0,0,1) - zc1 = 1.0; zs1 = 0.0; // identity - yc = cz; ys = 0.0; // identity - zc2 = m[0]*cz; zs2 = -m[1]; // align x axis (mx[0],mx[1],0) to (1,0,0) - } - } - - static inline void sh_rotzyz_2(REAL zc1m[2],REAL zs1m[2],REAL ycm[2],REAL ysm[2],REAL zc2m[2],REAL zs2m[2],REAL y[NL2],REAL yr[NL2]) - { - REAL ytmp[NL2]; - sh_rotz_2(zc1m,zs1m,y,yr); - sh_roty_2(ycm,ysm,yr,ytmp); - sh_rotz_2(zc2m,zs2m,ytmp,yr); - } - - static inline void sh_rotzyz_3(REAL zc1m[3],REAL zs1m[3],REAL ycm[3],REAL ysm[3],REAL zc2m[3],REAL zs2m[3],REAL y[NL3],REAL yr[NL3]) - { - REAL ytmp[NL3]; - sh_rotz_3(zc1m,zs1m,y,yr); - sh_roty_3(ycm,ysm,yr,ytmp); - sh_rotz_3(zc2m,zs2m,ytmp,yr); - } - - static inline void sh_rotzyz_4(REAL zc1m[4],REAL zs1m[4],REAL ycm[4],REAL ysm[4],REAL zc2m[4],REAL zs2m[4],REAL y[NL4],REAL yr[NL4]) - { - REAL ytmp[NL4]; - sh_rotz_4(zc1m,zs1m,y,yr); - sh_roty_4(ycm,ysm,yr,ytmp); - sh_rotz_4(zc2m,zs2m,ytmp,yr); - } - - static inline void sh_rotzyz_5(REAL zc1m[5],REAL zs1m[5],REAL ycm[5],REAL ysm[5],REAL zc2m[5],REAL zs2m[5],REAL y[NL5],REAL yr[NL5]) - { - REAL ytmp[NL5]; - sh_rotz_5(zc1m,zs1m,y,yr); - sh_roty_5(ycm,ysm,yr,ytmp); - sh_rotz_5(zc2m,zs2m,ytmp,yr); - } - - static inline void sh3_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH3],REAL yr[NSH3]) - { - REAL zc1m[3],zs1m[3]; - rot_3(zc1,zs1,zc1m,zs1m); - REAL ycm[3],ysm[3]; - rot_3(yc,ys,ycm,ysm); - REAL zc2m[3],zs2m[3]; - rot_3(zc2,zs2,zc2m,zs2m); - - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); - sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); - } - - static inline void sh4_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH4],REAL yr[NSH4]) - { - REAL 
zc1m[4],zs1m[4]; - rot_4(zc1,zs1,zc1m,zs1m); - REAL ycm[4],ysm[4]; - rot_4(yc,ys,ycm,ysm); - REAL zc2m[4],zs2m[4]; - rot_4(zc2,zs2,zc2m,zs2m); - - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); - sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); - sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); - } - - static inline void sh5_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH5],REAL yr[NSH5]) - { - REAL zc1m[5],zs1m[5]; - rot_5(zc1,zs1,zc1m,zs1m); - REAL ycm[5],ysm[5]; - rot_5(yc,ys,ycm,ysm); - REAL zc2m[5],zs2m[5]; - rot_5(zc2,zs2,zc2m,zs2m); - - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); - sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); - sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); - sh_rotzyz_5(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH4,yr+NSH4); - } - - inline void sh1_rot(REAL m[3*3],REAL y[NSH1],REAL yr[NSH1]) - { - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - } - - inline void sh3_rot(REAL m[3*3],REAL y[NSH3],REAL yr[NSH3]) - { - REAL zc1,zs1,yc,ys,zc2,zs2; - zyz(m,zc1,zs1,yc,ys,zc2,zs2); - sh3_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); - } - - inline void sh4_rot(REAL m[3*3],REAL y[NSH4],REAL yr[NSH4]) - { - REAL zc1,zs1,yc,ys,zc2,zs2; - zyz(m,zc1,zs1,yc,ys,zc2,zs2); - sh4_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); - } - - inline void sh5_rot(REAL m[3*3],REAL y[NSH5],REAL yr[NSH5]) - { - REAL zc1,zs1,yc,ys,zc2,zs2; - zyz(m,zc1,zs1,yc,ys,zc2,zs2); - sh5_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); - } - - // simple matrix vector multiply for a square matrix (only used by ZRotation) - static inline void SimpMatMul(size_t dim, const float *matrix, const float *input, float *result) - { - for(size_t iR=0; iR < dim; ++iR) - { - result[iR + 0] = matrix[iR*dim + 0] * input[0]; - for(size_t iC=1; iC < dim; ++iC) - { - result[iR] += matrix[iR*dim+ iC] * input[iC]; - } - } - } - -}; // anonymous namespace - - -namespace DirectX -{ - -//------------------------------------------------------------------------------------- -// Evaluates the Spherical Harmonic basis functions -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205448.aspx -//------------------------------------------------------------------------------------- -float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_ FXMVECTOR dir ) -{ - if ( !result ) - return nullptr; - - XMFLOAT4A dv; - XMStoreFloat4A( &dv, dir ); - - const float fX = dv.x; - const float fY = dv.y; - const float fZ = dv.z; - - switch( order ) - { - case 2: - sh_eval_basis_1(fX,fY,fZ,result); - break; - - case 3: - sh_eval_basis_2(fX,fY,fZ,result); - break; - - case 4: - sh_eval_basis_3(fX,fY,fZ,result); - break; - - case 5: - sh_eval_basis_4(fX,fY,fZ,result); - break; - - case 6: - sh_eval_basis_5(fX,fY,fZ,result); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return nullptr; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Rotates SH vector by a rotation matrix -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204992.aspx -//------------------------------------------------------------------------------------- -float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_ FXMMATRIX rotMatrix, - _In_reads_(order*order) const float *input ) -{ - if ( !result || !input ) - return nullptr; - 
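- // Illustrative usage sketch (an assumption for this note, not part of the
- // original source; 'sh' and 'rotated' are hypothetical caller-side buffers):
- //
- //   float sh[9];                              // e.g. filled by XMSHEvalDirection
- //   float rotated[9];
- //   XMMATRIX rot = XMMatrixRotationY( XM_PIDIV2 );
- //   if ( !XMSHRotate( rotated, 3, rot, sh ) )
- //       { /* nullptr return: bad order or aliased/null pointers */ }
- //
- // Rotation cannot be done in place; each output coefficient mixes several
- // input coefficients, so the aliasing check below rejects result == input.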
- if( result == input ) - return nullptr; - - XMFLOAT3X3 mat; - XMStoreFloat3x3( &mat, rotMatrix ); - - float mRot[3*3]; - const float r00 = mRot[0*3 +0] = mat._11; - const float r10 = mRot[1*3 +0] = mat._12; - const float r20 = mRot[2*3 +0] = mat._13; - - const float r01 = mRot[0*3 +1] = mat._21; - const float r11 = mRot[1*3 +1] = mat._22; - const float r21 = mRot[2*3 +1] = mat._23; - - const float r02 = mRot[0*3 +2] = mat._31; - const float r12 = mRot[1*3 +2] = mat._32; - const float r22 = mRot[2*3 +2] = mat._33; - - result[0] = input[0]; // rotate the constant term - - switch (order) - { - case 2: - { - // do linear by hand... - - result[1] = r11*input[1] - r12*input[2] + r10*input[3]; - result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; - result[3] = r01*input[1] - r02*input[2] + r00*input[3]; - } - break; - - case 3: - { - float R[25]; - // do linear by hand... - - result[1] = r11*input[1] - r12*input[2] + r10*input[3]; - result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; - result[3] = r01*input[1] - r02*input[2] + r00*input[3]; - - // direct code for quadratics is faster than ZYZ recurrence relations - - const float t41 = r01 * r00; - const float t43 = r11 * r10; - const float t48 = r11 * r12; - const float t50 = r01 * r02; - const float t55 = r02 * r02; - const float t57 = r22 * r22; - const float t58 = r12 * r12; - const float t61 = r00 * r02; - const float t63 = r10 * r12; - const float t68 = r10 * r10; - const float t70 = r01 * r01; - const float t72 = r11 * r11; - const float t74 = r00 * r00; - const float t76 = r21 * r21; - const float t78 = r20 * r20; - - const float v173 = 0.1732050808e1f; - const float v577 = 0.5773502693e0f; - const float v115 = 0.1154700539e1f; - const float v288 = 0.2886751347e0f; - const float v866 = 0.8660254040e0f; - - R[0] = r11 * r00 + r01 * r10; - R[1] = - r01 * r12 - r11 * r02; - R[2] = v173 * r02 * r12; - R[3] = - r10 * r02 - r00 * r12; - R[4] = r00 * r10 - r01 * r11; - R[5] = - r11 * r20 - r21 * r10; - R[6] = r11 * r22 + r21 * r12; - R[7] = -v173 * r22 * r12; - R[8] = r20 * r12 + r10 * r22; - R[9] = - r10 * r20 + r11 * r21; - R[10] = - v577* (t41 + t43) + v115 * r21 * r20; - R[11] = v577* (t48 + t50) - v115 * r21 * r22; - R[12] = -0.5000000000e0f * (t55 + t58) + t57; - R[13] = v577 * (t61 + t63) - v115 * r20 * r22; - R[14] = v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78); - R[15] = - r01 * r20 - r21 * r00; - R[16] = r01 * r22 + r21 * r02; - R[17] = -v173 * r22 * r02; - R[18] = r00 * r22 + r20 * r02; - R[19] = - r00 * r20 + r01 * r21; - R[20] = t41 - t43; - R[21] = - t50 + t48; - R[22] = v866 * (t55 - t58); - R[23] = t63 - t61; - R[24] = 0.5000000000e0f *( t74 - t68 - t70 + t72); - - // blow the matrix multiply out by hand; looping is inefficient on a P4...
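- // The loop below applies the dense 5x5 matrix R to the five quadratic (l=2)
- // coefficients, i.e. it is the hand-unrolled equivalent of the sketch
- //
- //   SimpMatMul( 5, R, input + 4, result + 4 );
- //
- // (SimpMatMul is the generic square matrix/vector multiply defined above),
- // with each row's dot product written out explicitly.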
- for(unsigned int iR=0; iR<5;iR++) - { - const unsigned int uBase = iR*5; - result[4 + iR] = R[uBase + 0]*input[4] + R[uBase + 1]*input[5] + R[uBase + 2]*input[6] + R[uBase + 3]*input[7] + R[uBase + 4]*input[8]; - } - } - break; - - case 4: - sh3_rot(mRot,const_cast<float*>(input),result); - break; - - case 5: - sh4_rot(mRot,const_cast<float*>(input),result); - break; - - case 6: - sh5_rot(mRot,const_cast<float*>(input),result); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return nullptr; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Rotates the SH vector about the Z axis by an angle -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205461.aspx -//------------------------------------------------------------------------------------- -float* XMSHRotateZ( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_ float angle, - _In_reads_(order*order) const float *input ) -{ - if ( !result || !input ) - return nullptr; - - if( result == input ) - return nullptr; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return nullptr; - - float R[(2*(XM_SH_MAXORDER-1) + 1)*(2*(XM_SH_MAXORDER-1) + 1)]; // used to store rotation matrices... - - // these are actually very sparse matrices; most of the entries are zeros... - - const float ca = cosf(angle); - const float sa = sinf(angle); - - const float t1 = ca; - const float t2 = sa; - R[0] = t1; - R[1] = 0.0f; - R[2] = t2; - R[3] = 0.0f; - R[4] = 1.0f; - R[5] = 0.0f; - R[6] = -t2; - R[7] = 0.0f; - R[8] = t1; - - result[0] = input[0]; - SimpMatMul(3,R,input+1,result+1); - - if (order > 2) - { - for(int j=0;j<5*5;j++) R[j] = 0.0f; - const float t1 = sa; - const float t2 = t1*t1; - const float t3 = ca; - const float t4 = t3*t3; - const float t5 = -t2+t4; - const float t7 = 2.0f*t3*t1; - R[0] = t5; - R[4] = t7; - R[6] = t3; - R[8] = t1; - R[12] = 1.0f; - R[16] = -t1; - R[18] = t3; - R[20] = -t7; - R[24] = t5; - - SimpMatMul(5,R,input+4,result+4); // un-roll matrix/vector multiply - if (order > 3) - { - for(int j=0;j<7*7;j++) R[j] = 0.0f; - const float t1 = ca; - const float t2 = t1*t1; - const float t4 = sa; - const float t5 = t4*t4; - const float t8 = t2*t1-3.0f*t1*t5; - const float t12 = 3.0f*t4*t2-t5*t4; - const float t13 = -t5+t2; - const float t15 = 2.0f*t1*t4; - R[0] = t8; - R[6] = t12; - R[8] = t13; - R[12] = t15; - R[16] = t1; - R[18] = t4; - R[24] = 1.0f; - R[30] = -t4; - R[32] = t1; - R[36] = -t15; - R[40] = t13; - R[42] = -t12; - R[48] = t8; - SimpMatMul(7,R,input+9,result+9); - if (order > 4) - { - for(int j=0;j<9*9;j++) R[j] = 0.0f; - const float t1 = ca; - const float t2 = t1*t1; - const float t3 = t2*t2; - const float t4 = sa; - const float t5 = t4*t4; - const float t6 = t5*t5; - const float t9 = t3+t6-6.0f*t5*t2; - const float t10 = t5*t4; - const float t12 = t2*t1; - const float t14 = -t10*t1+t4*t12; - const float t17 = t12-3.0f*t1*t5; - const float t20 = 3.0f*t4*t2-t10; - const float t21 = -t5+t2; - const float t23 = 2.0f*t1*t4; - R[0] = t9; - R[8] = 4.0f*t14; - R[10] = t17; - R[16] = t20; - R[20] = t21; - R[24] = t23; - R[30] = t1; - R[32] = t4; - R[40] = 1.0f; - R[48] = -t4; - R[50] = t1; - R[56] = -t23; - R[60] = t21; - R[64] = -t20; - R[70] = t17; - R[72] = -4.0f*t14; - R[80] = t9; - - SimpMatMul(9,R,input+16,result+16); - if (order > 5) - { - for(int j=0;j<11*11;j++) R[j] = 0.0f; - const float t1 = ca; - const float t2 = sa; - const float t3 = t2*t2; - const float t4 = t3*t3; - const float t7 = t1*t1; -
const float t8 = t7*t1; - const float t11 = t7*t7; - const float t13 = 5.0f*t1*t4-10.0f*t3*t8+t11*t1; - const float t14 = t3*t2; - const float t20 = -10.0f*t14*t7+5.0f*t2*t11+t4*t2; - const float t23 = t11+t4-6.0f*t3*t7; - const float t26 = -t14*t1+t2*t8; - const float t29 = t8-3.0f*t1*t3; - const float t32 = 3.0f*t2*t7-t14; - const float t33 = -t3+t7; - const float t35 = 2.0f*t1*t2; - R[0] = t13; - R[10] = t20; - R[12] = t23; - R[20] = 4.0f*t26; - R[24] = t29; - R[30] = t32; - R[36] = t33; - R[40] = t35; - R[48] = t1; - R[50] = t2; - R[60] = 1.0f; - R[70] = -t2; - R[72] = t1; - R[80] = -t35; - R[84] = t33; - R[90] = -t32; - R[96] = t29; - R[100] = -4.0f*t26; - R[108] = t23; - R[110] = -t20; - R[120] = t13; - SimpMatMul(11,R,input+25,result+25); - } - } - } - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Adds two SH vectors, result[i] = inputA[i] + inputB[i]; -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205438.aspx -//------------------------------------------------------------------------------------- -float* XMSHAdd( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_reads_(order*order) const float *inputA, - _In_reads_(order*order) const float *inputB ) -{ - if ( !result || !inputA || !inputB ) - return nullptr; - - const size_t numcoeff = order*order; - - for( size_t i=0; i < numcoeff; ++i ) - { - result[i] = inputA[i] + inputB[i]; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Scales an SH vector, result[i] = input[i] * scale; -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204994.aspx -//------------------------------------------------------------------------------------- -float* XMSHScale( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_reads_(order*order) const float *input, - _In_ float scale ) -{ - if ( !result || !input ) - return nullptr; - - const size_t numcoeff = order*order; - - for( size_t i=0; i < numcoeff; ++i ) - { - result[i] = scale * input[i]; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Computes the dot product of two SH vectors -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205446.aspx -//------------------------------------------------------------------------------------- -float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ) -{ - if ( !inputA || !inputB ) - return 0.f; - - float result = inputA[0] * inputB[0]; - - const size_t numcoeff = order*order; - - for( size_t i=1; i < numcoeff; ++i ) - { - result += inputA[i] * inputB[i]; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Computes the product of two functions represented using SH (f and g), where: -// result[i] = int(y_i(s) * f(s) * g(s)), where y_i(s) is the ith SH basis -// function, f(s) and g(s) are SH functions (sum_i(y_i(s)*c_i)). The order O -// determines the lengths of the arrays, where there should always be O^2 -// coefficients. In general the product of two SH functions of order O generates -// an SH function of order 2*O - 1, but we truncate the result. This means -// that the product commutes (f*g == g*f) but doesn't associate -// (f*(g*h) != (f*g)*h).
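-// For example (an illustrative count, not from the original comment): two
-// order-3 inputs of 9 coefficients each have an exact product of order
-// 2*3 - 1 = 5, which would need 25 coefficients; XMSHMultiply3 keeps only the
-// first 9, and that truncation is what breaks associativity.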
-//------------------------------------------------------------------------------------- -float* XMSHMultiply( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_reads_(order*order) const float *inputF, - _In_reads_(order*order) const float *inputG ) -{ - switch( order ) - { - case 2: - return XMSHMultiply2( result, inputF, inputG ); - - case 3: - return XMSHMultiply3( result, inputF, inputG ); - - case 4: - return XMSHMultiply4( result, inputF, inputG ); - - case 5: - return XMSHMultiply5( result, inputF, inputG ); - - case 6: - return XMSHMultiply6( result, inputF, inputG ); - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return nullptr; - } -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205454.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply2( _Out_writes_(4) float *y, - _In_reads_(4) const float *f, - _In_reads_(4) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0, - tf = CONSTANT(0.282094791773000010)*f[0]; - tg = CONSTANT(0.282094791773000010)*g[0]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - - // [2,2]: 0, - tf = CONSTANT(0.282094795249000000)*f[0]; - tg = CONSTANT(0.282094795249000000)*g[0]; - y[2] = tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - - // [3,3]: 0, - tf = CONSTANT(0.282094791773000010)*f[0]; - tg = CONSTANT(0.282094791773000010)*g[0]; - y[3] = tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - - // multiply count=20 - - return y; -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232906.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply3( _Out_writes_(9) float *y, - _In_reads_(9) const float *f, - _In_reads_(9) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] = CONSTANT(-0.126156626101000010)*t; - y[8] = CONSTANT(-0.218509686119999990)*t; - - // [1,2]: 5, - tf = CONSTANT(0.218509686118000010)*f[5]; - tg = CONSTANT(0.218509686118000010)*g[5]; - y[1] += tf*g[2]+tg*f[2]; - y[2] = tf*g[1]+tg*f[1]; - t = f[1]*g[2]+f[2]*g[1]; - y[5] = CONSTANT(0.218509686118000010)*t; - - // [1,3]: 4, - tf = CONSTANT(0.218509686114999990)*f[4]; - tg = CONSTANT(0.218509686114999990)*g[4]; - y[1] += tf*g[3]+tg*f[3]; - y[3] = tf*g[1]+tg*f[1]; - t = f[1]*g[3]+f[3]*g[1]; - y[4] = CONSTANT(0.218509686114999990)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,3]: 7, - tf = 
CONSTANT(0.218509686118000010)*f[7]; - tg = CONSTANT(0.218509686118000010)*g[7]; - y[2] += tf*g[3]+tg*f[3]; - y[3] += tf*g[2]+tg*f[2]; - t = f[2]*g[3]+f[3]*g[2]; - y[7] = CONSTANT(0.218509686118000010)*t; - - // [3,3]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [4,4]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // [4,5]: 7, - tf = CONSTANT(0.156078347226000000)*f[7]; - tg = CONSTANT(0.156078347226000000)*g[7]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - - // [5,5]: 0,6,8, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - - // [6,6]: 0,6, - tf = CONSTANT(0.282094797560000000)*f[0]; - tg = CONSTANT(0.282094797560000000)*g[0]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - - // [7,7]: 0,6,8, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.156078347227999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.156078347227999990)*g[8]; - y[7] += tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - - // [8,8]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // multiply count=120 - - return y; -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply4( _Out_writes_(16) float *y, - _In_reads_(16) const float *f, - _In_reads_(16) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] = CONSTANT(-0.126156626101000010)*t; - y[8] = 
CONSTANT(-0.218509686119999990)*t; - - // [1,4]: 3,13,15, - tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15]; - tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15]; - y[1] += tf*g[4]+tg*f[4]; - y[4] = tf*g[1]+tg*f[1]; - t = f[1]*g[4]+f[4]*g[1]; - y[3] = CONSTANT(0.218509686114999990)*t; - y[13] = CONSTANT(-0.058399170082300000)*t; - y[15] = CONSTANT(-0.226179013157999990)*t; - - // [1,5]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14]; - y[1] += tf*g[5]+tg*f[5]; - y[5] = tf*g[1]+tg*f[1]; - t = f[1]*g[5]+f[5]*g[1]; - y[2] = CONSTANT(0.218509686118000010)*t; - y[12] = CONSTANT(-0.143048168103000000)*t; - y[14] = CONSTANT(-0.184674390923000000)*t; - - // [1,6]: 11, - tf = CONSTANT(0.202300659402999990)*f[11]; - tg = CONSTANT(0.202300659402999990)*g[11]; - y[1] += tf*g[6]+tg*f[6]; - y[6] += tf*g[1]+tg*f[1]; - t = f[1]*g[6]+f[6]*g[1]; - y[11] = CONSTANT(0.202300659402999990)*t; - - // [1,8]: 9,11, - tf = CONSTANT(0.226179013155000000)*f[9]+CONSTANT(0.058399170081799998)*f[11]; - tg = CONSTANT(0.226179013155000000)*g[9]+CONSTANT(0.058399170081799998)*g[11]; - y[1] += tf*g[8]+tg*f[8]; - y[8] += tf*g[1]+tg*f[1]; - t = f[1]*g[8]+f[8]*g[1]; - y[9] = CONSTANT(0.226179013155000000)*t; - y[11] += CONSTANT(0.058399170081799998)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,6]: 12, - tf = CONSTANT(0.247766706973999990)*f[12]; - tg = CONSTANT(0.247766706973999990)*g[12]; - y[2] += tf*g[6]+tg*f[6]; - y[6] += tf*g[2]+tg*f[2]; - t = f[2]*g[6]+f[6]*g[2]; - y[12] += CONSTANT(0.247766706973999990)*t; - - // [3,3]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [3,6]: 13, - tf = CONSTANT(0.202300659402999990)*f[13]; - tg = CONSTANT(0.202300659402999990)*g[13]; - y[3] += tf*g[6]+tg*f[6]; - y[6] += tf*g[3]+tg*f[3]; - t = f[3]*g[6]+f[6]*g[3]; - y[13] += CONSTANT(0.202300659402999990)*t; - - // [3,7]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14]; - y[3] += tf*g[7]+tg*f[7]; - y[7] = tf*g[3]+tg*f[3]; - t = f[3]*g[7]+f[7]*g[3]; - y[2] += CONSTANT(0.218509686118000010)*t; - y[12] += CONSTANT(-0.143048168103000000)*t; - y[14] += CONSTANT(0.184674390923000000)*t; - - // [3,8]: 13,15, - tf = CONSTANT(-0.058399170081799998)*f[13]+CONSTANT(0.226179013155000000)*f[15]; - tg = CONSTANT(-0.058399170081799998)*g[13]+CONSTANT(0.226179013155000000)*g[15]; - y[3] += tf*g[8]+tg*f[8]; - y[8] += tf*g[3]+tg*f[3]; - t = f[3]*g[8]+f[8]*g[3]; - y[13] += 
CONSTANT(-0.058399170081799998)*t; - y[15] += CONSTANT(0.226179013155000000)*t; - - // [4,4]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // [4,5]: 7, - tf = CONSTANT(0.156078347226000000)*f[7]; - tg = CONSTANT(0.156078347226000000)*g[7]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - - // [4,9]: 3,13, - tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13]; - tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13]; - y[4] += tf*g[9]+tg*f[9]; - y[9] += tf*g[4]+tg*f[4]; - t = f[4]*g[9]+f[9]*g[4]; - y[3] += CONSTANT(0.226179013157999990)*t; - y[13] += CONSTANT(-0.094031597258400004)*t; - - // [4,10]: 2,12, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]; - y[4] += tf*g[10]+tg*f[10]; - y[10] = tf*g[4]+tg*f[4]; - t = f[4]*g[10]+f[10]*g[4]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - - // [4,11]: 3,13,15, - tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]; - tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]; - y[4] += tf*g[11]+tg*f[11]; - y[11] += tf*g[4]+tg*f[4]; - t = f[4]*g[11]+f[11]*g[4]; - y[3] += CONSTANT(-0.058399170082300000)*t; - y[13] += CONSTANT(0.145673124078000010)*t; - y[15] += CONSTANT(0.094031597258400004)*t; - - // [5,5]: 0,6,8, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - - // [5,9]: 14, - tf = CONSTANT(0.148677009677999990)*f[14]; - tg = CONSTANT(0.148677009677999990)*g[14]; - y[5] += tf*g[9]+tg*f[9]; - y[9] += tf*g[5]+tg*f[5]; - t = f[5]*g[9]+f[9]*g[5]; - y[14] += CONSTANT(0.148677009677999990)*t; - - // [5,10]: 3,13,15, - tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15]; - tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15]; - y[5] += tf*g[10]+tg*f[10]; - y[10] += tf*g[5]+tg*f[5]; - t = f[5]*g[10]+f[10]*g[5]; - y[3] += CONSTANT(0.184674390919999990)*t; - y[13] += CONSTANT(0.115164716490000000)*t; - y[15] += CONSTANT(-0.148677009678999990)*t; - - // [5,11]: 2,12,14, - tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]; - tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]; - y[5] += tf*g[11]+tg*f[11]; - y[11] += tf*g[5]+tg*f[5]; - t = f[5]*g[11]+f[11]*g[5]; - y[2] += CONSTANT(0.233596680327000010)*t; - y[12] += CONSTANT(0.059470803871800003)*t; - y[14] += CONSTANT(-0.115164716491000000)*t; - - // [6,6]: 0,6, - tf = CONSTANT(0.282094797560000000)*f[0]; - tg = 
CONSTANT(0.282094797560000000)*g[0]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - - // [7,7]: 6,0,8, - tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]; - tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]; - y[7] += tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[6] += CONSTANT(0.090111875786499998)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - - // [7,10]: 9,1,11, - tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11]; - tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11]; - y[7] += tf*g[10]+tg*f[10]; - y[10] += tf*g[7]+tg*f[7]; - t = f[7]*g[10]+f[10]*g[7]; - y[9] += CONSTANT(0.148677009678999990)*t; - y[1] += CONSTANT(0.184674390919999990)*t; - y[11] += CONSTANT(0.115164716490000000)*t; - - // [7,13]: 12,2,14, - tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]; - tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]; - y[7] += tf*g[13]+tg*f[13]; - y[13] += tf*g[7]+tg*f[7]; - t = f[7]*g[13]+f[13]*g[7]; - y[12] += CONSTANT(0.059470803871800003)*t; - y[2] += CONSTANT(0.233596680327000010)*t; - y[14] += CONSTANT(0.115164716491000000)*t; - - // [7,14]: 15, - tf = CONSTANT(0.148677009677999990)*f[15]; - tg = CONSTANT(0.148677009677999990)*g[15]; - y[7] += tf*g[14]+tg*f[14]; - y[14] += tf*g[7]+tg*f[7]; - t = f[7]*g[14]+f[14]*g[7]; - y[15] += CONSTANT(0.148677009677999990)*t; - - // [8,8]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // [8,9]: 11, - tf = CONSTANT(-0.094031597259499999)*f[11]; - tg = CONSTANT(-0.094031597259499999)*g[11]; - y[8] += tf*g[9]+tg*f[9]; - y[9] += tf*g[8]+tg*f[8]; - t = f[8]*g[9]+f[9]*g[8]; - y[11] += CONSTANT(-0.094031597259499999)*t; - - // [8,13]: 15, - tf = CONSTANT(-0.094031597259499999)*f[15]; - tg = CONSTANT(-0.094031597259499999)*g[15]; - y[8] += tf*g[13]+tg*f[13]; - y[13] += tf*g[8]+tg*f[8]; - t = f[8]*g[13]+f[13]*g[8]; - y[15] += CONSTANT(-0.094031597259499999)*t; - - // [8,14]: 2,12, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]; - y[8] += tf*g[14]+tg*f[14]; - y[14] += tf*g[8]+tg*f[8]; - t = f[8]*g[14]+f[14]*g[8]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - - // [9,9]: 6,0, - tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]; - tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]; - y[9] += tf*g[9]+tg*f[9]; - t = f[9]*g[9]; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[0] += CONSTANT(0.282094791766999970)*t; - - // [10,10]: 0, - tf = CONSTANT(0.282094791771999980)*f[0]; - tg = CONSTANT(0.282094791771999980)*g[0]; - y[10] += tf*g[10]+tg*f[10]; - t = f[10]*g[10]; - y[0] += CONSTANT(0.282094791771999980)*t; - - // [11,11]: 0,6,8, - tf = 
CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]; - y[11] += tf*g[11]+tg*f[11]; - t = f[11]*g[11]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[8] += CONSTANT(-0.145673124078999990)*t; - - // [12,12]: 0,6, - tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]; - tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]; - y[12] += tf*g[12]+tg*f[12]; - t = f[12]*g[12]; - y[0] += CONSTANT(0.282094799871999980)*t; - y[6] += CONSTANT(0.168208852954000010)*t; - - // [13,13]: 0,8,6, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]; - y[13] += tf*g[13]+tg*f[13]; - t = f[13]*g[13]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.145673124078999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - - // [14,14]: 0, - tf = CONSTANT(0.282094791771999980)*f[0]; - tg = CONSTANT(0.282094791771999980)*g[0]; - y[14] += tf*g[14]+tg*f[14]; - t = f[14]*g[14]; - y[0] += CONSTANT(0.282094791771999980)*t; - - // [15,15]: 0,6, - tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]; - tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]; - y[15] += tf*g[15]+tg*f[15]; - t = f[15]*g[15]; - y[0] += CONSTANT(0.282094791766999970)*t; - y[6] += CONSTANT(-0.210261043508000010)*t; - - // multiply count=399 - - return y; -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232908.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply5( _Out_writes_(25) float *y, - _In_reads_(25) const float *f, - _In_reads_(25) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] = CONSTANT(-0.126156626101000010)*t; - y[8] = CONSTANT(-0.218509686119999990)*t; - - // [1,4]: 3,13,15, - tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15]; - tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15]; - y[1] += tf*g[4]+tg*f[4]; - y[4] = tf*g[1]+tg*f[1]; - t = f[1]*g[4]+f[4]*g[1]; - y[3] = CONSTANT(0.218509686114999990)*t; - y[13] = CONSTANT(-0.058399170082300000)*t; - y[15] = CONSTANT(-0.226179013157999990)*t; - - // [1,5]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14]; - y[1] += tf*g[5]+tg*f[5]; - y[5] = tf*g[1]+tg*f[1]; - t = f[1]*g[5]+f[5]*g[1]; - y[2] = CONSTANT(0.218509686118000010)*t; - y[12] = 
CONSTANT(-0.143048168103000000)*t; - y[14] = CONSTANT(-0.184674390923000000)*t; - - // [1,9]: 8,22,24, - tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(-0.230329432978999990)*f[24]; - tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(-0.230329432978999990)*g[24]; - y[1] += tf*g[9]+tg*f[9]; - y[9] = tf*g[1]+tg*f[1]; - t = f[1]*g[9]+f[9]*g[1]; - y[8] += CONSTANT(0.226179013155000000)*t; - y[22] = CONSTANT(-0.043528171378199997)*t; - y[24] = CONSTANT(-0.230329432978999990)*t; - - // [1,10]: 7,21,23, - tf = CONSTANT(0.184674390919999990)*f[7]+CONSTANT(-0.075393004386799994)*f[21]+CONSTANT(-0.199471140200000010)*f[23]; - tg = CONSTANT(0.184674390919999990)*g[7]+CONSTANT(-0.075393004386799994)*g[21]+CONSTANT(-0.199471140200000010)*g[23]; - y[1] += tf*g[10]+tg*f[10]; - y[10] = tf*g[1]+tg*f[1]; - t = f[1]*g[10]+f[10]*g[1]; - y[7] = CONSTANT(0.184674390919999990)*t; - y[21] = CONSTANT(-0.075393004386799994)*t; - y[23] = CONSTANT(-0.199471140200000010)*t; - - // [1,11]: 6,8,20,22, - tf = CONSTANT(0.202300659402999990)*f[6]+CONSTANT(0.058399170081799998)*f[8]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(-0.168583882836999990)*f[22]; - tg = CONSTANT(0.202300659402999990)*g[6]+CONSTANT(0.058399170081799998)*g[8]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(-0.168583882836999990)*g[22]; - y[1] += tf*g[11]+tg*f[11]; - y[11] = tf*g[1]+tg*f[1]; - t = f[1]*g[11]+f[11]*g[1]; - y[6] += CONSTANT(0.202300659402999990)*t; - y[8] += CONSTANT(0.058399170081799998)*t; - y[20] = CONSTANT(-0.150786008773000000)*t; - y[22] += CONSTANT(-0.168583882836999990)*t; - - // [1,12]: 19, - tf = CONSTANT(0.194663900273000010)*f[19]; - tg = CONSTANT(0.194663900273000010)*g[19]; - y[1] += tf*g[12]+tg*f[12]; - y[12] += tf*g[1]+tg*f[1]; - t = f[1]*g[12]+f[12]*g[1]; - y[19] = CONSTANT(0.194663900273000010)*t; - - // [1,13]: 18, - tf = CONSTANT(0.168583882834000000)*f[18]; - tg = CONSTANT(0.168583882834000000)*g[18]; - y[1] += tf*g[13]+tg*f[13]; - y[13] += tf*g[1]+tg*f[1]; - t = f[1]*g[13]+f[13]*g[1]; - y[18] = CONSTANT(0.168583882834000000)*t; - - // [1,14]: 17,19, - tf = CONSTANT(0.199471140196999990)*f[17]+CONSTANT(0.075393004386399995)*f[19]; - tg = CONSTANT(0.199471140196999990)*g[17]+CONSTANT(0.075393004386399995)*g[19]; - y[1] += tf*g[14]+tg*f[14]; - y[14] += tf*g[1]+tg*f[1]; - t = f[1]*g[14]+f[14]*g[1]; - y[17] = CONSTANT(0.199471140196999990)*t; - y[19] += CONSTANT(0.075393004386399995)*t; - - // [1,15]: 16,18, - tf = CONSTANT(0.230329432973999990)*f[16]+CONSTANT(0.043528171377799997)*f[18]; - tg = CONSTANT(0.230329432973999990)*g[16]+CONSTANT(0.043528171377799997)*g[18]; - y[1] += tf*g[15]+tg*f[15]; - y[15] += tf*g[1]+tg*f[1]; - t = f[1]*g[15]+f[15]*g[1]; - y[16] = CONSTANT(0.230329432973999990)*t; - y[18] += CONSTANT(0.043528171377799997)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,10]: 4,18, - tf = CONSTANT(0.184674390919999990)*f[4]+CONSTANT(0.213243618621000000)*f[18]; - tg = CONSTANT(0.184674390919999990)*g[4]+CONSTANT(0.213243618621000000)*g[18]; - y[2] += tf*g[10]+tg*f[10]; - y[10] += tf*g[2]+tg*f[2]; - t = f[2]*g[10]+f[10]*g[2]; - y[4] += CONSTANT(0.184674390919999990)*t; - y[18] += CONSTANT(0.213243618621000000)*t; - - // [2,12]: 6,20, - tf = 
CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20]; - tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20]; - y[2] += tf*g[12]+tg*f[12]; - y[12] += tf*g[2]+tg*f[2]; - t = f[2]*g[12]+f[12]*g[2]; - y[6] += CONSTANT(0.247766706973999990)*t; - y[20] += CONSTANT(0.246232537174000010)*t; - - // [2,14]: 8,22, - tf = CONSTANT(0.184674390919999990)*f[8]+CONSTANT(0.213243618621000000)*f[22]; - tg = CONSTANT(0.184674390919999990)*g[8]+CONSTANT(0.213243618621000000)*g[22]; - y[2] += tf*g[14]+tg*f[14]; - y[14] += tf*g[2]+tg*f[2]; - t = f[2]*g[14]+f[14]*g[2]; - y[8] += CONSTANT(0.184674390919999990)*t; - y[22] += CONSTANT(0.213243618621000000)*t; - - // [3,3]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [3,7]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14]; - y[3] += tf*g[7]+tg*f[7]; - y[7] += tf*g[3]+tg*f[3]; - t = f[3]*g[7]+f[7]*g[3]; - y[2] += CONSTANT(0.218509686118000010)*t; - y[12] += CONSTANT(-0.143048168103000000)*t; - y[14] += CONSTANT(0.184674390923000000)*t; - - // [3,9]: 4,16,18, - tf = CONSTANT(0.226179013157999990)*f[4]+CONSTANT(0.230329432973999990)*f[16]+CONSTANT(-0.043528171377799997)*f[18]; - tg = CONSTANT(0.226179013157999990)*g[4]+CONSTANT(0.230329432973999990)*g[16]+CONSTANT(-0.043528171377799997)*g[18]; - y[3] += tf*g[9]+tg*f[9]; - y[9] += tf*g[3]+tg*f[3]; - t = f[3]*g[9]+f[9]*g[3]; - y[4] += CONSTANT(0.226179013157999990)*t; - y[16] += CONSTANT(0.230329432973999990)*t; - y[18] += CONSTANT(-0.043528171377799997)*t; - - // [3,10]: 5,17,19, - tf = CONSTANT(0.184674390919999990)*f[5]+CONSTANT(0.199471140200000010)*f[17]+CONSTANT(-0.075393004386799994)*f[19]; - tg = CONSTANT(0.184674390919999990)*g[5]+CONSTANT(0.199471140200000010)*g[17]+CONSTANT(-0.075393004386799994)*g[19]; - y[3] += tf*g[10]+tg*f[10]; - y[10] += tf*g[3]+tg*f[3]; - t = f[3]*g[10]+f[10]*g[3]; - y[5] += CONSTANT(0.184674390919999990)*t; - y[17] += CONSTANT(0.199471140200000010)*t; - y[19] += CONSTANT(-0.075393004386799994)*t; - - // [3,12]: 21, - tf = CONSTANT(0.194663900273000010)*f[21]; - tg = CONSTANT(0.194663900273000010)*g[21]; - y[3] += tf*g[12]+tg*f[12]; - y[12] += tf*g[3]+tg*f[3]; - t = f[3]*g[12]+f[12]*g[3]; - y[21] += CONSTANT(0.194663900273000010)*t; - - // [3,13]: 8,6,20,22, - tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22]; - tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22]; - y[3] += tf*g[13]+tg*f[13]; - y[13] += tf*g[3]+tg*f[3]; - t = f[3]*g[13]+f[13]*g[3]; - y[8] += CONSTANT(-0.058399170081799998)*t; - y[6] += CONSTANT(0.202300659402999990)*t; - y[20] += CONSTANT(-0.150786008773000000)*t; - y[22] += CONSTANT(0.168583882836999990)*t; - - // [3,14]: 21,23, - tf = CONSTANT(-0.075393004386399995)*f[21]+CONSTANT(0.199471140196999990)*f[23]; - tg = 
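
Each [i,j] block in these routines is an unrolled instance of one pattern: a table of nonzero triple-product coefficients says which output bands k a pair of input bands (i,j) feeds, and tf/tg cache the weighted sums so the same products serve both the y[i] and y[j] updates. A table-driven reference loop makes the pattern explicit; this is a minimal sketch, with SHProductTerm and XMSHMultiplyRef as hypothetical names that are not part of this file:

#include <cstddef>

// One entry per nonzero coupling: bands i and j feed band k with weight c
// (the values baked into the CONSTANT(...) literals above).
struct SHProductTerm { int i, j, k; float c; };

float* XMSHMultiplyRef(float* y, const float* f, const float* g,
                       const SHProductTerm* terms, size_t termCount,
                       size_t coeffCount)
{
    if (!y || !f || !g)
        return nullptr;

    for (size_t n = 0; n < coeffCount; ++n)
        y[n] = 0.0f;

    for (size_t n = 0; n < termCount; ++n)
    {
        const SHProductTerm& p = terms[n];
        if (p.i == p.j)
            y[p.k] += p.c * f[p.i] * g[p.i];                     // diagonal pair: applied once
        else
            y[p.k] += p.c * (f[p.i] * g[p.j] + f[p.j] * g[p.i]); // symmetric pair
    }
    return y;
}

The generated routines trade this loop for straight-line code; the trailing "multiply count=..." comments record the resulting scalar-multiply totals.
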
CONSTANT(-0.075393004386399995)*g[21]+CONSTANT(0.199471140196999990)*g[23]; - y[3] += tf*g[14]+tg*f[14]; - y[14] += tf*g[3]+tg*f[3]; - t = f[3]*g[14]+f[14]*g[3]; - y[21] += CONSTANT(-0.075393004386399995)*t; - y[23] += CONSTANT(0.199471140196999990)*t; - - // [3,15]: 8,22,24, - tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(0.230329432978999990)*f[24]; - tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(0.230329432978999990)*g[24]; - y[3] += tf*g[15]+tg*f[15]; - y[15] += tf*g[3]+tg*f[3]; - t = f[3]*g[15]+f[15]*g[3]; - y[8] += CONSTANT(0.226179013155000000)*t; - y[22] += CONSTANT(-0.043528171378199997)*t; - y[24] += CONSTANT(0.230329432978999990)*t; - - // [4,4]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(-0.238413613505999990)*t; - - // [4,5]: 7,21,23, - tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23]; - tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - y[21] += CONSTANT(-0.063718718434399996)*t; - y[23] += CONSTANT(-0.168583882835000000)*t; - - // [4,11]: 3,13,15, - tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]; - tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]; - y[4] += tf*g[11]+tg*f[11]; - y[11] += tf*g[4]+tg*f[4]; - t = f[4]*g[11]+f[11]*g[4]; - y[3] += CONSTANT(-0.058399170082300000)*t; - y[13] += CONSTANT(0.145673124078000010)*t; - y[15] += CONSTANT(0.094031597258400004)*t; - - // [4,16]: 8,22, - tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22]; - tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22]; - y[4] += tf*g[16]+tg*f[16]; - y[16] += tf*g[4]+tg*f[4]; - t = f[4]*g[16]+f[16]*g[4]; - y[8] += CONSTANT(0.238413613494000000)*t; - y[22] += CONSTANT(-0.075080816693699995)*t; - - // [4,18]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24]; - y[4] += tf*g[18]+tg*f[18]; - y[18] += tf*g[4]+tg*f[4]; - t = f[4]*g[18]+f[18]*g[4]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] += CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(0.075080816691500005)*t; - - // [4,19]: 7,21,23, - tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23]; - tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23]; - y[4] += tf*g[19]+tg*f[19]; - y[19] += tf*g[4]+tg*f[4]; - t = f[4]*g[19]+f[19]*g[4]; - y[7] += CONSTANT(-0.063718718434399996)*t; - y[21] += CONSTANT(0.141889406569999990)*t; - y[23] += 
CONSTANT(0.112621225039000000)*t; - - // [5,5]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] += CONSTANT(-0.180223751574000000)*t; - - // [5,11]: 2,12,14, - tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]; - tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]; - y[5] += tf*g[11]+tg*f[11]; - y[11] += tf*g[5]+tg*f[5]; - t = f[5]*g[11]+f[11]*g[5]; - y[2] += CONSTANT(0.233596680327000010)*t; - y[12] += CONSTANT(0.059470803871800003)*t; - y[14] += CONSTANT(-0.115164716491000000)*t; - - // [5,17]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24]; - y[5] += tf*g[17]+tg*f[17]; - y[17] += tf*g[5]+tg*f[5]; - t = f[5]*g[17]+f[17]*g[5]; - y[8] += CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(-0.140463346189000000)*t; - - // [5,18]: 7,21,23, - tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23]; - tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23]; - y[5] += tf*g[18]+tg*f[18]; - y[18] += tf*g[5]+tg*f[5]; - t = f[5]*g[18]+f[18]*g[5]; - y[7] += CONSTANT(0.180223751571000010)*t; - y[21] += CONSTANT(0.090297865407399994)*t; - y[23] += CONSTANT(-0.132725386549000010)*t; - - // [5,19]: 6,8,20,22, - tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22]; - tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22]; - y[5] += tf*g[19]+tg*f[19]; - y[19] += tf*g[5]+tg*f[5]; - t = f[5]*g[19]+f[19]*g[5]; - y[6] += CONSTANT(0.220728115440999990)*t; - y[8] += CONSTANT(0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[22] += CONSTANT(-0.090297865408399999)*t; - - // [6,6]: 0,6,20, - tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20]; - tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - y[20] += CONSTANT(0.241795553185999990)*t; - - // [7,7]: 6,0,8,20,22, - tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22]; - tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22]; - y[7] += 
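
The diagonal blocks keep the self-coupling out of tf and tg deliberately. In [6,6] above, y[6] += tf*g[6]+tg*f[6] covers the (0,6) and (20,6) pairs; if the (6,6) weight were folded into tf as well, that update would count the f[6]*g[6] product twice:

// If CONSTANT(0.180223764527000010) were part of tf, then
//     y[6] += tf*g[6]+tg*f[6];
// would accumulate 2*0.180223764527*f[6]*g[6]. The generator therefore
// emits the self-term separately, exactly once:
//     t = f[6]*g[6];
//     y[6] += CONSTANT(0.180223764527000010)*t;
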
tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[6] += CONSTANT(0.090111875786499998)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] += CONSTANT(0.180223751574000000)*t; - - // [7,13]: 12,2,14, - tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]; - tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]; - y[7] += tf*g[13]+tg*f[13]; - y[13] += tf*g[7]+tg*f[7]; - t = f[7]*g[13]+f[13]*g[7]; - y[12] += CONSTANT(0.059470803871800003)*t; - y[2] += CONSTANT(0.233596680327000010)*t; - y[14] += CONSTANT(0.115164716491000000)*t; - - // [7,17]: 16,4,18, - tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18]; - tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18]; - y[7] += tf*g[17]+tg*f[17]; - y[17] += tf*g[7]+tg*f[7]; - t = f[7]*g[17]+f[17]*g[7]; - y[16] += CONSTANT(0.140463346187999990)*t; - y[4] += CONSTANT(0.168583882835000000)*t; - y[18] += CONSTANT(0.132725386549000010)*t; - - // [7,21]: 8,20,6,22, - tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22]; - tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22]; - y[7] += tf*g[21]+tg*f[21]; - y[21] += tf*g[7]+tg*f[7]; - t = f[7]*g[21]+f[21]*g[7]; - y[8] += CONSTANT(-0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[6] += CONSTANT(0.220728115440999990)*t; - y[22] += CONSTANT(0.090297865408399999)*t; - - // [7,23]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24]; - y[7] += tf*g[23]+tg*f[23]; - y[23] += tf*g[7]+tg*f[7]; - t = f[7]*g[23]+f[23]*g[7]; - y[8] += CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(0.140463346189000000)*t; - - // [8,8]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(0.238413613505999990)*t; - - // [8,22]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24]; - y[8] += tf*g[22]+tg*f[22]; - y[22] += tf*g[8]+tg*f[8]; - t = f[8]*g[22]+f[22]*g[8]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] += CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(-0.075080816691500005)*t; - - // [9,9]: 6,0,20, - tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20]; - tg = 
CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20]; - y[9] += tf*g[9]+tg*f[9]; - t = f[9]*g[9]; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[0] += CONSTANT(0.282094791766999970)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [9,10]: 7,21, - tf = CONSTANT(0.148677009678999990)*f[7]+CONSTANT(-0.099322584599600000)*f[21]; - tg = CONSTANT(0.148677009678999990)*g[7]+CONSTANT(-0.099322584599600000)*g[21]; - y[9] += tf*g[10]+tg*f[10]; - y[10] += tf*g[9]+tg*f[9]; - t = f[9]*g[10]+f[10]*g[9]; - y[7] += CONSTANT(0.148677009678999990)*t; - y[21] += CONSTANT(-0.099322584599600000)*t; - - // [9,11]: 8,22,24, - tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(0.117520066950999990)*f[24]; - tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(0.117520066950999990)*g[24]; - y[9] += tf*g[11]+tg*f[11]; - y[11] += tf*g[9]+tg*f[9]; - t = f[9]*g[11]+f[11]*g[9]; - y[8] += CONSTANT(-0.094031597259499999)*t; - y[22] += CONSTANT(0.133255230518000010)*t; - y[24] += CONSTANT(0.117520066950999990)*t; - - // [9,13]: 4,16,18, - tf = CONSTANT(-0.094031597258400004)*f[4]+CONSTANT(-0.117520066953000000)*f[16]+CONSTANT(0.133255230519000010)*f[18]; - tg = CONSTANT(-0.094031597258400004)*g[4]+CONSTANT(-0.117520066953000000)*g[16]+CONSTANT(0.133255230519000010)*g[18]; - y[9] += tf*g[13]+tg*f[13]; - y[13] += tf*g[9]+tg*f[9]; - t = f[9]*g[13]+f[13]*g[9]; - y[4] += CONSTANT(-0.094031597258400004)*t; - y[16] += CONSTANT(-0.117520066953000000)*t; - y[18] += CONSTANT(0.133255230519000010)*t; - - // [9,14]: 5,19, - tf = CONSTANT(0.148677009677999990)*f[5]+CONSTANT(-0.099322584600699995)*f[19]; - tg = CONSTANT(0.148677009677999990)*g[5]+CONSTANT(-0.099322584600699995)*g[19]; - y[9] += tf*g[14]+tg*f[14]; - y[14] += tf*g[9]+tg*f[9]; - t = f[9]*g[14]+f[14]*g[9]; - y[5] += CONSTANT(0.148677009677999990)*t; - y[19] += CONSTANT(-0.099322584600699995)*t; - - // [9,17]: 2,12, - tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12]; - tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12]; - y[9] += tf*g[17]+tg*f[17]; - y[17] += tf*g[9]+tg*f[9]; - t = f[9]*g[17]+f[17]*g[9]; - y[2] += CONSTANT(0.162867503964999990)*t; - y[12] += CONSTANT(-0.203550726872999990)*t; - - // [10,10]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24]; - y[10] += tf*g[10]+tg*f[10]; - t = f[10]*g[10]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - y[24] += CONSTANT(-0.151717754049000010)*t; - - // [10,11]: 7,21,23, - tf = CONSTANT(0.115164716490000000)*f[7]+CONSTANT(0.102579924281000000)*f[21]+CONSTANT(-0.067850242288900006)*f[23]; - tg = CONSTANT(0.115164716490000000)*g[7]+CONSTANT(0.102579924281000000)*g[21]+CONSTANT(-0.067850242288900006)*g[23]; - y[10] += tf*g[11]+tg*f[11]; - y[11] += tf*g[10]+tg*f[10]; - t = f[10]*g[11]+f[11]*g[10]; - y[7] += CONSTANT(0.115164716490000000)*t; - y[21] += CONSTANT(0.102579924281000000)*t; - y[23] += CONSTANT(-0.067850242288900006)*t; - - // [10,12]: 4,18, - tf = CONSTANT(-0.188063194517999990)*f[4]+CONSTANT(-0.044418410173299998)*f[18]; - tg = CONSTANT(-0.188063194517999990)*g[4]+CONSTANT(-0.044418410173299998)*g[18]; - y[10] += tf*g[12]+tg*f[12]; - y[12] += tf*g[10]+tg*f[10]; - t = 
f[10]*g[12]+f[12]*g[10]; - y[4] += CONSTANT(-0.188063194517999990)*t; - y[18] += CONSTANT(-0.044418410173299998)*t; - - // [10,13]: 5,17,19, - tf = CONSTANT(0.115164716490000000)*f[5]+CONSTANT(0.067850242288900006)*f[17]+CONSTANT(0.102579924281000000)*f[19]; - tg = CONSTANT(0.115164716490000000)*g[5]+CONSTANT(0.067850242288900006)*g[17]+CONSTANT(0.102579924281000000)*g[19]; - y[10] += tf*g[13]+tg*f[13]; - y[13] += tf*g[10]+tg*f[10]; - t = f[10]*g[13]+f[13]*g[10]; - y[5] += CONSTANT(0.115164716490000000)*t; - y[17] += CONSTANT(0.067850242288900006)*t; - y[19] += CONSTANT(0.102579924281000000)*t; - - // [10,14]: 16, - tf = CONSTANT(0.151717754044999990)*f[16]; - tg = CONSTANT(0.151717754044999990)*g[16]; - y[10] += tf*g[14]+tg*f[14]; - y[14] += tf*g[10]+tg*f[10]; - t = f[10]*g[14]+f[14]*g[10]; - y[16] += CONSTANT(0.151717754044999990)*t; - - // [10,15]: 5,19, - tf = CONSTANT(-0.148677009678999990)*f[5]+CONSTANT(0.099322584599600000)*f[19]; - tg = CONSTANT(-0.148677009678999990)*g[5]+CONSTANT(0.099322584599600000)*g[19]; - y[10] += tf*g[15]+tg*f[15]; - y[15] += tf*g[10]+tg*f[10]; - t = f[10]*g[15]+f[15]*g[10]; - y[5] += CONSTANT(-0.148677009678999990)*t; - y[19] += CONSTANT(0.099322584599600000)*t; - - // [11,11]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22]; - y[11] += tf*g[11]+tg*f[11]; - t = f[11]*g[11]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[8] += CONSTANT(-0.145673124078999990)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(-0.114687841910000000)*t; - - // [11,14]: 17, - tf = CONSTANT(0.067850242288500007)*f[17]; - tg = CONSTANT(0.067850242288500007)*g[17]; - y[11] += tf*g[14]+tg*f[14]; - y[14] += tf*g[11]+tg*f[11]; - t = f[11]*g[14]+f[14]*g[11]; - y[17] += CONSTANT(0.067850242288500007)*t; - - // [11,15]: 16, - tf = CONSTANT(-0.117520066953000000)*f[16]; - tg = CONSTANT(-0.117520066953000000)*g[16]; - y[11] += tf*g[15]+tg*f[15]; - y[15] += tf*g[11]+tg*f[11]; - t = f[11]*g[15]+f[15]*g[11]; - y[16] += CONSTANT(-0.117520066953000000)*t; - - // [11,18]: 3,13,15, - tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15]; - tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15]; - y[11] += tf*g[18]+tg*f[18]; - y[18] += tf*g[11]+tg*f[11]; - t = f[11]*g[18]+f[18]*g[11]; - y[3] += CONSTANT(0.168583882834000000)*t; - y[13] += CONSTANT(0.114687841909000000)*t; - y[15] += CONSTANT(-0.133255230519000010)*t; - - // [11,19]: 2,14,12, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12]; - y[11] += tf*g[19]+tg*f[19]; - y[19] += tf*g[11]+tg*f[11]; - t = f[11]*g[19]+f[19]*g[11]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[14] += CONSTANT(-0.102579924282000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - - // [12,12]: 0,6,20, - tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20]; - tg = 
CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20]; - y[12] += tf*g[12]+tg*f[12]; - t = f[12]*g[12]; - y[0] += CONSTANT(0.282094799871999980)*t; - y[6] += CONSTANT(0.168208852954000010)*t; - y[20] += CONSTANT(0.153869910786000010)*t; - - // [12,14]: 8,22, - tf = CONSTANT(-0.188063194517999990)*f[8]+CONSTANT(-0.044418410173299998)*f[22]; - tg = CONSTANT(-0.188063194517999990)*g[8]+CONSTANT(-0.044418410173299998)*g[22]; - y[12] += tf*g[14]+tg*f[14]; - y[14] += tf*g[12]+tg*f[12]; - t = f[12]*g[14]+f[14]*g[12]; - y[8] += CONSTANT(-0.188063194517999990)*t; - y[22] += CONSTANT(-0.044418410173299998)*t; - - // [13,13]: 0,8,6,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22]; - y[13] += tf*g[13]+tg*f[13]; - t = f[13]*g[13]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.145673124078999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(0.114687841910000000)*t; - - // [13,14]: 23, - tf = CONSTANT(0.067850242288500007)*f[23]; - tg = CONSTANT(0.067850242288500007)*g[23]; - y[13] += tf*g[14]+tg*f[14]; - y[14] += tf*g[13]+tg*f[13]; - t = f[13]*g[14]+f[14]*g[13]; - y[23] += CONSTANT(0.067850242288500007)*t; - - // [13,15]: 8,22,24, - tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(-0.117520066950999990)*f[24]; - tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(-0.117520066950999990)*g[24]; - y[13] += tf*g[15]+tg*f[15]; - y[15] += tf*g[13]+tg*f[13]; - t = f[13]*g[15]+f[15]*g[13]; - y[8] += CONSTANT(-0.094031597259499999)*t; - y[22] += CONSTANT(0.133255230518000010)*t; - y[24] += CONSTANT(-0.117520066950999990)*t; - - // [13,21]: 2,12,14, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14]; - y[13] += tf*g[21]+tg*f[21]; - y[21] += tf*g[13]+tg*f[13]; - t = f[13]*g[21]+f[21]*g[13]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - y[14] += CONSTANT(0.102579924282000000)*t; - - // [14,14]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24]; - y[14] += tf*g[14]+tg*f[14]; - t = f[14]*g[14]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - y[24] += CONSTANT(0.151717754049000010)*t; - - // [14,15]: 7,21, - tf = CONSTANT(0.148677009677999990)*f[7]+CONSTANT(-0.099322584600699995)*f[21]; - tg = CONSTANT(0.148677009677999990)*g[7]+CONSTANT(-0.099322584600699995)*g[21]; - y[14] += tf*g[15]+tg*f[15]; - y[15] += tf*g[14]+tg*f[14]; - t = f[14]*g[15]+f[15]*g[14]; - y[7] += CONSTANT(0.148677009677999990)*t; - y[21] += CONSTANT(-0.099322584600699995)*t; - - // [15,15]: 0,6,20, - tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20]; - tg = 
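
All of these literals are values of one quantity: the integral of three real SH basis functions over the sphere. With f(s) = \sum_i f_i Y_i(s) and g(s) = \sum_j g_j Y_j(s), each routine computes the SH projection of the pointwise product f*g, truncated to the order of its inputs:

    y_k = \sum_{i,j} C_{ijk}\, f_i\, g_j, \qquad C_{ijk} = \int_{S^2} Y_i(s)\, Y_j(s)\, Y_k(s)\, d\Omega

C_{ijk} is fully symmetric in its indices, which is why one tf/tg pair per block can serve both the y[i] and y[j] updates.
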
CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20]; - y[15] += tf*g[15]+tg*f[15]; - t = f[15]*g[15]; - y[0] += CONSTANT(0.282094791766999970)*t; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [15,23]: 12,2, - tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2]; - tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2]; - y[15] += tf*g[23]+tg*f[23]; - y[23] += tf*g[15]+tg*f[15]; - t = f[15]*g[23]+f[23]*g[15]; - y[12] += CONSTANT(-0.203550726872999990)*t; - y[2] += CONSTANT(0.162867503964999990)*t; - - // [16,16]: 0,6,20, - tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20]; - tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20]; - y[16] += tf*g[16]+tg*f[16]; - t = f[16]*g[16]; - y[0] += CONSTANT(0.282094791763999990)*t; - y[6] += CONSTANT(-0.229375683829000000)*t; - y[20] += CONSTANT(0.106525305981000000)*t; - - // [16,18]: 8,22, - tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22]; - tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22]; - y[16] += tf*g[18]+tg*f[18]; - y[18] += tf*g[16]+tg*f[16]; - t = f[16]*g[18]+f[18]*g[16]; - y[8] += CONSTANT(-0.075080816693699995)*t; - y[22] += CONSTANT(0.135045473380000000)*t; - - // [16,23]: 19,5, - tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5]; - tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5]; - y[16] += tf*g[23]+tg*f[23]; - y[23] += tf*g[16]+tg*f[16]; - t = f[16]*g[23]+f[23]*g[16]; - y[19] += CONSTANT(-0.119098912754999990)*t; - y[5] += CONSTANT(0.140463346187999990)*t; - - // [17,17]: 0,6,20, - tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]; - tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]; - y[17] += tf*g[17]+tg*f[17]; - t = f[17]*g[17]; - y[0] += CONSTANT(0.282094791768999990)*t; - y[6] += CONSTANT(-0.057343920955899998)*t; - y[20] += CONSTANT(-0.159787958979000000)*t; - - // [17,19]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24]; - y[17] += tf*g[19]+tg*f[19]; - y[19] += tf*g[17]+tg*f[17]; - t = f[17]*g[19]+f[19]*g[17]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(0.119098912753000000)*t; - - // [17,21]: 16,4,18, - tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18]; - tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18]; - y[17] += tf*g[21]+tg*f[21]; - y[21] += tf*g[17]+tg*f[17]; - t = f[17]*g[21]+f[21]*g[17]; - y[16] += CONSTANT(-0.119098912754999990)*t; - y[4] += CONSTANT(-0.112621225039000000)*t; - y[18] += CONSTANT(0.045015157794399997)*t; - - // [18,18]: 6,0,20,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24]; - tg = 
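
A useful spot check of the tables follows from C_{0jk} = \delta_{jk}/(2\sqrt{\pi}): multiplying by the SH projection of the constant function 1 (coefficient 2\sqrt{\pi} in band 0, zero elsewhere) must return the other operand unchanged, up to the precision of the rounded constants. A sketch, assuming the routines are declared in namespace DirectX and using arbitrary test data:

#include <cmath>
#include <cstdio>

void TestSHProductIdentity()
{
    float one[25] = {};                                // SH projection of 1
    one[0] = 2.0f * std::sqrt(3.14159265358979f);

    float g[25], y[25];
    for (int i = 0; i < 25; ++i)
        g[i] = 0.1f * float(i + 1);                    // arbitrary coefficients

    DirectX::XMSHMultiply5(y, one, g);                 // expect y ~= g

    for (int i = 0; i < 25; ++i)
        std::printf("%2d: %f (expected %f)\n", i, y[i], g[i]);
}
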
CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24]; - y[18] += tf*g[18]+tg*f[18]; - t = f[18]*g[18]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.083698454702400005)*t; - y[24] += CONSTANT(-0.135045473384000000)*t; - - // [18,19]: 7,21,23, - tf = CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23]; - tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23]; - y[18] += tf*g[19]+tg*f[19]; - y[19] += tf*g[18]+tg*f[18]; - t = f[18]*g[19]+f[19]*g[18]; - y[7] += CONSTANT(0.090297865407399994)*t; - y[21] += CONSTANT(0.102084782359000000)*t; - y[23] += CONSTANT(-0.045015157794399997)*t; - - // [19,19]: 6,8,0,20,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22]; - y[19] += tf*g[19]+tg*f[19]; - t = f[19]*g[19]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[8] += CONSTANT(-0.141889406570999990)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - y[22] += CONSTANT(-0.102084782360000000)*t; - - // [20,20]: 6,0,20, - tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0]; - tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0]; - y[20] += tf*g[20]+tg*f[20]; - t = f[20]*g[20]; - y[6] += CONSTANT(0.163839797503000010)*t; - y[0] += CONSTANT(0.282094802232000010)*t; - y[20] += CONSTANT(0.136961139005999990)*t; - - // [21,21]: 6,20,0,8,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22]; - y[21] += tf*g[21]+tg*f[21]; - t = f[21]*g[21]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.141889406570999990)*t; - y[22] += CONSTANT(0.102084782360000000)*t; - - // [21,23]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24]; - y[21] += tf*g[23]+tg*f[23]; - y[23] += tf*g[21]+tg*f[21]; - t = f[21]*g[23]+f[23]*g[21]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(-0.119098912753000000)*t; - - // [22,22]: 6,20,0,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24]; - tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24]; - y[22] += tf*g[22]+tg*f[22]; - t = f[22]*g[22]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[20] += 
CONSTANT(-0.083698454702400005)*t;
-    y[0] += CONSTANT(0.282094791771999980)*t;
-    y[24] += CONSTANT(0.135045473384000000)*t;
-
-    // [23,23]: 6,20,0,
-    tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0];
-    tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0];
-    y[23] += tf*g[23]+tg*f[23];
-    t = f[23]*g[23];
-    y[6] += CONSTANT(-0.057343920955899998)*t;
-    y[20] += CONSTANT(-0.159787958979000000)*t;
-    y[0] += CONSTANT(0.282094791768999990)*t;
-
-    // [24,24]: 6,0,20,
-    tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20];
-    tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20];
-    y[24] += tf*g[24]+tg*f[24];
-    t = f[24]*g[24];
-    y[6] += CONSTANT(-0.229375683829000000)*t;
-    y[0] += CONSTANT(0.282094791763999990)*t;
-    y[20] += CONSTANT(0.106525305981000000)*t;
-
-    // multiply count=1135
-
-    return y;
-}
-
-
-//-------------------------------------------------------------------------------------
-// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232909.aspx
-//-------------------------------------------------------------------------------------
-float* XMSHMultiply6( _Out_writes_(36) float *y,
-                      _In_reads_(36) const float *f,
-                      _In_reads_(36) const float *g )
-{
-    if ( !y || !f || !g )
-        return nullptr;
-
-    REAL tf,tg,t;
-    // [0,0]: 0,
-    y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0];
-
-    // [1,1]: 0,6,8,
-    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8];
-    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8];
-    y[1] = tf*g[1]+tg*f[1];
-    t = f[1]*g[1];
-    y[0] += CONSTANT(0.282094791773000010)*t;
-    y[6] = CONSTANT(-0.126156626101000010)*t;
-    y[8] = CONSTANT(-0.218509686119999990)*t;
-
-    // [1,4]: 3,13,15,
-    tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15];
-    tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15];
-    y[1] += tf*g[4]+tg*f[4];
-    y[4] = tf*g[1]+tg*f[1];
-    t = f[1]*g[4]+f[4]*g[1];
-    y[3] = CONSTANT(0.218509686114999990)*t;
-    y[13] = CONSTANT(-0.058399170082300000)*t;
-    y[15] = CONSTANT(-0.226179013157999990)*t;
-
-    // [1,5]: 2,12,
-    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12];
-    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12];
-    y[1] += tf*g[5]+tg*f[5];
-    y[5] = tf*g[1]+tg*f[1];
-    t = f[1]*g[5]+f[5]*g[1];
-    y[2] = CONSTANT(0.218509686118000010)*t;
-    y[12] = CONSTANT(-0.143048168103000000)*t;
-
-    // [1,11]: 6,8,20,22,
-    tf = CONSTANT(0.202300659402999990)*f[6]+CONSTANT(0.058399170081799998)*f[8]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(-0.168583882836999990)*f[22];
-    tg = CONSTANT(0.202300659402999990)*g[6]+CONSTANT(0.058399170081799998)*g[8]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(-0.168583882836999990)*g[22];
-    y[1] += tf*g[11]+tg*f[11];
-    y[11] = tf*g[1]+tg*f[1];
-    t = f[1]*g[11]+f[11]*g[1];
-    y[6] += CONSTANT(0.202300659402999990)*t;
-    y[8] += CONSTANT(0.058399170081799998)*t;
-    y[20] = CONSTANT(-0.150786008773000000)*t;
-    y[22] = CONSTANT(-0.168583882836999990)*t;
-
-    // [1,16]: 15,33,35,
-    tf =
CONSTANT(0.230329432973999990)*f[15]+CONSTANT(-0.034723468517399998)*f[33]+CONSTANT(-0.232932108051999990)*f[35]; - tg = CONSTANT(0.230329432973999990)*g[15]+CONSTANT(-0.034723468517399998)*g[33]+CONSTANT(-0.232932108051999990)*g[35]; - y[1] += tf*g[16]+tg*f[16]; - y[16] = tf*g[1]+tg*f[1]; - t = f[1]*g[16]+f[16]*g[1]; - y[15] += CONSTANT(0.230329432973999990)*t; - y[33] = CONSTANT(-0.034723468517399998)*t; - y[35] = CONSTANT(-0.232932108051999990)*t; - - // [1,18]: 15,13,31,33, - tf = CONSTANT(0.043528171377799997)*f[15]+CONSTANT(0.168583882834000000)*f[13]+CONSTANT(-0.085054779966799998)*f[31]+CONSTANT(-0.183739324705999990)*f[33]; - tg = CONSTANT(0.043528171377799997)*g[15]+CONSTANT(0.168583882834000000)*g[13]+CONSTANT(-0.085054779966799998)*g[31]+CONSTANT(-0.183739324705999990)*g[33]; - y[1] += tf*g[18]+tg*f[18]; - y[18] = tf*g[1]+tg*f[1]; - t = f[1]*g[18]+f[18]*g[1]; - y[15] += CONSTANT(0.043528171377799997)*t; - y[13] += CONSTANT(0.168583882834000000)*t; - y[31] = CONSTANT(-0.085054779966799998)*t; - y[33] += CONSTANT(-0.183739324705999990)*t; - - // [1,19]: 14,12,30,32, - tf = CONSTANT(0.075393004386399995)*f[14]+CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(-0.159122922869999990)*f[32]; - tg = CONSTANT(0.075393004386399995)*g[14]+CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(-0.159122922869999990)*g[32]; - y[1] += tf*g[19]+tg*f[19]; - y[19] = tf*g[1]+tg*f[1]; - t = f[1]*g[19]+f[19]*g[1]; - y[14] = CONSTANT(0.075393004386399995)*t; - y[12] += CONSTANT(0.194663900273000010)*t; - y[30] = CONSTANT(-0.155288072037000010)*t; - y[32] = CONSTANT(-0.159122922869999990)*t; - - // [1,24]: 9,25,27, - tf = CONSTANT(-0.230329432978999990)*f[9]+CONSTANT(0.232932108049000000)*f[25]+CONSTANT(0.034723468517100002)*f[27]; - tg = CONSTANT(-0.230329432978999990)*g[9]+CONSTANT(0.232932108049000000)*g[25]+CONSTANT(0.034723468517100002)*g[27]; - y[1] += tf*g[24]+tg*f[24]; - y[24] = tf*g[1]+tg*f[1]; - t = f[1]*g[24]+f[24]*g[1]; - y[9] = CONSTANT(-0.230329432978999990)*t; - y[25] = CONSTANT(0.232932108049000000)*t; - y[27] = CONSTANT(0.034723468517100002)*t; - - // [1,29]: 22,20, - tf = CONSTANT(0.085054779965999999)*f[22]+CONSTANT(0.190188269815000010)*f[20]; - tg = CONSTANT(0.085054779965999999)*g[22]+CONSTANT(0.190188269815000010)*g[20]; - y[1] += tf*g[29]+tg*f[29]; - y[29] = tf*g[1]+tg*f[1]; - t = f[1]*g[29]+f[29]*g[1]; - y[22] += CONSTANT(0.085054779965999999)*t; - y[20] += CONSTANT(0.190188269815000010)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,12]: 6,20, - tf = CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20]; - tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20]; - y[2] += tf*g[12]+tg*f[12]; - y[12] += tf*g[2]+tg*f[2]; - t = f[2]*g[12]+f[12]*g[2]; - y[6] += CONSTANT(0.247766706973999990)*t; - y[20] += CONSTANT(0.246232537174000010)*t; - - // [2,20]: 30, - tf = CONSTANT(0.245532020560000010)*f[30]; - tg = CONSTANT(0.245532020560000010)*g[30]; - y[2] += tf*g[20]+tg*f[20]; - y[20] += tf*g[2]+tg*f[2]; - t = f[2]*g[20]+f[20]*g[2]; - y[30] += CONSTANT(0.245532020560000010)*t; - - // [3,3]: 0,6,8, - tf = 
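
For XMSHMultiply6 the same calling convention applies with 36-coefficient (order 6) arrays, per the SAL annotations: the pointer arguments are validated and the output pointer is returned on success. A usage sketch with illustrative names:

float f6[36] = { /* 36 SH coefficients */ };
float g6[36] = { /* 36 SH coefficients */ };
float y6[36];

// Returns y6, or nullptr if any pointer argument is null.
if (!DirectX::XMSHMultiply6(y6, f6, g6))
{
    // handle the error
}
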
CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [3,7]: 2,12, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]; - y[3] += tf*g[7]+tg*f[7]; - y[7] = tf*g[3]+tg*f[3]; - t = f[3]*g[7]+f[7]*g[3]; - y[2] += CONSTANT(0.218509686118000010)*t; - y[12] += CONSTANT(-0.143048168103000000)*t; - - // [3,13]: 8,6,20,22, - tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22]; - tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22]; - y[3] += tf*g[13]+tg*f[13]; - y[13] += tf*g[3]+tg*f[3]; - t = f[3]*g[13]+f[13]*g[3]; - y[8] += CONSTANT(-0.058399170081799998)*t; - y[6] += CONSTANT(0.202300659402999990)*t; - y[20] += CONSTANT(-0.150786008773000000)*t; - y[22] += CONSTANT(0.168583882836999990)*t; - - // [3,16]: 9,25,27, - tf = CONSTANT(0.230329432973999990)*f[9]+CONSTANT(0.232932108051999990)*f[25]+CONSTANT(-0.034723468517399998)*f[27]; - tg = CONSTANT(0.230329432973999990)*g[9]+CONSTANT(0.232932108051999990)*g[25]+CONSTANT(-0.034723468517399998)*g[27]; - y[3] += tf*g[16]+tg*f[16]; - y[16] += tf*g[3]+tg*f[3]; - t = f[3]*g[16]+f[16]*g[3]; - y[9] += CONSTANT(0.230329432973999990)*t; - y[25] += CONSTANT(0.232932108051999990)*t; - y[27] += CONSTANT(-0.034723468517399998)*t; - - // [3,21]: 12,14,30,32, - tf = CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.075393004386399995)*f[14]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(0.159122922869999990)*f[32]; - tg = CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.075393004386399995)*g[14]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(0.159122922869999990)*g[32]; - y[3] += tf*g[21]+tg*f[21]; - y[21] = tf*g[3]+tg*f[3]; - t = f[3]*g[21]+f[21]*g[3]; - y[12] += CONSTANT(0.194663900273000010)*t; - y[14] += CONSTANT(-0.075393004386399995)*t; - y[30] += CONSTANT(-0.155288072037000010)*t; - y[32] += CONSTANT(0.159122922869999990)*t; - - // [3,24]: 15,33,35, - tf = CONSTANT(0.230329432978999990)*f[15]+CONSTANT(-0.034723468517100002)*f[33]+CONSTANT(0.232932108049000000)*f[35]; - tg = CONSTANT(0.230329432978999990)*g[15]+CONSTANT(-0.034723468517100002)*g[33]+CONSTANT(0.232932108049000000)*g[35]; - y[3] += tf*g[24]+tg*f[24]; - y[24] += tf*g[3]+tg*f[3]; - t = f[3]*g[24]+f[24]*g[3]; - y[15] += CONSTANT(0.230329432978999990)*t; - y[33] += CONSTANT(-0.034723468517100002)*t; - y[35] += CONSTANT(0.232932108049000000)*t; - - // [3,31]: 20,22, - tf = CONSTANT(0.190188269815000010)*f[20]+CONSTANT(-0.085054779965999999)*f[22]; - tg = CONSTANT(0.190188269815000010)*g[20]+CONSTANT(-0.085054779965999999)*g[22]; - y[3] += tf*g[31]+tg*f[31]; - y[31] += tf*g[3]+tg*f[3]; - t = f[3]*g[31]+f[31]*g[3]; - y[20] += CONSTANT(0.190188269815000010)*t; - y[22] += CONSTANT(-0.085054779965999999)*t; - - // [4,4]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24]; - tg = 
CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(-0.238413613505999990)*t; - - // [4,5]: 7,21,23, - tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23]; - tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - y[21] += CONSTANT(-0.063718718434399996)*t; - y[23] = CONSTANT(-0.168583882835000000)*t; - - // [4,9]: 3,13,31,35, - tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13]+CONSTANT(0.016943317729299998)*f[31]+CONSTANT(-0.245532000542000000)*f[35]; - tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13]+CONSTANT(0.016943317729299998)*g[31]+CONSTANT(-0.245532000542000000)*g[35]; - y[4] += tf*g[9]+tg*f[9]; - y[9] += tf*g[4]+tg*f[4]; - t = f[4]*g[9]+f[9]*g[4]; - y[3] += CONSTANT(0.226179013157999990)*t; - y[13] += CONSTANT(-0.094031597258400004)*t; - y[31] += CONSTANT(0.016943317729299998)*t; - y[35] += CONSTANT(-0.245532000542000000)*t; - - // [4,10]: 2,12,30,34, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(-0.190188269816000010)*f[34]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(-0.190188269816000010)*g[34]; - y[4] += tf*g[10]+tg*f[10]; - y[10] = tf*g[4]+tg*f[4]; - t = f[4]*g[10]+f[10]*g[4]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - y[30] += CONSTANT(0.053579475144400000)*t; - y[34] = CONSTANT(-0.190188269816000010)*t; - - // [4,11]: 3,13,15,31,33, - tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]+CONSTANT(-0.065621187395699998)*f[31]+CONSTANT(-0.141757966610000010)*f[33]; - tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]+CONSTANT(-0.065621187395699998)*g[31]+CONSTANT(-0.141757966610000010)*g[33]; - y[4] += tf*g[11]+tg*f[11]; - y[11] += tf*g[4]+tg*f[4]; - t = f[4]*g[11]+f[11]*g[4]; - y[3] += CONSTANT(-0.058399170082300000)*t; - y[13] += CONSTANT(0.145673124078000010)*t; - y[15] += CONSTANT(0.094031597258400004)*t; - y[31] += CONSTANT(-0.065621187395699998)*t; - y[33] += CONSTANT(-0.141757966610000010)*t; - - // [4,16]: 8,22, - tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22]; - tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22]; - y[4] += tf*g[16]+tg*f[16]; - y[16] += tf*g[4]+tg*f[4]; - t = f[4]*g[16]+f[16]*g[4]; - y[8] += CONSTANT(0.238413613494000000)*t; - y[22] += CONSTANT(-0.075080816693699995)*t; - - // [4,18]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24]; - y[4] += tf*g[18]+tg*f[18]; - y[18] += tf*g[4]+tg*f[4]; - t = f[4]*g[18]+f[18]*g[4]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] 
+= CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(0.075080816691500005)*t; - - // [4,19]: 7,21,23, - tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23]; - tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23]; - y[4] += tf*g[19]+tg*f[19]; - y[19] += tf*g[4]+tg*f[4]; - t = f[4]*g[19]+f[19]*g[4]; - y[7] += CONSTANT(-0.063718718434399996)*t; - y[21] += CONSTANT(0.141889406569999990)*t; - y[23] += CONSTANT(0.112621225039000000)*t; - - // [4,25]: 15,33, - tf = CONSTANT(0.245532000542000000)*f[15]+CONSTANT(-0.062641347680800000)*f[33]; - tg = CONSTANT(0.245532000542000000)*g[15]+CONSTANT(-0.062641347680800000)*g[33]; - y[4] += tf*g[25]+tg*f[25]; - y[25] += tf*g[4]+tg*f[4]; - t = f[4]*g[25]+f[25]*g[4]; - y[15] += CONSTANT(0.245532000542000000)*t; - y[33] += CONSTANT(-0.062641347680800000)*t; - - // [4,26]: 14,32, - tf = CONSTANT(0.190188269806999990)*f[14]+CONSTANT(-0.097043558542400002)*f[32]; - tg = CONSTANT(0.190188269806999990)*g[14]+CONSTANT(-0.097043558542400002)*g[32]; - y[4] += tf*g[26]+tg*f[26]; - y[26] = tf*g[4]+tg*f[4]; - t = f[4]*g[26]+f[26]*g[4]; - y[14] += CONSTANT(0.190188269806999990)*t; - y[32] += CONSTANT(-0.097043558542400002)*t; - - // [4,27]: 13,31,35, - tf = CONSTANT(0.141757966610000010)*f[13]+CONSTANT(-0.121034582549000000)*f[31]+CONSTANT(0.062641347680800000)*f[35]; - tg = CONSTANT(0.141757966610000010)*g[13]+CONSTANT(-0.121034582549000000)*g[31]+CONSTANT(0.062641347680800000)*g[35]; - y[4] += tf*g[27]+tg*f[27]; - y[27] += tf*g[4]+tg*f[4]; - t = f[4]*g[27]+f[27]*g[4]; - y[13] += CONSTANT(0.141757966610000010)*t; - y[31] += CONSTANT(-0.121034582549000000)*t; - y[35] += CONSTANT(0.062641347680800000)*t; - - // [4,28]: 12,30,34, - tf = CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.097043558538899996)*f[34]; - tg = CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.097043558538899996)*g[34]; - y[4] += tf*g[28]+tg*f[28]; - y[28] = tf*g[4]+tg*f[4]; - t = f[4]*g[28]+f[28]*g[4]; - y[12] += CONSTANT(0.141757966609000000)*t; - y[30] += CONSTANT(-0.191372478254000000)*t; - y[34] += CONSTANT(0.097043558538899996)*t; - - // [4,29]: 13,15,31,33, - tf = CONSTANT(-0.065621187395699998)*f[13]+CONSTANT(-0.016943317729299998)*f[15]+CONSTANT(0.140070311613999990)*f[31]+CONSTANT(0.121034582549000000)*f[33]; - tg = CONSTANT(-0.065621187395699998)*g[13]+CONSTANT(-0.016943317729299998)*g[15]+CONSTANT(0.140070311613999990)*g[31]+CONSTANT(0.121034582549000000)*g[33]; - y[4] += tf*g[29]+tg*f[29]; - y[29] += tf*g[4]+tg*f[4]; - t = f[4]*g[29]+f[29]*g[4]; - y[13] += CONSTANT(-0.065621187395699998)*t; - y[15] += CONSTANT(-0.016943317729299998)*t; - y[31] += CONSTANT(0.140070311613999990)*t; - y[33] += CONSTANT(0.121034582549000000)*t; - - // [5,5]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] += 
CONSTANT(-0.180223751574000000)*t; - - // [5,10]: 3,13,15,31,33, - tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15]+CONSTANT(-0.083004965974099995)*f[31]+CONSTANT(-0.179311220383999990)*f[33]; - tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15]+CONSTANT(-0.083004965974099995)*g[31]+CONSTANT(-0.179311220383999990)*g[33]; - y[5] += tf*g[10]+tg*f[10]; - y[10] += tf*g[5]+tg*f[5]; - t = f[5]*g[10]+f[10]*g[5]; - y[3] += CONSTANT(0.184674390919999990)*t; - y[13] += CONSTANT(0.115164716490000000)*t; - y[15] += CONSTANT(-0.148677009678999990)*t; - y[31] += CONSTANT(-0.083004965974099995)*t; - y[33] += CONSTANT(-0.179311220383999990)*t; - - // [5,11]: 2,12,14,30,32, - tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(-0.173617342585000000)*f[32]; - tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(-0.173617342585000000)*g[32]; - y[5] += tf*g[11]+tg*f[11]; - y[11] += tf*g[5]+tg*f[5]; - t = f[5]*g[11]+f[11]*g[5]; - y[2] += CONSTANT(0.233596680327000010)*t; - y[12] += CONSTANT(0.059470803871800003)*t; - y[14] += CONSTANT(-0.115164716491000000)*t; - y[30] += CONSTANT(-0.169433177294000010)*t; - y[32] += CONSTANT(-0.173617342585000000)*t; - - // [5,14]: 9,1,27,29, - tf = CONSTANT(0.148677009677999990)*f[9]+CONSTANT(-0.184674390923000000)*f[1]+CONSTANT(0.179311220382000010)*f[27]+CONSTANT(0.083004965973399999)*f[29]; - tg = CONSTANT(0.148677009677999990)*g[9]+CONSTANT(-0.184674390923000000)*g[1]+CONSTANT(0.179311220382000010)*g[27]+CONSTANT(0.083004965973399999)*g[29]; - y[5] += tf*g[14]+tg*f[14]; - y[14] += tf*g[5]+tg*f[5]; - t = f[5]*g[14]+f[14]*g[5]; - y[9] += CONSTANT(0.148677009677999990)*t; - y[1] += CONSTANT(-0.184674390923000000)*t; - y[27] += CONSTANT(0.179311220382000010)*t; - y[29] += CONSTANT(0.083004965973399999)*t; - - // [5,17]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24]; - y[5] += tf*g[17]+tg*f[17]; - y[17] = tf*g[5]+tg*f[5]; - t = f[5]*g[17]+f[17]*g[5]; - y[8] += CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(-0.140463346189000000)*t; - - // [5,18]: 7,21,23, - tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23]; - tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23]; - y[5] += tf*g[18]+tg*f[18]; - y[18] += tf*g[5]+tg*f[5]; - t = f[5]*g[18]+f[18]*g[5]; - y[7] += CONSTANT(0.180223751571000010)*t; - y[21] += CONSTANT(0.090297865407399994)*t; - y[23] += CONSTANT(-0.132725386549000010)*t; - - // [5,19]: 6,8,20,22, - tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22]; - tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22]; - y[5] += tf*g[19]+tg*f[19]; - y[19] += tf*g[5]+tg*f[5]; - t = f[5]*g[19]+f[19]*g[5]; - y[6] += CONSTANT(0.220728115440999990)*t; - y[8] += 
CONSTANT(0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[22] += CONSTANT(-0.090297865408399999)*t; - - // [5,26]: 15,33,35, - tf = CONSTANT(0.155288072035000000)*f[15]+CONSTANT(0.138662534056999990)*f[33]+CONSTANT(-0.132882365179999990)*f[35]; - tg = CONSTANT(0.155288072035000000)*g[15]+CONSTANT(0.138662534056999990)*g[33]+CONSTANT(-0.132882365179999990)*g[35]; - y[5] += tf*g[26]+tg*f[26]; - y[26] += tf*g[5]+tg*f[5]; - t = f[5]*g[26]+f[26]*g[5]; - y[15] += CONSTANT(0.155288072035000000)*t; - y[33] += CONSTANT(0.138662534056999990)*t; - y[35] += CONSTANT(-0.132882365179999990)*t; - - // [5,28]: 15,13,31,33, - tf = CONSTANT(0.044827805096399997)*f[15]+CONSTANT(0.173617342584000000)*f[13]+CONSTANT(0.074118242118699995)*f[31]+CONSTANT(-0.114366930522000000)*f[33]; - tg = CONSTANT(0.044827805096399997)*g[15]+CONSTANT(0.173617342584000000)*g[13]+CONSTANT(0.074118242118699995)*g[31]+CONSTANT(-0.114366930522000000)*g[33]; - y[5] += tf*g[28]+tg*f[28]; - y[28] += tf*g[5]+tg*f[5]; - t = f[5]*g[28]+f[28]*g[5]; - y[15] += CONSTANT(0.044827805096399997)*t; - y[13] += CONSTANT(0.173617342584000000)*t; - y[31] += CONSTANT(0.074118242118699995)*t; - y[33] += CONSTANT(-0.114366930522000000)*t; - - // [5,29]: 12,30,32, - tf = CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.036165998945399999)*f[30]+CONSTANT(-0.074118242119099995)*f[32]; - tg = CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.036165998945399999)*g[30]+CONSTANT(-0.074118242119099995)*g[32]; - y[5] += tf*g[29]+tg*f[29]; - y[29] += tf*g[5]+tg*f[5]; - t = f[5]*g[29]+f[29]*g[5]; - y[12] += CONSTANT(0.214317900578999990)*t; - y[30] += CONSTANT(0.036165998945399999)*t; - y[32] += CONSTANT(-0.074118242119099995)*t; - - // [5,32]: 9,27, - tf = CONSTANT(-0.044827805096799997)*f[9]+CONSTANT(0.114366930522000000)*f[27]; - tg = CONSTANT(-0.044827805096799997)*g[9]+CONSTANT(0.114366930522000000)*g[27]; - y[5] += tf*g[32]+tg*f[32]; - y[32] += tf*g[5]+tg*f[5]; - t = f[5]*g[32]+f[32]*g[5]; - y[9] += CONSTANT(-0.044827805096799997)*t; - y[27] += CONSTANT(0.114366930522000000)*t; - - // [5,34]: 9,27,25, - tf = CONSTANT(-0.155288072036000010)*f[9]+CONSTANT(-0.138662534059000000)*f[27]+CONSTANT(0.132882365179000010)*f[25]; - tg = CONSTANT(-0.155288072036000010)*g[9]+CONSTANT(-0.138662534059000000)*g[27]+CONSTANT(0.132882365179000010)*g[25]; - y[5] += tf*g[34]+tg*f[34]; - y[34] += tf*g[5]+tg*f[5]; - t = f[5]*g[34]+f[34]*g[5]; - y[9] += CONSTANT(-0.155288072036000010)*t; - y[27] += CONSTANT(-0.138662534059000000)*t; - y[25] += CONSTANT(0.132882365179000010)*t; - - // [6,6]: 0,6,20, - tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20]; - tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - y[20] += CONSTANT(0.241795553185999990)*t; - - // [7,7]: 6,0,8,20,22, - tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22]; - tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22]; - y[7] += tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[6] += CONSTANT(0.090111875786499998)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] 
+= CONSTANT(0.180223751574000000)*t; - - // [7,10]: 9,1,11,27,29, - tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11]+CONSTANT(0.179311220383999990)*f[27]+CONSTANT(-0.083004965974099995)*f[29]; - tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11]+CONSTANT(0.179311220383999990)*g[27]+CONSTANT(-0.083004965974099995)*g[29]; - y[7] += tf*g[10]+tg*f[10]; - y[10] += tf*g[7]+tg*f[7]; - t = f[7]*g[10]+f[10]*g[7]; - y[9] += CONSTANT(0.148677009678999990)*t; - y[1] += CONSTANT(0.184674390919999990)*t; - y[11] += CONSTANT(0.115164716490000000)*t; - y[27] += CONSTANT(0.179311220383999990)*t; - y[29] += CONSTANT(-0.083004965974099995)*t; - - // [7,13]: 12,2,14,30,32, - tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(0.173617342585000000)*f[32]; - tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(0.173617342585000000)*g[32]; - y[7] += tf*g[13]+tg*f[13]; - y[13] += tf*g[7]+tg*f[7]; - t = f[7]*g[13]+f[13]*g[7]; - y[12] += CONSTANT(0.059470803871800003)*t; - y[2] += CONSTANT(0.233596680327000010)*t; - y[14] += CONSTANT(0.115164716491000000)*t; - y[30] += CONSTANT(-0.169433177294000010)*t; - y[32] += CONSTANT(0.173617342585000000)*t; - - // [7,14]: 3,15,31,33, - tf = CONSTANT(0.184674390923000000)*f[3]+CONSTANT(0.148677009677999990)*f[15]+CONSTANT(-0.083004965973399999)*f[31]+CONSTANT(0.179311220382000010)*f[33]; - tg = CONSTANT(0.184674390923000000)*g[3]+CONSTANT(0.148677009677999990)*g[15]+CONSTANT(-0.083004965973399999)*g[31]+CONSTANT(0.179311220382000010)*g[33]; - y[7] += tf*g[14]+tg*f[14]; - y[14] += tf*g[7]+tg*f[7]; - t = f[7]*g[14]+f[14]*g[7]; - y[3] += CONSTANT(0.184674390923000000)*t; - y[15] += CONSTANT(0.148677009677999990)*t; - y[31] += CONSTANT(-0.083004965973399999)*t; - y[33] += CONSTANT(0.179311220382000010)*t; - - // [7,17]: 16,4,18, - tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18]; - tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18]; - y[7] += tf*g[17]+tg*f[17]; - y[17] += tf*g[7]+tg*f[7]; - t = f[7]*g[17]+f[17]*g[7]; - y[16] += CONSTANT(0.140463346187999990)*t; - y[4] += CONSTANT(0.168583882835000000)*t; - y[18] += CONSTANT(0.132725386549000010)*t; - - // [7,21]: 8,20,6,22, - tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22]; - tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22]; - y[7] += tf*g[21]+tg*f[21]; - y[21] += tf*g[7]+tg*f[7]; - t = f[7]*g[21]+f[21]*g[7]; - y[8] += CONSTANT(-0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[6] += CONSTANT(0.220728115440999990)*t; - y[22] += CONSTANT(0.090297865408399999)*t; - - // [7,23]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24]; - y[7] += tf*g[23]+tg*f[23]; - y[23] += tf*g[7]+tg*f[7]; - t = f[7]*g[23]+f[23]*g[7]; - y[8] += 
CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(0.140463346189000000)*t; - - // [7,26]: 9,25,27, - tf = CONSTANT(0.155288072035000000)*f[9]+CONSTANT(0.132882365179999990)*f[25]+CONSTANT(0.138662534056999990)*f[27]; - tg = CONSTANT(0.155288072035000000)*g[9]+CONSTANT(0.132882365179999990)*g[25]+CONSTANT(0.138662534056999990)*g[27]; - y[7] += tf*g[26]+tg*f[26]; - y[26] += tf*g[7]+tg*f[7]; - t = f[7]*g[26]+f[26]*g[7]; - y[9] += CONSTANT(0.155288072035000000)*t; - y[25] += CONSTANT(0.132882365179999990)*t; - y[27] += CONSTANT(0.138662534056999990)*t; - - // [7,28]: 27,11,9,29, - tf = CONSTANT(0.114366930522000000)*f[27]+CONSTANT(0.173617342584000000)*f[11]+CONSTANT(-0.044827805096399997)*f[9]+CONSTANT(0.074118242118699995)*f[29]; - tg = CONSTANT(0.114366930522000000)*g[27]+CONSTANT(0.173617342584000000)*g[11]+CONSTANT(-0.044827805096399997)*g[9]+CONSTANT(0.074118242118699995)*g[29]; - y[7] += tf*g[28]+tg*f[28]; - y[28] += tf*g[7]+tg*f[7]; - t = f[7]*g[28]+f[28]*g[7]; - y[27] += CONSTANT(0.114366930522000000)*t; - y[11] += CONSTANT(0.173617342584000000)*t; - y[9] += CONSTANT(-0.044827805096399997)*t; - y[29] += CONSTANT(0.074118242118699995)*t; - - // [7,31]: 30,12,32, - tf = CONSTANT(0.036165998945399999)*f[30]+CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.074118242119099995)*f[32]; - tg = CONSTANT(0.036165998945399999)*g[30]+CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.074118242119099995)*g[32]; - y[7] += tf*g[31]+tg*f[31]; - y[31] += tf*g[7]+tg*f[7]; - t = f[7]*g[31]+f[31]*g[7]; - y[30] += CONSTANT(0.036165998945399999)*t; - y[12] += CONSTANT(0.214317900578999990)*t; - y[32] += CONSTANT(0.074118242119099995)*t; - - // [7,32]: 15,33, - tf = CONSTANT(-0.044827805096799997)*f[15]+CONSTANT(0.114366930522000000)*f[33]; - tg = CONSTANT(-0.044827805096799997)*g[15]+CONSTANT(0.114366930522000000)*g[33]; - y[7] += tf*g[32]+tg*f[32]; - y[32] += tf*g[7]+tg*f[7]; - t = f[7]*g[32]+f[32]*g[7]; - y[15] += CONSTANT(-0.044827805096799997)*t; - y[33] += CONSTANT(0.114366930522000000)*t; - - // [7,34]: 15,33,35, - tf = CONSTANT(0.155288072036000010)*f[15]+CONSTANT(0.138662534059000000)*f[33]+CONSTANT(0.132882365179000010)*f[35]; - tg = CONSTANT(0.155288072036000010)*g[15]+CONSTANT(0.138662534059000000)*g[33]+CONSTANT(0.132882365179000010)*g[35]; - y[7] += tf*g[34]+tg*f[34]; - y[34] += tf*g[7]+tg*f[7]; - t = f[7]*g[34]+f[34]*g[7]; - y[15] += CONSTANT(0.155288072036000010)*t; - y[33] += CONSTANT(0.138662534059000000)*t; - y[35] += CONSTANT(0.132882365179000010)*t; - - // [8,8]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(0.238413613505999990)*t; - - // [8,9]: 1,11,25,29, - tf = CONSTANT(0.226179013155000000)*f[1]+CONSTANT(-0.094031597259499999)*f[11]+CONSTANT(0.245532000541000000)*f[25]+CONSTANT(0.016943317729199998)*f[29]; - tg = CONSTANT(0.226179013155000000)*g[1]+CONSTANT(-0.094031597259499999)*g[11]+CONSTANT(0.245532000541000000)*g[25]+CONSTANT(0.016943317729199998)*g[29]; - y[8] += tf*g[9]+tg*f[9]; - y[9] += tf*g[8]+tg*f[8]; - t = f[8]*g[9]+f[9]*g[8]; - y[1] += 
CONSTANT(0.226179013155000000)*t; - y[11] += CONSTANT(-0.094031597259499999)*t; - y[25] += CONSTANT(0.245532000541000000)*t; - y[29] += CONSTANT(0.016943317729199998)*t; - - // [8,14]: 2,12,30,34, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(0.190188269816000010)*f[34]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(0.190188269816000010)*g[34]; - y[8] += tf*g[14]+tg*f[14]; - y[14] += tf*g[8]+tg*f[8]; - t = f[8]*g[14]+f[14]*g[8]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - y[30] += CONSTANT(0.053579475144400000)*t; - y[34] += CONSTANT(0.190188269816000010)*t; - - // [8,15]: 13,3,31,35, - tf = CONSTANT(-0.094031597259499999)*f[13]+CONSTANT(0.226179013155000000)*f[3]+CONSTANT(0.016943317729199998)*f[31]+CONSTANT(0.245532000541000000)*f[35]; - tg = CONSTANT(-0.094031597259499999)*g[13]+CONSTANT(0.226179013155000000)*g[3]+CONSTANT(0.016943317729199998)*g[31]+CONSTANT(0.245532000541000000)*g[35]; - y[8] += tf*g[15]+tg*f[15]; - y[15] += tf*g[8]+tg*f[8]; - t = f[8]*g[15]+f[15]*g[8]; - y[13] += CONSTANT(-0.094031597259499999)*t; - y[3] += CONSTANT(0.226179013155000000)*t; - y[31] += CONSTANT(0.016943317729199998)*t; - y[35] += CONSTANT(0.245532000541000000)*t; - - // [8,22]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24]; - y[8] += tf*g[22]+tg*f[22]; - y[22] += tf*g[8]+tg*f[8]; - t = f[8]*g[22]+f[22]*g[8]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] += CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(-0.075080816691500005)*t; - - // [8,26]: 10,28, - tf = CONSTANT(0.190188269806999990)*f[10]+CONSTANT(-0.097043558542400002)*f[28]; - tg = CONSTANT(0.190188269806999990)*g[10]+CONSTANT(-0.097043558542400002)*g[28]; - y[8] += tf*g[26]+tg*f[26]; - y[26] += tf*g[8]+tg*f[8]; - t = f[8]*g[26]+f[26]*g[8]; - y[10] += CONSTANT(0.190188269806999990)*t; - y[28] += CONSTANT(-0.097043558542400002)*t; - - // [8,27]: 25,11,29, - tf = CONSTANT(-0.062641347680800000)*f[25]+CONSTANT(0.141757966609000000)*f[11]+CONSTANT(-0.121034582550000010)*f[29]; - tg = CONSTANT(-0.062641347680800000)*g[25]+CONSTANT(0.141757966609000000)*g[11]+CONSTANT(-0.121034582550000010)*g[29]; - y[8] += tf*g[27]+tg*f[27]; - y[27] += tf*g[8]+tg*f[8]; - t = f[8]*g[27]+f[27]*g[8]; - y[25] += CONSTANT(-0.062641347680800000)*t; - y[11] += CONSTANT(0.141757966609000000)*t; - y[29] += CONSTANT(-0.121034582550000010)*t; - - // [8,32]: 30,12,34, - tf = CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.097043558538899996)*f[34]; - tg = CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.097043558538899996)*g[34]; - y[8] += tf*g[32]+tg*f[32]; - y[32] += tf*g[8]+tg*f[8]; - t = f[8]*g[32]+f[32]*g[8]; - y[30] += CONSTANT(-0.191372478254000000)*t; - y[12] += CONSTANT(0.141757966609000000)*t; - y[34] += CONSTANT(-0.097043558538899996)*t; - - // [8,33]: 13,31,35, - tf = CONSTANT(0.141757966609000000)*f[13]+CONSTANT(-0.121034582550000010)*f[31]+CONSTANT(-0.062641347680800000)*f[35]; - tg = CONSTANT(0.141757966609000000)*g[13]+CONSTANT(-0.121034582550000010)*g[31]+CONSTANT(-0.062641347680800000)*g[35]; - y[8] += tf*g[33]+tg*f[33]; - y[33] += tf*g[8]+tg*f[8]; - t = 
f[8]*g[33]+f[33]*g[8]; - y[13] += CONSTANT(0.141757966609000000)*t; - y[31] += CONSTANT(-0.121034582550000010)*t; - y[35] += CONSTANT(-0.062641347680800000)*t; - - // [9,9]: 6,0,20, - tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20]; - tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20]; - y[9] += tf*g[9]+tg*f[9]; - t = f[9]*g[9]; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[0] += CONSTANT(0.282094791766999970)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [9,17]: 2,12,30, - tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.098140130728100003)*f[30]; - tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.098140130728100003)*g[30]; - y[9] += tf*g[17]+tg*f[17]; - y[17] += tf*g[9]+tg*f[9]; - t = f[9]*g[17]+f[17]*g[9]; - y[2] += CONSTANT(0.162867503964999990)*t; - y[12] += CONSTANT(-0.203550726872999990)*t; - y[30] += CONSTANT(0.098140130728100003)*t; - - // [9,18]: 3,13,31,35, - tf = CONSTANT(-0.043528171377799997)*f[3]+CONSTANT(0.133255230519000010)*f[13]+CONSTANT(-0.101584686310000010)*f[31]+CONSTANT(0.098140130731999994)*f[35]; - tg = CONSTANT(-0.043528171377799997)*g[3]+CONSTANT(0.133255230519000010)*g[13]+CONSTANT(-0.101584686310000010)*g[31]+CONSTANT(0.098140130731999994)*g[35]; - y[9] += tf*g[18]+tg*f[18]; - y[18] += tf*g[9]+tg*f[9]; - t = f[9]*g[18]+f[18]*g[9]; - y[3] += CONSTANT(-0.043528171377799997)*t; - y[13] += CONSTANT(0.133255230519000010)*t; - y[31] += CONSTANT(-0.101584686310000010)*t; - y[35] += CONSTANT(0.098140130731999994)*t; - - // [9,19]: 14,32,34, - tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(0.131668802180999990)*f[34]; - tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(0.131668802180999990)*g[34]; - y[9] += tf*g[19]+tg*f[19]; - y[19] += tf*g[9]+tg*f[9]; - t = f[9]*g[19]+f[19]*g[9]; - y[14] += CONSTANT(-0.099322584600699995)*t; - y[32] += CONSTANT(0.126698363970000010)*t; - y[34] += CONSTANT(0.131668802180999990)*t; - - // [9,22]: 1,11,25,29, - tf = CONSTANT(-0.043528171378199997)*f[1]+CONSTANT(0.133255230518000010)*f[11]+CONSTANT(-0.098140130732499997)*f[25]+CONSTANT(-0.101584686311000000)*f[29]; - tg = CONSTANT(-0.043528171378199997)*g[1]+CONSTANT(0.133255230518000010)*g[11]+CONSTANT(-0.098140130732499997)*g[25]+CONSTANT(-0.101584686311000000)*g[29]; - y[9] += tf*g[22]+tg*f[22]; - y[22] += tf*g[9]+tg*f[9]; - t = f[9]*g[22]+f[22]*g[9]; - y[1] += CONSTANT(-0.043528171378199997)*t; - y[11] += CONSTANT(0.133255230518000010)*t; - y[25] += CONSTANT(-0.098140130732499997)*t; - y[29] += CONSTANT(-0.101584686311000000)*t; - - // [9,27]: 6,20, - tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20]; - tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20]; - y[9] += tf*g[27]+tg*f[27]; - y[27] += tf*g[9]+tg*f[9]; - t = f[9]*g[27]+f[27]*g[9]; - y[6] += CONSTANT(0.126792179874999990)*t; - y[20] += CONSTANT(-0.196280261464999990)*t; - - // [10,10]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24]; - y[10] += tf*g[10]+tg*f[10]; - t = f[10]*g[10]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - 
y[24] += CONSTANT(-0.151717754049000010)*t; - - // [10,16]: 14,32, - tf = CONSTANT(0.151717754044999990)*f[14]+CONSTANT(-0.077413979111300005)*f[32]; - tg = CONSTANT(0.151717754044999990)*g[14]+CONSTANT(-0.077413979111300005)*g[32]; - y[10] += tf*g[16]+tg*f[16]; - y[16] += tf*g[10]+tg*f[10]; - t = f[10]*g[16]+f[16]*g[10]; - y[14] += CONSTANT(0.151717754044999990)*t; - y[32] += CONSTANT(-0.077413979111300005)*t; - - // [10,17]: 13,3,31,35, - tf = CONSTANT(0.067850242288900006)*f[13]+CONSTANT(0.199471140200000010)*f[3]+CONSTANT(-0.113793659091000000)*f[31]+CONSTANT(-0.149911525925999990)*f[35]; - tg = CONSTANT(0.067850242288900006)*g[13]+CONSTANT(0.199471140200000010)*g[3]+CONSTANT(-0.113793659091000000)*g[31]+CONSTANT(-0.149911525925999990)*g[35]; - y[10] += tf*g[17]+tg*f[17]; - y[17] += tf*g[10]+tg*f[10]; - t = f[10]*g[17]+f[17]*g[10]; - y[13] += CONSTANT(0.067850242288900006)*t; - y[3] += CONSTANT(0.199471140200000010)*t; - y[31] += CONSTANT(-0.113793659091000000)*t; - y[35] += CONSTANT(-0.149911525925999990)*t; - - // [10,18]: 12,2,30,34, - tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(-0.101358691177000000)*f[34]; - tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(-0.101358691177000000)*g[34]; - y[10] += tf*g[18]+tg*f[18]; - y[18] += tf*g[10]+tg*f[10]; - t = f[10]*g[18]+f[18]*g[10]; - y[12] += CONSTANT(-0.044418410173299998)*t; - y[2] += CONSTANT(0.213243618621000000)*t; - y[30] += CONSTANT(-0.171327458205000000)*t; - y[34] += CONSTANT(-0.101358691177000000)*t; - - // [10,19]: 3,15,13,31,33, - tf = CONSTANT(-0.075393004386799994)*f[3]+CONSTANT(0.099322584599600000)*f[15]+CONSTANT(0.102579924281000000)*f[13]+CONSTANT(0.097749909976500002)*f[31]+CONSTANT(-0.025339672794100002)*f[33]; - tg = CONSTANT(-0.075393004386799994)*g[3]+CONSTANT(0.099322584599600000)*g[15]+CONSTANT(0.102579924281000000)*g[13]+CONSTANT(0.097749909976500002)*g[31]+CONSTANT(-0.025339672794100002)*g[33]; - y[10] += tf*g[19]+tg*f[19]; - y[19] += tf*g[10]+tg*f[10]; - t = f[10]*g[19]+f[19]*g[10]; - y[3] += CONSTANT(-0.075393004386799994)*t; - y[15] += CONSTANT(0.099322584599600000)*t; - y[13] += CONSTANT(0.102579924281000000)*t; - y[31] += CONSTANT(0.097749909976500002)*t; - y[33] += CONSTANT(-0.025339672794100002)*t; - - // [10,21]: 11,1,9,27,29, - tf = CONSTANT(0.102579924281000000)*f[11]+CONSTANT(-0.075393004386799994)*f[1]+CONSTANT(-0.099322584599600000)*f[9]+CONSTANT(0.025339672794100002)*f[27]+CONSTANT(0.097749909976500002)*f[29]; - tg = CONSTANT(0.102579924281000000)*g[11]+CONSTANT(-0.075393004386799994)*g[1]+CONSTANT(-0.099322584599600000)*g[9]+CONSTANT(0.025339672794100002)*g[27]+CONSTANT(0.097749909976500002)*g[29]; - y[10] += tf*g[21]+tg*f[21]; - y[21] += tf*g[10]+tg*f[10]; - t = f[10]*g[21]+f[21]*g[10]; - y[11] += CONSTANT(0.102579924281000000)*t; - y[1] += CONSTANT(-0.075393004386799994)*t; - y[9] += CONSTANT(-0.099322584599600000)*t; - y[27] += CONSTANT(0.025339672794100002)*t; - y[29] += CONSTANT(0.097749909976500002)*t; - - // [10,23]: 11,1,25,29, - tf = CONSTANT(-0.067850242288900006)*f[11]+CONSTANT(-0.199471140200000010)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(0.113793659091000000)*f[29]; - tg = CONSTANT(-0.067850242288900006)*g[11]+CONSTANT(-0.199471140200000010)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(0.113793659091000000)*g[29]; - y[10] += tf*g[23]+tg*f[23]; - y[23] += tf*g[10]+tg*f[10]; - t = f[10]*g[23]+f[23]*g[10]; - y[11] 
+= CONSTANT(-0.067850242288900006)*t; - y[1] += CONSTANT(-0.199471140200000010)*t; - y[25] += CONSTANT(0.149911525925999990)*t; - y[29] += CONSTANT(0.113793659091000000)*t; - - // [10,28]: 6,20,24, - tf = CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.077413979109600004)*f[24]; - tg = CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.077413979109600004)*g[24]; - y[10] += tf*g[28]+tg*f[28]; - y[28] += tf*g[10]+tg*f[10]; - t = f[10]*g[28]+f[28]*g[10]; - y[6] += CONSTANT(0.190188269814000000)*t; - y[20] += CONSTANT(-0.065426753820500005)*t; - y[24] += CONSTANT(0.077413979109600004)*t; - - // [11,11]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22]; - y[11] += tf*g[11]+tg*f[11]; - t = f[11]*g[11]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[8] += CONSTANT(-0.145673124078999990)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(-0.114687841910000000)*t; - - // [11,16]: 15,33,35, - tf = CONSTANT(-0.117520066953000000)*f[15]+CONSTANT(0.119929220739999990)*f[33]+CONSTANT(0.134084945035999990)*f[35]; - tg = CONSTANT(-0.117520066953000000)*g[15]+CONSTANT(0.119929220739999990)*g[33]+CONSTANT(0.134084945035999990)*g[35]; - y[11] += tf*g[16]+tg*f[16]; - y[16] += tf*g[11]+tg*f[11]; - t = f[11]*g[16]+f[16]*g[11]; - y[15] += CONSTANT(-0.117520066953000000)*t; - y[33] += CONSTANT(0.119929220739999990)*t; - y[35] += CONSTANT(0.134084945035999990)*t; - - // [11,18]: 3,13,15,31,33, - tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15]+CONSTANT(0.075189952564900006)*f[31]+CONSTANT(-0.101990215611000000)*f[33]; - tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15]+CONSTANT(0.075189952564900006)*g[31]+CONSTANT(-0.101990215611000000)*g[33]; - y[11] += tf*g[18]+tg*f[18]; - y[18] += tf*g[11]+tg*f[11]; - t = f[11]*g[18]+f[18]*g[11]; - y[3] += CONSTANT(0.168583882834000000)*t; - y[13] += CONSTANT(0.114687841909000000)*t; - y[15] += CONSTANT(-0.133255230519000010)*t; - y[31] += CONSTANT(0.075189952564900006)*t; - y[33] += CONSTANT(-0.101990215611000000)*t; - - // [11,19]: 2,14,12,30,32, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(-0.104682806112000000)*f[32]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(-0.104682806112000000)*g[32]; - y[11] += tf*g[19]+tg*f[19]; - y[19] += tf*g[11]+tg*f[11]; - t = f[11]*g[19]+f[19]*g[11]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[14] += CONSTANT(-0.102579924282000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - y[30] += CONSTANT(0.009577496073830001)*t; - y[32] += CONSTANT(-0.104682806112000000)*t; - - // [11,24]: 9,25,27, - tf = CONSTANT(0.117520066950999990)*f[9]+CONSTANT(-0.134084945037000000)*f[25]+CONSTANT(-0.119929220742000010)*f[27]; - tg = 
CONSTANT(0.117520066950999990)*g[9]+CONSTANT(-0.134084945037000000)*g[25]+CONSTANT(-0.119929220742000010)*g[27]; - y[11] += tf*g[24]+tg*f[24]; - y[24] += tf*g[11]+tg*f[11]; - t = f[11]*g[24]+f[24]*g[11]; - y[9] += CONSTANT(0.117520066950999990)*t; - y[25] += CONSTANT(-0.134084945037000000)*t; - y[27] += CONSTANT(-0.119929220742000010)*t; - - // [11,29]: 6,20,22,8, - tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.075189952565200002)*f[22]+CONSTANT(0.065621187395299999)*f[8]; - tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.075189952565200002)*g[22]+CONSTANT(0.065621187395299999)*g[8]; - y[11] += tf*g[29]+tg*f[29]; - y[29] += tf*g[11]+tg*f[11]; - t = f[11]*g[29]+f[29]*g[11]; - y[6] += CONSTANT(0.227318461243000010)*t; - y[20] += CONSTANT(0.086019920779800002)*t; - y[22] += CONSTANT(-0.075189952565200002)*t; - y[8] += CONSTANT(0.065621187395299999)*t; - - // [12,12]: 0,6,20, - tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20]; - tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20]; - y[12] += tf*g[12]+tg*f[12]; - t = f[12]*g[12]; - y[0] += CONSTANT(0.282094799871999980)*t; - y[6] += CONSTANT(0.168208852954000010)*t; - y[20] += CONSTANT(0.153869910786000010)*t; - - // [12,30]: 20,6, - tf = CONSTANT(0.148373961712999990)*f[20]+CONSTANT(0.239614719999000000)*f[6]; - tg = CONSTANT(0.148373961712999990)*g[20]+CONSTANT(0.239614719999000000)*g[6]; - y[12] += tf*g[30]+tg*f[30]; - y[30] += tf*g[12]+tg*f[12]; - t = f[12]*g[30]+f[30]*g[12]; - y[20] += CONSTANT(0.148373961712999990)*t; - y[6] += CONSTANT(0.239614719999000000)*t; - - // [13,13]: 0,8,6,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22]; - y[13] += tf*g[13]+tg*f[13]; - t = f[13]*g[13]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.145673124078999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(0.114687841910000000)*t; - - // [13,16]: 9,25,27, - tf = CONSTANT(-0.117520066953000000)*f[9]+CONSTANT(-0.134084945035999990)*f[25]+CONSTANT(0.119929220739999990)*f[27]; - tg = CONSTANT(-0.117520066953000000)*g[9]+CONSTANT(-0.134084945035999990)*g[25]+CONSTANT(0.119929220739999990)*g[27]; - y[13] += tf*g[16]+tg*f[16]; - y[16] += tf*g[13]+tg*f[13]; - t = f[13]*g[16]+f[16]*g[13]; - y[9] += CONSTANT(-0.117520066953000000)*t; - y[25] += CONSTANT(-0.134084945035999990)*t; - y[27] += CONSTANT(0.119929220739999990)*t; - - // [13,21]: 2,12,14,30,32, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(0.104682806112000000)*f[32]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(0.104682806112000000)*g[32]; - y[13] += tf*g[21]+tg*f[21]; - y[21] += tf*g[13]+tg*f[13]; - t = f[13]*g[21]+f[21]*g[13]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - y[14] += 
CONSTANT(0.102579924282000000)*t; - y[30] += CONSTANT(0.009577496073830001)*t; - y[32] += CONSTANT(0.104682806112000000)*t; - - // [13,24]: 15,33,35, - tf = CONSTANT(-0.117520066950999990)*f[15]+CONSTANT(0.119929220742000010)*f[33]+CONSTANT(-0.134084945037000000)*f[35]; - tg = CONSTANT(-0.117520066950999990)*g[15]+CONSTANT(0.119929220742000010)*g[33]+CONSTANT(-0.134084945037000000)*g[35]; - y[13] += tf*g[24]+tg*f[24]; - y[24] += tf*g[13]+tg*f[13]; - t = f[13]*g[24]+f[24]*g[13]; - y[15] += CONSTANT(-0.117520066950999990)*t; - y[33] += CONSTANT(0.119929220742000010)*t; - y[35] += CONSTANT(-0.134084945037000000)*t; - - // [13,31]: 6,22,20,8, - tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.075189952565200002)*f[22]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.065621187395299999)*f[8]; - tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.075189952565200002)*g[22]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.065621187395299999)*g[8]; - y[13] += tf*g[31]+tg*f[31]; - y[31] += tf*g[13]+tg*f[13]; - t = f[13]*g[31]+f[31]*g[13]; - y[6] += CONSTANT(0.227318461243000010)*t; - y[22] += CONSTANT(0.075189952565200002)*t; - y[20] += CONSTANT(0.086019920779800002)*t; - y[8] += CONSTANT(-0.065621187395299999)*t; - - // [14,14]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24]; - y[14] += tf*g[14]+tg*f[14]; - t = f[14]*g[14]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - y[24] += CONSTANT(0.151717754049000010)*t; - - // [14,17]: 11,1,25,29, - tf = CONSTANT(0.067850242288500007)*f[11]+CONSTANT(0.199471140196999990)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(-0.113793659092000000)*f[29]; - tg = CONSTANT(0.067850242288500007)*g[11]+CONSTANT(0.199471140196999990)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(-0.113793659092000000)*g[29]; - y[14] += tf*g[17]+tg*f[17]; - y[17] += tf*g[14]+tg*f[14]; - t = f[14]*g[17]+f[17]*g[14]; - y[11] += CONSTANT(0.067850242288500007)*t; - y[1] += CONSTANT(0.199471140196999990)*t; - y[25] += CONSTANT(0.149911525925999990)*t; - y[29] += CONSTANT(-0.113793659092000000)*t; - - // [14,22]: 12,2,30,34, - tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(0.101358691177000000)*f[34]; - tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(0.101358691177000000)*g[34]; - y[14] += tf*g[22]+tg*f[22]; - y[22] += tf*g[14]+tg*f[14]; - t = f[14]*g[22]+f[22]*g[14]; - y[12] += CONSTANT(-0.044418410173299998)*t; - y[2] += CONSTANT(0.213243618621000000)*t; - y[30] += CONSTANT(-0.171327458205000000)*t; - y[34] += CONSTANT(0.101358691177000000)*t; - - // [14,23]: 13,3,31,35, - tf = CONSTANT(0.067850242288500007)*f[13]+CONSTANT(0.199471140196999990)*f[3]+CONSTANT(-0.113793659092000000)*f[31]+CONSTANT(0.149911525925999990)*f[35]; - tg = CONSTANT(0.067850242288500007)*g[13]+CONSTANT(0.199471140196999990)*g[3]+CONSTANT(-0.113793659092000000)*g[31]+CONSTANT(0.149911525925999990)*g[35]; - y[14] += tf*g[23]+tg*f[23]; - y[23] += tf*g[14]+tg*f[14]; - t = f[14]*g[23]+f[23]*g[14]; - y[13] += CONSTANT(0.067850242288500007)*t; - y[3] += CONSTANT(0.199471140196999990)*t; - y[31] += CONSTANT(-0.113793659092000000)*t; - y[35] += CONSTANT(0.149911525925999990)*t; - - // [14,32]: 20,6,24, - tf = 
CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.077413979109600004)*f[24]; - tg = CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.077413979109600004)*g[24]; - y[14] += tf*g[32]+tg*f[32]; - y[32] += tf*g[14]+tg*f[14]; - t = f[14]*g[32]+f[32]*g[14]; - y[20] += CONSTANT(-0.065426753820500005)*t; - y[6] += CONSTANT(0.190188269814000000)*t; - y[24] += CONSTANT(-0.077413979109600004)*t; - - // [15,15]: 0,6,20, - tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20]; - tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20]; - y[15] += tf*g[15]+tg*f[15]; - t = f[15]*g[15]; - y[0] += CONSTANT(0.282094791766999970)*t; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [15,21]: 14,32,34, - tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(-0.131668802180999990)*f[34]; - tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(-0.131668802180999990)*g[34]; - y[15] += tf*g[21]+tg*f[21]; - y[21] += tf*g[15]+tg*f[15]; - t = f[15]*g[21]+f[21]*g[15]; - y[14] += CONSTANT(-0.099322584600699995)*t; - y[32] += CONSTANT(0.126698363970000010)*t; - y[34] += CONSTANT(-0.131668802180999990)*t; - - // [15,22]: 13,3,31,35, - tf = CONSTANT(0.133255230518000010)*f[13]+CONSTANT(-0.043528171378199997)*f[3]+CONSTANT(-0.101584686311000000)*f[31]+CONSTANT(-0.098140130732499997)*f[35]; - tg = CONSTANT(0.133255230518000010)*g[13]+CONSTANT(-0.043528171378199997)*g[3]+CONSTANT(-0.101584686311000000)*g[31]+CONSTANT(-0.098140130732499997)*g[35]; - y[15] += tf*g[22]+tg*f[22]; - y[22] += tf*g[15]+tg*f[15]; - t = f[15]*g[22]+f[22]*g[15]; - y[13] += CONSTANT(0.133255230518000010)*t; - y[3] += CONSTANT(-0.043528171378199997)*t; - y[31] += CONSTANT(-0.101584686311000000)*t; - y[35] += CONSTANT(-0.098140130732499997)*t; - - // [15,23]: 12,2,30, - tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2]+CONSTANT(0.098140130728100003)*f[30]; - tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2]+CONSTANT(0.098140130728100003)*g[30]; - y[15] += tf*g[23]+tg*f[23]; - y[23] += tf*g[15]+tg*f[15]; - t = f[15]*g[23]+f[23]*g[15]; - y[12] += CONSTANT(-0.203550726872999990)*t; - y[2] += CONSTANT(0.162867503964999990)*t; - y[30] += CONSTANT(0.098140130728100003)*t; - - // [15,33]: 6,20, - tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20]; - tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20]; - y[15] += tf*g[33]+tg*f[33]; - y[33] += tf*g[15]+tg*f[15]; - t = f[15]*g[33]+f[33]*g[15]; - y[6] += CONSTANT(0.126792179874999990)*t; - y[20] += CONSTANT(-0.196280261464999990)*t; - - // [16,16]: 0,6,20, - tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20]; - tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20]; - y[16] += tf*g[16]+tg*f[16]; - t = f[16]*g[16]; - y[0] += CONSTANT(0.282094791763999990)*t; - y[6] += CONSTANT(-0.229375683829000000)*t; - y[20] += CONSTANT(0.106525305981000000)*t; - - // [16,18]: 8,22, - tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22]; - tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22]; - y[16] += tf*g[18]+tg*f[18]; - y[18] += 
tf*g[16]+tg*f[16]; - t = f[16]*g[18]+f[18]*g[16]; - y[8] += CONSTANT(-0.075080816693699995)*t; - y[22] += CONSTANT(0.135045473380000000)*t; - - // [16,23]: 19,5, - tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5]; - tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5]; - y[16] += tf*g[23]+tg*f[23]; - y[23] += tf*g[16]+tg*f[16]; - t = f[16]*g[23]+f[23]*g[16]; - y[19] += CONSTANT(-0.119098912754999990)*t; - y[5] += CONSTANT(0.140463346187999990)*t; - - // [16,26]: 12,2,30, - tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.147319200325000010)*f[2]+CONSTANT(0.130197596199999990)*f[30]; - tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.147319200325000010)*g[2]+CONSTANT(0.130197596199999990)*g[30]; - y[16] += tf*g[26]+tg*f[26]; - y[26] += tf*g[16]+tg*f[16]; - t = f[16]*g[26]+f[26]*g[16]; - y[12] += CONSTANT(-0.207723503645000000)*t; - y[2] += CONSTANT(0.147319200325000010)*t; - y[30] += CONSTANT(0.130197596199999990)*t; - - // [16,28]: 14,32, - tf = CONSTANT(-0.077413979111300005)*f[14]+CONSTANT(0.128376561115000010)*f[32]; - tg = CONSTANT(-0.077413979111300005)*g[14]+CONSTANT(0.128376561115000010)*g[32]; - y[16] += tf*g[28]+tg*f[28]; - y[28] += tf*g[16]+tg*f[16]; - t = f[16]*g[28]+f[28]*g[16]; - y[14] += CONSTANT(-0.077413979111300005)*t; - y[32] += CONSTANT(0.128376561115000010)*t; - - // [16,29]: 15,33,35, - tf = CONSTANT(0.035835708931099997)*f[15]+CONSTANT(-0.118853600623999990)*f[33]+CONSTANT(-0.053152946071899999)*f[35]; - tg = CONSTANT(0.035835708931099997)*g[15]+CONSTANT(-0.118853600623999990)*g[33]+CONSTANT(-0.053152946071899999)*g[35]; - y[16] += tf*g[29]+tg*f[29]; - y[29] += tf*g[16]+tg*f[16]; - t = f[16]*g[29]+f[29]*g[16]; - y[15] += CONSTANT(0.035835708931099997)*t; - y[33] += CONSTANT(-0.118853600623999990)*t; - y[35] += CONSTANT(-0.053152946071899999)*t; - - // [16,31]: 27,9,25, - tf = CONSTANT(-0.118853600623999990)*f[27]+CONSTANT(0.035835708931099997)*f[9]+CONSTANT(0.053152946071899999)*f[25]; - tg = CONSTANT(-0.118853600623999990)*g[27]+CONSTANT(0.035835708931099997)*g[9]+CONSTANT(0.053152946071899999)*g[25]; - y[16] += tf*g[31]+tg*f[31]; - y[31] += tf*g[16]+tg*f[16]; - t = f[16]*g[31]+f[31]*g[16]; - y[27] += CONSTANT(-0.118853600623999990)*t; - y[9] += CONSTANT(0.035835708931099997)*t; - y[25] += CONSTANT(0.053152946071899999)*t; - - // [17,17]: 0,6,20, - tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]; - tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]; - y[17] += tf*g[17]+tg*f[17]; - t = f[17]*g[17]; - y[0] += CONSTANT(0.282094791768999990)*t; - y[6] += CONSTANT(-0.057343920955899998)*t; - y[20] += CONSTANT(-0.159787958979000000)*t; - - // [17,19]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24]; - y[17] += tf*g[19]+tg*f[19]; - y[19] += tf*g[17]+tg*f[17]; - t = f[17]*g[19]+f[19]*g[17]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(0.119098912753000000)*t; - - // [17,21]: 16,4,18, - tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18]; - tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18]; - 
y[17] += tf*g[21]+tg*f[21]; - y[21] += tf*g[17]+tg*f[17]; - t = f[17]*g[21]+f[21]*g[17]; - y[16] += CONSTANT(-0.119098912754999990)*t; - y[4] += CONSTANT(-0.112621225039000000)*t; - y[18] += CONSTANT(0.045015157794399997)*t; - - // [17,26]: 3,13,31, - tf = CONSTANT(0.208340811096000000)*f[3]+CONSTANT(0.029982305185199998)*f[13]+CONSTANT(-0.118853600623999990)*f[31]; - tg = CONSTANT(0.208340811096000000)*g[3]+CONSTANT(0.029982305185199998)*g[13]+CONSTANT(-0.118853600623999990)*g[31]; - y[17] += tf*g[26]+tg*f[26]; - y[26] += tf*g[17]+tg*f[17]; - t = f[17]*g[26]+f[26]*g[17]; - y[3] += CONSTANT(0.208340811096000000)*t; - y[13] += CONSTANT(0.029982305185199998)*t; - y[31] += CONSTANT(-0.118853600623999990)*t; - - // [17,27]: 12,2,30, - tf = CONSTANT(-0.103861751821000010)*f[12]+CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30]; - tg = CONSTANT(-0.103861751821000010)*g[12]+CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30]; - y[17] += tf*g[27]+tg*f[27]; - y[27] += tf*g[17]+tg*f[17]; - t = f[17]*g[27]+f[27]*g[17]; - y[12] += CONSTANT(-0.103861751821000010)*t; - y[2] += CONSTANT(0.196425600433000000)*t; - y[30] += CONSTANT(-0.130197596204999990)*t; - - // [17,28]: 13,3,31,35, - tf = CONSTANT(0.121172043789000000)*f[13]+CONSTANT(-0.060142811686500000)*f[3]+CONSTANT(0.034310079156700000)*f[31]+CONSTANT(0.099440056652200001)*f[35]; - tg = CONSTANT(0.121172043789000000)*g[13]+CONSTANT(-0.060142811686500000)*g[3]+CONSTANT(0.034310079156700000)*g[31]+CONSTANT(0.099440056652200001)*g[35]; - y[17] += tf*g[28]+tg*f[28]; - y[28] += tf*g[17]+tg*f[17]; - t = f[17]*g[28]+f[28]*g[17]; - y[13] += CONSTANT(0.121172043789000000)*t; - y[3] += CONSTANT(-0.060142811686500000)*t; - y[31] += CONSTANT(0.034310079156700000)*t; - y[35] += CONSTANT(0.099440056652200001)*t; - - // [17,32]: 11,1,25,29, - tf = CONSTANT(0.121172043788000010)*f[11]+CONSTANT(-0.060142811686900000)*f[1]+CONSTANT(-0.099440056652700004)*f[25]+CONSTANT(0.034310079156599997)*f[29]; - tg = CONSTANT(0.121172043788000010)*g[11]+CONSTANT(-0.060142811686900000)*g[1]+CONSTANT(-0.099440056652700004)*g[25]+CONSTANT(0.034310079156599997)*g[29]; - y[17] += tf*g[32]+tg*f[32]; - y[32] += tf*g[17]+tg*f[17]; - t = f[17]*g[32]+f[32]*g[17]; - y[11] += CONSTANT(0.121172043788000010)*t; - y[1] += CONSTANT(-0.060142811686900000)*t; - y[25] += CONSTANT(-0.099440056652700004)*t; - y[29] += CONSTANT(0.034310079156599997)*t; - - // [17,34]: 29,11,1, - tf = CONSTANT(0.118853600623000000)*f[29]+CONSTANT(-0.029982305185400002)*f[11]+CONSTANT(-0.208340811100000000)*f[1]; - tg = CONSTANT(0.118853600623000000)*g[29]+CONSTANT(-0.029982305185400002)*g[11]+CONSTANT(-0.208340811100000000)*g[1]; - y[17] += tf*g[34]+tg*f[34]; - y[34] += tf*g[17]+tg*f[17]; - t = f[17]*g[34]+f[34]*g[17]; - y[29] += CONSTANT(0.118853600623000000)*t; - y[11] += CONSTANT(-0.029982305185400002)*t; - y[1] += CONSTANT(-0.208340811100000000)*t; - - // [18,18]: 6,0,20,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24]; - tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24]; - y[18] += tf*g[18]+tg*f[18]; - t = f[18]*g[18]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.083698454702400005)*t; - y[24] += CONSTANT(-0.135045473384000000)*t; - - // [18,19]: 7,21,23, - tf = 
CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23]; - tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23]; - y[18] += tf*g[19]+tg*f[19]; - y[19] += tf*g[18]+tg*f[18]; - t = f[18]*g[19]+f[19]*g[18]; - y[7] += CONSTANT(0.090297865407399994)*t; - y[21] += CONSTANT(0.102084782359000000)*t; - y[23] += CONSTANT(-0.045015157794399997)*t; - - // [18,25]: 15,33, - tf = CONSTANT(-0.098140130731999994)*f[15]+CONSTANT(0.130197596202000000)*f[33]; - tg = CONSTANT(-0.098140130731999994)*g[15]+CONSTANT(0.130197596202000000)*g[33]; - y[18] += tf*g[25]+tg*f[25]; - y[25] += tf*g[18]+tg*f[18]; - t = f[18]*g[25]+f[25]*g[18]; - y[15] += CONSTANT(-0.098140130731999994)*t; - y[33] += CONSTANT(0.130197596202000000)*t; - - // [18,26]: 14,32, - tf = CONSTANT(0.101358691174000000)*f[14]+CONSTANT(0.084042186965900004)*f[32]; - tg = CONSTANT(0.101358691174000000)*g[14]+CONSTANT(0.084042186965900004)*g[32]; - y[18] += tf*g[26]+tg*f[26]; - y[26] += tf*g[18]+tg*f[18]; - t = f[18]*g[26]+f[26]*g[18]; - y[14] += CONSTANT(0.101358691174000000)*t; - y[32] += CONSTANT(0.084042186965900004)*t; - - // [18,27]: 13,3,35, - tf = CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.183739324705999990)*f[3]+CONSTANT(-0.130197596202000000)*f[35]; - tg = CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.183739324705999990)*g[3]+CONSTANT(-0.130197596202000000)*g[35]; - y[18] += tf*g[27]+tg*f[27]; - y[27] += tf*g[18]+tg*f[18]; - t = f[18]*g[27]+f[27]*g[18]; - y[13] += CONSTANT(0.101990215611000000)*t; - y[3] += CONSTANT(0.183739324705999990)*t; - y[35] += CONSTANT(-0.130197596202000000)*t; - - // [18,28]: 2,12,30,34, - tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(-0.084042186968800003)*f[34]; - tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(-0.084042186968800003)*g[34]; - y[18] += tf*g[28]+tg*f[28]; - y[28] += tf*g[18]+tg*f[18]; - t = f[18]*g[28]+f[28]*g[18]; - y[2] += CONSTANT(0.225033795606000010)*t; - y[12] += CONSTANT(0.022664492358099999)*t; - y[30] += CONSTANT(-0.099440056651100006)*t; - y[34] += CONSTANT(-0.084042186968800003)*t; - - // [18,29]: 3,13,15,31, - tf = CONSTANT(-0.085054779966799998)*f[3]+CONSTANT(0.075189952564900006)*f[13]+CONSTANT(0.101584686310000010)*f[15]+CONSTANT(0.097043558538999999)*f[31]; - tg = CONSTANT(-0.085054779966799998)*g[3]+CONSTANT(0.075189952564900006)*g[13]+CONSTANT(0.101584686310000010)*g[15]+CONSTANT(0.097043558538999999)*g[31]; - y[18] += tf*g[29]+tg*f[29]; - y[29] += tf*g[18]+tg*f[18]; - t = f[18]*g[29]+f[29]*g[18]; - y[3] += CONSTANT(-0.085054779966799998)*t; - y[13] += CONSTANT(0.075189952564900006)*t; - y[15] += CONSTANT(0.101584686310000010)*t; - y[31] += CONSTANT(0.097043558538999999)*t; - - // [19,19]: 6,8,0,20,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22]; - y[19] += tf*g[19]+tg*f[19]; - t = f[19]*g[19]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[8] += CONSTANT(-0.141889406570999990)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - 
y[22] += CONSTANT(-0.102084782360000000)*t; - - // [19,25]: 34, - tf = CONSTANT(-0.130197596205999990)*f[34]; - tg = CONSTANT(-0.130197596205999990)*g[34]; - y[19] += tf*g[25]+tg*f[25]; - y[25] += tf*g[19]+tg*f[19]; - t = f[19]*g[25]+f[25]*g[19]; - y[34] += CONSTANT(-0.130197596205999990)*t; - - // [19,26]: 15,35, - tf = CONSTANT(-0.131668802182000000)*f[15]+CONSTANT(0.130197596204999990)*f[35]; - tg = CONSTANT(-0.131668802182000000)*g[15]+CONSTANT(0.130197596204999990)*g[35]; - y[19] += tf*g[26]+tg*f[26]; - y[26] += tf*g[19]+tg*f[19]; - t = f[19]*g[26]+f[26]*g[19]; - y[15] += CONSTANT(-0.131668802182000000)*t; - y[35] += CONSTANT(0.130197596204999990)*t; - - // [19,27]: 14,32, - tf = CONSTANT(0.025339672793899998)*f[14]+CONSTANT(0.084042186967699994)*f[32]; - tg = CONSTANT(0.025339672793899998)*g[14]+CONSTANT(0.084042186967699994)*g[32]; - y[19] += tf*g[27]+tg*f[27]; - y[27] += tf*g[19]+tg*f[19]; - t = f[19]*g[27]+f[27]*g[19]; - y[14] += CONSTANT(0.025339672793899998)*t; - y[32] += CONSTANT(0.084042186967699994)*t; - - // [19,28]: 13,3,15,31,33, - tf = CONSTANT(0.104682806111000000)*f[13]+CONSTANT(0.159122922869999990)*f[3]+CONSTANT(-0.126698363970000010)*f[15]+CONSTANT(0.090775936911399999)*f[31]+CONSTANT(-0.084042186968400004)*f[33]; - tg = CONSTANT(0.104682806111000000)*g[13]+CONSTANT(0.159122922869999990)*g[3]+CONSTANT(-0.126698363970000010)*g[15]+CONSTANT(0.090775936911399999)*g[31]+CONSTANT(-0.084042186968400004)*g[33]; - y[19] += tf*g[28]+tg*f[28]; - y[28] += tf*g[19]+tg*f[19]; - t = f[19]*g[28]+f[28]*g[19]; - y[13] += CONSTANT(0.104682806111000000)*t; - y[3] += CONSTANT(0.159122922869999990)*t; - y[15] += CONSTANT(-0.126698363970000010)*t; - y[31] += CONSTANT(0.090775936911399999)*t; - y[33] += CONSTANT(-0.084042186968400004)*t; - - // [19,29]: 12,14,2,30,32, - tf = CONSTANT(0.115089467124000010)*f[12]+CONSTANT(-0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(-0.090775936912099994)*f[32]; - tg = CONSTANT(0.115089467124000010)*g[12]+CONSTANT(-0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(-0.090775936912099994)*g[32]; - y[19] += tf*g[29]+tg*f[29]; - y[29] += tf*g[19]+tg*f[19]; - t = f[19]*g[29]+f[29]*g[19]; - y[12] += CONSTANT(0.115089467124000010)*t; - y[14] += CONSTANT(-0.097749909977199997)*t; - y[2] += CONSTANT(0.240571246744999990)*t; - y[30] += CONSTANT(0.053152946072499999)*t; - y[32] += CONSTANT(-0.090775936912099994)*t; - - // [20,20]: 6,0,20, - tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0]; - tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0]; - y[20] += tf*g[20]+tg*f[20]; - t = f[20]*g[20]; - y[6] += CONSTANT(0.163839797503000010)*t; - y[0] += CONSTANT(0.282094802232000010)*t; - y[20] += CONSTANT(0.136961139005999990)*t; - - // [21,21]: 6,20,0,8,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22]; - y[21] += tf*g[21]+tg*f[21]; - t = f[21]*g[21]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.141889406570999990)*t; - y[22] += CONSTANT(0.102084782360000000)*t; - - 
// [21,23]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24]; - y[21] += tf*g[23]+tg*f[23]; - y[23] += tf*g[21]+tg*f[21]; - t = f[21]*g[23]+f[23]*g[21]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(-0.119098912753000000)*t; - - // [21,26]: 9,25, - tf = CONSTANT(-0.131668802182000000)*f[9]+CONSTANT(-0.130197596204999990)*f[25]; - tg = CONSTANT(-0.131668802182000000)*g[9]+CONSTANT(-0.130197596204999990)*g[25]; - y[21] += tf*g[26]+tg*f[26]; - y[26] += tf*g[21]+tg*f[21]; - t = f[21]*g[26]+f[26]*g[21]; - y[9] += CONSTANT(-0.131668802182000000)*t; - y[25] += CONSTANT(-0.130197596204999990)*t; - - // [21,28]: 27,1,11,9,29, - tf = CONSTANT(0.084042186968400004)*f[27]+CONSTANT(0.159122922869999990)*f[1]+CONSTANT(0.104682806111000000)*f[11]+CONSTANT(0.126698363970000010)*f[9]+CONSTANT(0.090775936911399999)*f[29]; - tg = CONSTANT(0.084042186968400004)*g[27]+CONSTANT(0.159122922869999990)*g[1]+CONSTANT(0.104682806111000000)*g[11]+CONSTANT(0.126698363970000010)*g[9]+CONSTANT(0.090775936911399999)*g[29]; - y[21] += tf*g[28]+tg*f[28]; - y[28] += tf*g[21]+tg*f[21]; - t = f[21]*g[28]+f[28]*g[21]; - y[27] += CONSTANT(0.084042186968400004)*t; - y[1] += CONSTANT(0.159122922869999990)*t; - y[11] += CONSTANT(0.104682806111000000)*t; - y[9] += CONSTANT(0.126698363970000010)*t; - y[29] += CONSTANT(0.090775936911399999)*t; - - // [21,31]: 14,2,30,12,32, - tf = CONSTANT(0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(0.115089467124000010)*f[12]+CONSTANT(0.090775936912099994)*f[32]; - tg = CONSTANT(0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(0.115089467124000010)*g[12]+CONSTANT(0.090775936912099994)*g[32]; - y[21] += tf*g[31]+tg*f[31]; - y[31] += tf*g[21]+tg*f[21]; - t = f[21]*g[31]+f[31]*g[21]; - y[14] += CONSTANT(0.097749909977199997)*t; - y[2] += CONSTANT(0.240571246744999990)*t; - y[30] += CONSTANT(0.053152946072499999)*t; - y[12] += CONSTANT(0.115089467124000010)*t; - y[32] += CONSTANT(0.090775936912099994)*t; - - // [21,33]: 32,14, - tf = CONSTANT(0.084042186967699994)*f[32]+CONSTANT(0.025339672793899998)*f[14]; - tg = CONSTANT(0.084042186967699994)*g[32]+CONSTANT(0.025339672793899998)*g[14]; - y[21] += tf*g[33]+tg*f[33]; - y[33] += tf*g[21]+tg*f[21]; - t = f[21]*g[33]+f[33]*g[21]; - y[32] += CONSTANT(0.084042186967699994)*t; - y[14] += CONSTANT(0.025339672793899998)*t; - - // [21,34]: 35, - tf = CONSTANT(-0.130197596205999990)*f[35]; - tg = CONSTANT(-0.130197596205999990)*g[35]; - y[21] += tf*g[34]+tg*f[34]; - y[34] += tf*g[21]+tg*f[21]; - t = f[21]*g[34]+f[34]*g[21]; - y[35] += CONSTANT(-0.130197596205999990)*t; - - // [22,22]: 6,20,0,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24]; - tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24]; - y[22] += tf*g[22]+tg*f[22]; - t = f[22]*g[22]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[20] += CONSTANT(-0.083698454702400005)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[24] += CONSTANT(0.135045473384000000)*t; - - // [22,26]: 10,28, - tf = 
CONSTANT(0.101358691174000000)*f[10]+CONSTANT(0.084042186965900004)*f[28]; - tg = CONSTANT(0.101358691174000000)*g[10]+CONSTANT(0.084042186965900004)*g[28]; - y[22] += tf*g[26]+tg*f[26]; - y[26] += tf*g[22]+tg*f[22]; - t = f[22]*g[26]+f[26]*g[22]; - y[10] += CONSTANT(0.101358691174000000)*t; - y[28] += CONSTANT(0.084042186965900004)*t; - - // [22,27]: 1,11,25, - tf = CONSTANT(0.183739324704000010)*f[1]+CONSTANT(0.101990215611000000)*f[11]+CONSTANT(0.130197596200999990)*f[25]; - tg = CONSTANT(0.183739324704000010)*g[1]+CONSTANT(0.101990215611000000)*g[11]+CONSTANT(0.130197596200999990)*g[25]; - y[22] += tf*g[27]+tg*f[27]; - y[27] += tf*g[22]+tg*f[22]; - t = f[22]*g[27]+f[27]*g[22]; - y[1] += CONSTANT(0.183739324704000010)*t; - y[11] += CONSTANT(0.101990215611000000)*t; - y[25] += CONSTANT(0.130197596200999990)*t; - - // [22,32]: 2,30,12,34, - tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(0.084042186968800003)*f[34]; - tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(0.084042186968800003)*g[34]; - y[22] += tf*g[32]+tg*f[32]; - y[32] += tf*g[22]+tg*f[22]; - t = f[22]*g[32]+f[32]*g[22]; - y[2] += CONSTANT(0.225033795606000010)*t; - y[30] += CONSTANT(-0.099440056651100006)*t; - y[12] += CONSTANT(0.022664492358099999)*t; - y[34] += CONSTANT(0.084042186968800003)*t; - - // [22,33]: 3,13,35, - tf = CONSTANT(0.183739324704000010)*f[3]+CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.130197596200999990)*f[35]; - tg = CONSTANT(0.183739324704000010)*g[3]+CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.130197596200999990)*g[35]; - y[22] += tf*g[33]+tg*f[33]; - y[33] += tf*g[22]+tg*f[22]; - t = f[22]*g[33]+f[33]*g[22]; - y[3] += CONSTANT(0.183739324704000010)*t; - y[13] += CONSTANT(0.101990215611000000)*t; - y[35] += CONSTANT(0.130197596200999990)*t; - - // [23,23]: 6,20,0, - tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0]; - tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0]; - y[23] += tf*g[23]+tg*f[23]; - t = f[23]*g[23]; - y[6] += CONSTANT(-0.057343920955899998)*t; - y[20] += CONSTANT(-0.159787958979000000)*t; - y[0] += CONSTANT(0.282094791768999990)*t; - - // [23,26]: 1,11,29, - tf = CONSTANT(0.208340811096000000)*f[1]+CONSTANT(0.029982305185199998)*f[11]+CONSTANT(-0.118853600623999990)*f[29]; - tg = CONSTANT(0.208340811096000000)*g[1]+CONSTANT(0.029982305185199998)*g[11]+CONSTANT(-0.118853600623999990)*g[29]; - y[23] += tf*g[26]+tg*f[26]; - y[26] += tf*g[23]+tg*f[23]; - t = f[23]*g[26]+f[26]*g[23]; - y[1] += CONSTANT(0.208340811096000000)*t; - y[11] += CONSTANT(0.029982305185199998)*t; - y[29] += CONSTANT(-0.118853600623999990)*t; - - // [23,28]: 25,11,1,29, - tf = CONSTANT(-0.099440056652200001)*f[25]+CONSTANT(-0.121172043789000000)*f[11]+CONSTANT(0.060142811686500000)*f[1]+CONSTANT(-0.034310079156700000)*f[29]; - tg = CONSTANT(-0.099440056652200001)*g[25]+CONSTANT(-0.121172043789000000)*g[11]+CONSTANT(0.060142811686500000)*g[1]+CONSTANT(-0.034310079156700000)*g[29]; - y[23] += tf*g[28]+tg*f[28]; - y[28] += tf*g[23]+tg*f[23]; - t = f[23]*g[28]+f[28]*g[23]; - y[25] += CONSTANT(-0.099440056652200001)*t; - y[11] += CONSTANT(-0.121172043789000000)*t; - y[1] += CONSTANT(0.060142811686500000)*t; - y[29] += CONSTANT(-0.034310079156700000)*t; - - // [23,32]: 31,13,3,35, - tf = 
CONSTANT(0.034310079156599997)*f[31]+CONSTANT(0.121172043788000010)*f[13]+CONSTANT(-0.060142811686900000)*f[3]+CONSTANT(-0.099440056652700004)*f[35]; - tg = CONSTANT(0.034310079156599997)*g[31]+CONSTANT(0.121172043788000010)*g[13]+CONSTANT(-0.060142811686900000)*g[3]+CONSTANT(-0.099440056652700004)*g[35]; - y[23] += tf*g[32]+tg*f[32]; - y[32] += tf*g[23]+tg*f[23]; - t = f[23]*g[32]+f[32]*g[23]; - y[31] += CONSTANT(0.034310079156599997)*t; - y[13] += CONSTANT(0.121172043788000010)*t; - y[3] += CONSTANT(-0.060142811686900000)*t; - y[35] += CONSTANT(-0.099440056652700004)*t; - - // [23,33]: 2,30,12, - tf = CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30]+CONSTANT(-0.103861751821000010)*f[12]; - tg = CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30]+CONSTANT(-0.103861751821000010)*g[12]; - y[23] += tf*g[33]+tg*f[33]; - y[33] += tf*g[23]+tg*f[23]; - t = f[23]*g[33]+f[33]*g[23]; - y[2] += CONSTANT(0.196425600433000000)*t; - y[30] += CONSTANT(-0.130197596204999990)*t; - y[12] += CONSTANT(-0.103861751821000010)*t; - - // [23,34]: 3,13,31, - tf = CONSTANT(0.208340811100000000)*f[3]+CONSTANT(0.029982305185400002)*f[13]+CONSTANT(-0.118853600623000000)*f[31]; - tg = CONSTANT(0.208340811100000000)*g[3]+CONSTANT(0.029982305185400002)*g[13]+CONSTANT(-0.118853600623000000)*g[31]; - y[23] += tf*g[34]+tg*f[34]; - y[34] += tf*g[23]+tg*f[23]; - t = f[23]*g[34]+f[34]*g[23]; - y[3] += CONSTANT(0.208340811100000000)*t; - y[13] += CONSTANT(0.029982305185400002)*t; - y[31] += CONSTANT(-0.118853600623000000)*t; - - // [24,24]: 6,0,20, - tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20]; - tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20]; - y[24] += tf*g[24]+tg*f[24]; - t = f[24]*g[24]; - y[6] += CONSTANT(-0.229375683829000000)*t; - y[0] += CONSTANT(0.282094791763999990)*t; - y[20] += CONSTANT(0.106525305981000000)*t; - - // [24,29]: 9,27,25, - tf = CONSTANT(-0.035835708931400000)*f[9]+CONSTANT(0.118853600623000000)*f[27]+CONSTANT(0.053152946071199997)*f[25]; - tg = CONSTANT(-0.035835708931400000)*g[9]+CONSTANT(0.118853600623000000)*g[27]+CONSTANT(0.053152946071199997)*g[25]; - y[24] += tf*g[29]+tg*f[29]; - y[29] += tf*g[24]+tg*f[24]; - t = f[24]*g[29]+f[29]*g[24]; - y[9] += CONSTANT(-0.035835708931400000)*t; - y[27] += CONSTANT(0.118853600623000000)*t; - y[25] += CONSTANT(0.053152946071199997)*t; - - // [24,31]: 15,33,35, - tf = CONSTANT(0.035835708931400000)*f[15]+CONSTANT(-0.118853600623000000)*f[33]+CONSTANT(0.053152946071199997)*f[35]; - tg = CONSTANT(0.035835708931400000)*g[15]+CONSTANT(-0.118853600623000000)*g[33]+CONSTANT(0.053152946071199997)*g[35]; - y[24] += tf*g[31]+tg*f[31]; - y[31] += tf*g[24]+tg*f[24]; - t = f[24]*g[31]+f[31]*g[24]; - y[15] += CONSTANT(0.035835708931400000)*t; - y[33] += CONSTANT(-0.118853600623000000)*t; - y[35] += CONSTANT(0.053152946071199997)*t; - - // [24,34]: 12,30,2, - tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.130197596199999990)*f[30]+CONSTANT(0.147319200325000010)*f[2]; - tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.130197596199999990)*g[30]+CONSTANT(0.147319200325000010)*g[2]; - y[24] += tf*g[34]+tg*f[34]; - y[34] += tf*g[24]+tg*f[24]; - t = f[24]*g[34]+f[34]*g[24]; - y[12] += CONSTANT(-0.207723503645000000)*t; - y[30] += CONSTANT(0.130197596199999990)*t; - y[2] += CONSTANT(0.147319200325000010)*t; - - // [25,25]: 0,6,20, - tf = 
CONSTANT(0.282094791761999970)*f[0]+CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.130197596198000000)*f[20]; - tg = CONSTANT(0.282094791761999970)*g[0]+CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.130197596198000000)*g[20]; - y[25] += tf*g[25]+tg*f[25]; - t = f[25]*g[25]; - y[0] += CONSTANT(0.282094791761999970)*t; - y[6] += CONSTANT(-0.242608896358999990)*t; - y[20] += CONSTANT(0.130197596198000000)*t; - - // [26,26]: 6,20,0, - tf = CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(0.282094791766000000)*f[0]; - tg = CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(0.282094791766000000)*g[0]; - y[26] += tf*g[26]+tg*f[26]; - t = f[26]*g[26]; - y[6] += CONSTANT(-0.097043558542400002)*t; - y[20] += CONSTANT(-0.130197596207000000)*t; - y[0] += CONSTANT(0.282094791766000000)*t; - - // [27,27]: 0,20,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.016173926423100001)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.016173926423100001)*g[6]; - y[27] += tf*g[27]+tg*f[27]; - t = f[27]*g[27]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[20] += CONSTANT(-0.130197596204999990)*t; - y[6] += CONSTANT(0.016173926423100001)*t; - - // [28,28]: 6,0,20,24, - tf = CONSTANT(0.097043558538800007)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(-0.128376561118000000)*f[24]; - tg = CONSTANT(0.097043558538800007)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(-0.128376561118000000)*g[24]; - y[28] += tf*g[28]+tg*f[28]; - t = f[28]*g[28]; - y[6] += CONSTANT(0.097043558538800007)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.021699599367299999)*t; - y[24] += CONSTANT(-0.128376561118000000)*t; - - // [29,29]: 20,6,0,22,8, - tf = CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(-0.097043558539500002)*f[22]+CONSTANT(-0.140070311615000000)*f[8]; - tg = CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(-0.097043558539500002)*g[22]+CONSTANT(-0.140070311615000000)*g[8]; - y[29] += tf*g[29]+tg*f[29]; - t = f[29]*g[29]; - y[20] += CONSTANT(0.086798397468799998)*t; - y[6] += CONSTANT(0.145565337808999990)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[22] += CONSTANT(-0.097043558539500002)*t; - y[8] += CONSTANT(-0.140070311615000000)*t; - - // [30,30]: 0,20,6, - tf = CONSTANT(0.282094804531000000)*f[0]+CONSTANT(0.130197634486000000)*f[20]+CONSTANT(0.161739292769000010)*f[6]; - tg = CONSTANT(0.282094804531000000)*g[0]+CONSTANT(0.130197634486000000)*g[20]+CONSTANT(0.161739292769000010)*g[6]; - y[30] += tf*g[30]+tg*f[30]; - t = f[30]*g[30]; - y[0] += CONSTANT(0.282094804531000000)*t; - y[20] += CONSTANT(0.130197634486000000)*t; - y[6] += CONSTANT(0.161739292769000010)*t; - - // [31,31]: 6,8,20,22,0, - tf = CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.140070311615000000)*f[8]+CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.097043558539500002)*f[22]+CONSTANT(0.282094791773999990)*f[0]; - tg = CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.140070311615000000)*g[8]+CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.097043558539500002)*g[22]+CONSTANT(0.282094791773999990)*g[0]; - y[31] += tf*g[31]+tg*f[31]; - t = f[31]*g[31]; - y[6] += CONSTANT(0.145565337808999990)*t; - y[8] += CONSTANT(0.140070311615000000)*t; - y[20] += 
CONSTANT(0.086798397468799998)*t; - y[22] += CONSTANT(0.097043558539500002)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - - // [32,32]: 0,24,20,6, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.128376561118000000)*f[24]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(0.097043558538800007)*f[6]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.128376561118000000)*g[24]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(0.097043558538800007)*g[6]; - y[32] += tf*g[32]+tg*f[32]; - t = f[32]*g[32]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[24] += CONSTANT(0.128376561118000000)*t; - y[20] += CONSTANT(-0.021699599367299999)*t; - y[6] += CONSTANT(0.097043558538800007)*t; - - // [33,33]: 6,20,0, - tf = CONSTANT(0.016173926423100001)*f[6]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.282094791770000020)*f[0]; - tg = CONSTANT(0.016173926423100001)*g[6]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.282094791770000020)*g[0]; - y[33] += tf*g[33]+tg*f[33]; - t = f[33]*g[33]; - y[6] += CONSTANT(0.016173926423100001)*t; - y[20] += CONSTANT(-0.130197596204999990)*t; - y[0] += CONSTANT(0.282094791770000020)*t; - - // [34,34]: 20,6,0, - tf = CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(0.282094791766000000)*f[0]; - tg = CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(0.282094791766000000)*g[0]; - y[34] += tf*g[34]+tg*f[34]; - t = f[34]*g[34]; - y[20] += CONSTANT(-0.130197596207000000)*t; - y[6] += CONSTANT(-0.097043558542400002)*t; - y[0] += CONSTANT(0.282094791766000000)*t; - - // [35,35]: 6,0,20, - tf = CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.282094791761999970)*f[0]+CONSTANT(0.130197596198000000)*f[20]; - tg = CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.282094791761999970)*g[0]+CONSTANT(0.130197596198000000)*g[20]; - y[35] += tf*g[35]+tg*f[35]; - t = f[35]*g[35]; - y[6] += CONSTANT(-0.242608896358999990)*t; - y[0] += CONSTANT(0.282094791761999970)*t; - y[20] += CONSTANT(0.130197596198000000)*t; - - // multiply count=2527 - - return y; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a directional light and returns spectral SH data. The output -// vector is computed so that if the intensity of R/G/B is unit the resulting -// exit radiance of a point directly under the light on a diffuse object with -// an albedo of 1 would be 1.0. This will compute 3 spectral samples, resultR -// has to be specified, while resultG and resultB are optional. -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204988.aspx -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, - _In_ FXMVECTOR dir, - _In_ FXMVECTOR color, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return false; - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, color ); - - float fTmp[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; - - XMSHEvalDirection(fTmp,order,dir); // evaluate the BF in this direction... 
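// The scale computed next can be read as follows: CosWtInt(order) adds up the
// per-band weights of the clamped-cosine (Lambertian) kernel, so fNorm =
// XM_PI / CosWtInt(order) renormalizes the projected light so that a
// unit-intensity directional light produces an exit radiance of 1 on a
// unit-albedo diffuse surface (matching the contract in the header comment).
// Worked example: at order 2, CosWtInt returns 0.25f + 0.5f = 0.75f, so
// fNorm = XM_PI / 0.75f = 4*XM_PI/3 (about 4.18879f).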
- - // now compute "normalization" and scale vector for each valid spectral band - const float fNorm = XM_PI / CosWtInt(order); - - const size_t numcoeff = order*order; - - const float fRScale = fNorm * clr.x; - - for( size_t i=0; i < numcoeff; ++i) - { - resultR[i] = fTmp[i] * fRScale; - } - - if (resultG) - { - const float fGScale = fNorm * clr.y; - - for( size_t i=0; i < numcoeff; ++i) - { - resultG[i] = fTmp[i] * fGScale; - } - } - - if (resultB) - { - const float fBScale = fNorm * clr.z; - - for( size_t i=0; i < numcoeff; ++i) - { - resultB[i] = fTmp[i]*fBScale; - } - } - - return true; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a spherical light and returns spectral SH data. There is no -// normalization of the intensity of the light like there is for directional -// lights, care has to be taken when specifying the intensities. This will -// compute 3 spectral samples, resultR has to be specified, while resultG and -// resultB are optional. -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205451.aspx -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalSphericalLight( _In_ size_t order, - _In_ FXMVECTOR pos, - _In_ float radius, - _In_ FXMVECTOR color, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( radius < 0.f ) - return false; - - const float fDist = XMVectorGetX( XMVector3Length( pos ) ); - - // WARNING: fDist should not be < radius - otherwise light contains origin - - //const float fSinConeAngle = (fDist <= radius) ? 0.99999f : radius/fDist; - const float fConeAngle = (fDist <= radius) ? (XM_PIDIV2) : asinf(radius/fDist); - - XMVECTOR dir = XMVector3Normalize( pos ); - - float fTmpDir[ XM_SH_MAXORDER* XM_SH_MAXORDER]; // rotation "vector" - float fTmpL0[ XM_SH_MAXORDER ]; - - // - // Sphere at distance fDist, the cone angle is determined by looking at the - // right triangle with one side (the hypotenuse) being the vector from the - // origin to the center of the sphere, another side is from the origin to - // a point on the sphere whose normal is perpendicular to the given side (this - // is one of the points on the cone that is defined by the projection of the sphere - // through the origin - we want to find the angle of this cone) and the final - // side being from the center of the sphere to the point of tangency (the two - // sides connected to this are at a right angle by construction.) - // From trig we know that sin(theta) = ||opposite||/||hypotenuse||, where - // ||opposite|| = Radius, ||hypotenuse|| = fDist - // theta is the angle of the cone that subtends the sphere from the origin - // - - // no default normalization is done for this case, have to be careful how - // you represent the coefficients...
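// Worked example of the geometry above: a sphere of radius 1 whose center is
// 2 units from the origin gives sin(theta) = radius/fDist = 0.5f, so
// fConeAngle = asinf(0.5f) = XM_PI/6 (about 0.5236f). When the origin lies
// inside the sphere (fDist <= radius) there is no tangent cone, and the code
// above conservatively clamps the angle to a full hemisphere, XM_PIDIV2.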
- - const float fNewNorm = 1.0f;///(fSinConeAngle*fSinConeAngle); - - ComputeCapInt(order,fConeAngle,fTmpL0); - - XMFLOAT3A vd; - XMStoreFloat3( &vd, dir ); - - const float fX = vd.x; - const float fY = vd.y; - const float fZ = vd.z; - - switch (order) - { - case 2: - sh_eval_basis_1(fX,fY,fZ,fTmpDir); - break; - - case 3: - sh_eval_basis_2(fX,fY,fZ,fTmpDir); - break; - - case 4: - sh_eval_basis_3(fX,fY,fZ,fTmpDir); - break; - - case 5: - sh_eval_basis_4(fX,fY,fZ,fTmpDir); - break; - - case 6: - sh_eval_basis_5(fX,fY,fZ,fTmpDir); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return false; - } - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, color ); - - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x; - } - - if (resultG) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y; - } - } - - if (resultB) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z; - } - } - - return true; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a cone light (a spherical cap of constant intensity) and returns -// spectral SH data. This will compute 3 spectral samples, resultR has to be -// specified, while resultG and resultB are optional. -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order, - _In_ FXMVECTOR dir, - _In_ float radius, - _In_ FXMVECTOR color, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( radius < 0.f || radius > (XM_PI*1.00001f) ) - return false; - - if (radius < 0.0001f) - { - // turn it into a pure directional light... - return XMSHEvalDirectionalLight(order, dir,color,resultR,resultG,resultB); - } - else - { - float fTmpL0[ XM_SH_MAXORDER ]; - float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; - - const float fConeAngle = radius; - const float fAngCheck = (fConeAngle > XM_PIDIV2) ? (XM_PIDIV2) : fConeAngle; - - const float fNewNorm = 1.0f/(sinf(fAngCheck)*sinf(fAngCheck)); - - ComputeCapInt(order,fConeAngle,fTmpL0); - - XMFLOAT3A vd; - XMStoreFloat3( &vd, dir ); - - const float fX = vd.x; - const float fY = vd.y; - const float fZ = vd.z; - - switch (order) - { - case 2: - sh_eval_basis_1(fX,fY,fZ,fTmpDir); - break; - - case 3: - sh_eval_basis_2(fX,fY,fZ,fTmpDir); - break; - - case 4: - sh_eval_basis_3(fX,fY,fZ,fTmpDir); - break; - - case 5: - sh_eval_basis_4(fX,fY,fZ,fTmpDir); - break; - - case 6: - sh_eval_basis_5(fX,fY,fZ,fTmpDir); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return false; - } - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, color ); - - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x; - } - - if (resultG) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y; - } - } - - if (resultB) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z; - } - } - } - - return true; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a hemisphere light (a linear blend between a top "sky" color and a -// bottom "ground" color about dir) and returns spectral SH data. This will -// compute 3 spectral samples, resultR has to be specified, while resultG and -// resultB are optional. -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalHemisphereLight( _In_ size_t order, - _In_ FXMVECTOR dir, - _In_ FXMVECTOR topColor, - _In_ FXMVECTOR bottomColor, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return false; - - // separate "R/G/B colors... - - float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER]; // rotation "vector" - float fTmpL0[ XM_SH_MAXORDER ]; - - const float fNewNorm = 3.0f/2.0f; // normalizes things for 1 sky color, 0 ground color... - - XMFLOAT3A vd; - XMStoreFloat3( &vd, dir ); - - const float fX = vd.x; - const float fY = vd.y; - const float fZ = vd.z; - - sh_eval_basis_1(fX,fY,fZ,fTmpDir); - - XMFLOAT3A clrTop; - XMStoreFloat3A( &clrTop, topColor ); - - XMFLOAT3A clrBottom; - XMStoreFloat3A( &clrBottom, bottomColor ); - - float fA = clrTop.x; - float fAvrg = (clrTop.x + clrBottom.x)*0.5f; - - fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; - fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; - - size_t i = 0; - for( ; i<2; ++i) - { - _Analysis_assume_(i < order); - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j + +using namespace DirectX; + +#pragma warning( disable : 4619 4456 ) + +namespace +{ + #pragma prefast(disable:246, "generated code by maple (nested const variable definitions)") + + static const float fExtraNormFac[XM_SH_MAXORDER] = { 2.0f*sqrtf(XM_PI), 2.0f/3.0f*sqrtf(3.0f*XM_PI), 2.0f/5.0f*sqrtf(5.0f*XM_PI), 2.0f/7.0f*sqrtf(7.0f*XM_PI), 2.0f/3.0f*sqrtf(XM_PI), 2.0f/11.0f*sqrtf(11.0f*XM_PI) }; + + // computes the integral of a constant function over a solid angular + // extent. No error checking - only used internally. This function + // only returns the Yl0 coefficients, since the rest are zero for + // circularly symmetric functions.
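// In closed form, pR[l] below is the integral of y_l0 over a spherical cap of
// half-angle 'angle': pR[l] = 2*pi * Int_0^angle y_l0(t) sin(t) dt. Checking
// the first two bands against the expressions in ComputeCapInt:
//   l=0: 2*pi * (1/(2*sqrt(pi))) * (1 - cos(a)) = sqrt(pi) * (1 - cos(a))
//   l=1: 2*pi * sqrt(3/(4*pi)) * sin^2(a)/2 = (sqrt(3*pi)/2) * sin^2(a)
// which match pR[0] = ComputeCapInt_t1 - t3 and pR[1] in the code below.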
+ static const float ComputeCapInt_t1 = sqrtf(0.3141593E1f); + static const float ComputeCapInt_t5 = sqrtf(3.0f); + static const float ComputeCapInt_t11 = sqrtf(5.0f); + static const float ComputeCapInt_t18 = sqrtf(7.0f); + static const float ComputeCapInt_t32 = sqrtf(11.0f); + + static inline void ComputeCapInt(const size_t order, float angle, float *pR) + { + const float t2 = cosf(angle); + const float t3 = ComputeCapInt_t1*t2; + const float t7 = sinf(angle); + const float t8 = t7*t7; + + + pR[0] = -t3+ComputeCapInt_t1; + pR[1] = ComputeCapInt_t5*ComputeCapInt_t1*t8/2.0f; + + if (order > 2) + { + const float t13 = t2*t2; + + pR[2] = -ComputeCapInt_t11*ComputeCapInt_t1*t2*(t13-1.0f)/2.0f; + if (order > 3) + { + const float t19 = ComputeCapInt_t18*ComputeCapInt_t1; + const float t20 = t13*t13; + + pR[3] = -5.0f/8.0f*t19*t20+3.0f/4.0f*t19*t13-t19/8.0f; + if (order > 4) + { + + + pR[4] = -3.0f/8.0f*t3*(7.0f*t20-10.0f*t13+3.0f); + if (order > 5) + { + const float t33 = ComputeCapInt_t32*ComputeCapInt_t1; + pR[5] = -21.0f/16.0f*t33*t20*t13+35.0f/16.0f*t33*t20-15.0f/16.0f*t33*t13+t33/16.0f; + } + } + } + } + } + + // input pF only consists of Yl0 values, normalizes coefficients for directional + // lights. + static inline float CosWtInt(const size_t order) + { + const float fCW0 = 0.25f; + const float fCW1 = 0.5f; + const float fCW2 = 5.0f/16.0f; + //const float fCW3 = 0.0f; + const float fCW4 = -3.0f/32.0f; + //const float fCW5 = 0.0f; + + // order has to be at least linear... + + float fRet = fCW0 + fCW1; + + if (order > 2) fRet += fCW2; + if (order > 4) fRet += fCW4; + + // odd degrees >= 3 evaluate to zero integrated against cosine... + + return fRet; + } + + static const float SHEvalHemisphereLight_fSqrtPi = sqrtf(XM_PI); + static const float SHEvalHemisphereLight_fSqrtPi3 = sqrtf(XM_PI/3.0f); + + typedef float REAL; + #define CONSTANT(x) (x ## f) + + // routine generated programmatically for evaluating SH basis for degree 1 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). + // + inline static void sh_eval_basis_1(REAL x,REAL y,REAL z,REAL b[4]) + { + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + } + + // routine generated programmatically for evaluating SH basis for degree 2 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + inline static void sh_eval_basis_2(REAL x,REAL y,REAL z,REAL b[9]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + } + + // routine generated programmatically for evaluating SH basis for degree 3 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). + // + static void sh_eval_basis_3(REAL x,REAL y,REAL z,REAL b[16]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[ 12] = p_3_0; // l=3,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[ 11] = p_3_1*s1; // l=3,m=-1 + b[ 13] = p_3_1*c1; // l=3,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[ 10] = p_3_2*s2; // l=3,m=-2 + b[ 14] = p_3_2*c2; // l=3,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[ 9] = p_3_3*s3; // l=3,m=-3 + b[ 15] = p_3_3*c3; // l=3,m=+3 + } + + // routine generated programmatically for evaluating SH basis for degree 4 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + static void sh_eval_basis_4(REAL x,REAL y,REAL z,REAL b[25]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[ 12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[ 20] = p_4_0; // l=4,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[ 11] = p_3_1*s1; // l=3,m=-1 + b[ 13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[ 19] = p_4_1*s1; // l=4,m=-1 + b[ 21] = p_4_1*c1; // l=4,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[ 10] = p_3_2*s2; // l=3,m=-2 + b[ 14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[ 18] = p_4_2*s2; // l=4,m=-2 + b[ 22] = p_4_2*c2; // l=4,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[ 9] = p_3_3*s3; // l=3,m=-3 + b[ 15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[ 17] = p_4_3*s3; // l=4,m=-3 + b[ 23] = p_4_3*c3; // l=4,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[ 16] = p_4_4*s4; // l=4,m=-4 + b[ 24] = p_4_4*c4; // l=4,m=+4 + } + + // routine generated programmatically for evaluating SH basis for degree 5 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + static void sh_eval_basis_5(REAL x,REAL y,REAL z,REAL b[36]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[ 12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[ 20] = p_4_0; // l=4,m=0 + // l=5 + const REAL p_5_0 = CONSTANT(1.989974874213239700)*z*p_4_0 + CONSTANT(-1.002853072844814000)*p_3_0; + b[ 30] = p_5_0; // l=5,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[ 11] = p_3_1*s1; // l=3,m=-1 + b[ 13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[ 19] = p_4_1*s1; // l=4,m=-1 + b[ 21] = p_4_1*c1; // l=4,m=+1 + // l=5 + const REAL p_5_1 = CONSTANT(2.031009601158990200)*z*p_4_1 + CONSTANT(-0.991031208965114650)*p_3_1; + b[ 29] = p_5_1*s1; // l=5,m=-1 + b[ 31] = p_5_1*c1; // l=5,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[ 10] = p_3_2*s2; // l=3,m=-2 + b[ 14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[ 18] = p_4_2*s2; // l=4,m=-2 + b[ 22] = p_4_2*c2; // l=4,m=+2 + // l=5 + const REAL p_5_2 = z*(CONSTANT(7.190305177459987500)*z2 + CONSTANT(-2.396768392486662100)); + b[ 28] = p_5_2*s2; // l=5,m=-2 + b[ 32] = p_5_2*c2; // l=5,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[ 9] = p_3_3*s3; // l=3,m=-3 + b[ 15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[ 17] = p_4_3*s3; // l=4,m=-3 + b[ 23] = p_4_3*c3; // l=4,m=+3 + // l=5 + const REAL p_5_3 = CONSTANT(-4.403144694917253700)*z2 + CONSTANT(0.489238299435250430); + b[ 27] = p_5_3*s3; // l=5,m=-3 + b[ 33] = p_5_3*c3; // l=5,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[ 16] = p_4_4*s4; // l=4,m=-4 + b[ 24] = p_4_4*c4; // l=4,m=+4 + // l=5 + const REAL p_5_4 = CONSTANT(2.075662314881041100)*z; + b[ 26] = p_5_4*s4; // l=5,m=-4 + b[ 34] = p_5_4*c4; // l=5,m=+4 + + + /* m=5 */ + + const REAL s5 = x*s4 + y*c4; + const REAL c5 = x*c4 - y*s4; + + // l=5 + const REAL p_5_5 = CONSTANT(-0.656382056840170150); + b[ 25] = p_5_5*s5; // l=5,m=-5 + b[ 35] = p_5_5*c5; // l=5,m=+5 + } + + static const REAL M_PIjs = (REAL) (4.0*atan(1.0)); + static const REAL maxang = (REAL) (M_PIjs/2); + static const int NSH0 = 1; + static const int NSH1 = 4; 
+ static const int NSH2 = 9; + static const int NSH3 = 16; + static const int NSH4 = 25; + static const int NSH5 = 36; + static const int NSH6 = 49; + static const int NSH7 = 64; + static const int NSH8 = 81; + static const int NSH9 = 100; + static const int NL0 = 1; + static const int NL1 = 3; + static const int NL2 = 5; + static const int NL3 = 7; + static const int NL4 = 9; + static const int NL5 = 11; + static const int NL6 = 13; + static const int NL7 = 15; + static const int NL8 = 17; + static const int NL9 = 19; + + static inline void rot(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) + { + xout = x*ct - y*st; + yout = y*ct + x*st; + } + + static inline void rot_inv(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) + { + xout = x*ct + y*st; + yout = y*ct - x*st; + } + + static inline void rot_1(REAL ct,REAL st,REAL ctm[1],REAL stm[1]) + { + ctm[0] = ct; + stm[0] = st; + } + + static inline void rot_2(REAL ct,REAL st,REAL ctm[2],REAL stm[2]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + } + + static inline void rot_3(REAL ct,REAL st,REAL ctm[3],REAL stm[3]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + } + + static inline void rot_4(REAL ct,REAL st,REAL ctm[4],REAL stm[4]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + } + + static inline void rot_5(REAL ct,REAL st,REAL ctm[5],REAL stm[5]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + ctm[4] = ct2*ctm[3] - ctm[2]; + stm[4] = ct2*stm[3] - stm[2]; + } + + static inline void sh_rotz_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) + { + yr[1] = y[1]; + rot_inv(ctm[0],stm[0],y[0],y[2],yr[0],yr[2]); + } + + static inline void sh_rotz_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) + { + yr[2] = y[2]; + rot_inv(ctm[0],stm[0],y[1],y[3],yr[1],yr[3]); + rot_inv(ctm[1],stm[1],y[0],y[4],yr[0],yr[4]); + } + + static inline void sh_rotz_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) + { + yr[3] = y[3]; + rot_inv(ctm[0],stm[0],y[2],y[4],yr[2],yr[4]); + rot_inv(ctm[1],stm[1],y[1],y[5],yr[1],yr[5]); + rot_inv(ctm[2],stm[2],y[0],y[6],yr[0],yr[6]); + } + + static inline void sh_rotz_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) + { + yr[4] = y[4]; + rot_inv(ctm[0],stm[0],y[3],y[5],yr[3],yr[5]); + rot_inv(ctm[1],stm[1],y[2],y[6],yr[2],yr[6]); + rot_inv(ctm[2],stm[2],y[1],y[7],yr[1],yr[7]); + rot_inv(ctm[3],stm[3],y[0],y[8],yr[0],yr[8]); + } + + static inline void sh_rotz_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) + { + yr[5] = y[5]; + rot_inv(ctm[0],stm[0],y[4],y[6],yr[4],yr[6]); + rot_inv(ctm[1],stm[1],y[3],y[7],yr[3],yr[7]); + rot_inv(ctm[2],stm[2],y[2],y[8],yr[2],yr[8]); + rot_inv(ctm[3],stm[3],y[1],y[9],yr[1],yr[9]); + rot_inv(ctm[4],stm[4],y[0],y[10],yr[0],yr[10]); + } + + // rotation code generated programmatically by rotatex (2000x4000 samples, eps=1e-008) + + static REAL fx_1_001 = (REAL) ( sqrt(1.0)/1.0); // 1 + static REAL fx_1_002 = (REAL) (-sqrt(1.0)/1.0); // -1.00000030843 + + static inline void sh_rotx90_1(REAL y[],REAL yr[]) + 
{ + yr[ 0] = fx_1_001*y[ 1]; + yr[ 1] = fx_1_002*y[ 0]; + yr[ 2] = fx_1_001*y[ 2]; + }; + + static inline void sh_rotx90_inv_1(REAL y[],REAL yr[]) + { + yr[ 0] = fx_1_002*y[ 1]; + yr[ 1] = fx_1_001*y[ 0]; + yr[ 2] = fx_1_001*y[ 2]; + } + + static REAL fx_2_001 = (REAL) ( sqrt(4.0)/2.0); // 1 + static REAL fx_2_002 = (REAL) (-sqrt(4.0)/2.0); // -1 + static REAL fx_2_003 = (REAL) (-sqrt(1.0)/2.0); // -0.500000257021 + static REAL fx_2_004 = (REAL) (-sqrt(3.0)/2.0); // -0.866025848959 + static REAL fx_2_005 = (REAL) ( sqrt(1.0)/2.0); // 0.5 + + static inline void sh_rotx90_2(REAL y[],REAL yr[]) + { + yr[ 0] = fx_2_001*y[ 3]; + yr[ 1] = fx_2_002*y[ 1]; + yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; + yr[ 3] = fx_2_002*y[ 0]; + yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; + }; + + static inline void sh_rotx90_inv_2(REAL y[],REAL yr[]) + { + yr[ 0] = fx_2_002*y[ 3]; + yr[ 1] = fx_2_002*y[ 1]; + yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; + yr[ 3] = fx_2_001*y[ 0]; + yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; + } + + static REAL fx_3_001 = (REAL) (-sqrt(10.0)/4.0); // -0.790569415042 + static REAL fx_3_002 = (REAL) ( sqrt(6.0)/4.0); // 0.612372435696 + static REAL fx_3_003 = (REAL) (-sqrt(16.0)/4.0); // -1 + static REAL fx_3_004 = (REAL) (-sqrt(6.0)/4.0); // -0.612372435695 + static REAL fx_3_005 = (REAL) (-sqrt(1.0)/4.0); // -0.25 + static REAL fx_3_006 = (REAL) (-sqrt(15.0)/4.0); // -0.968245836551 + static REAL fx_3_007 = (REAL) ( sqrt(1.0)/4.0); // 0.25 + static REAL fx_3_008 = (REAL) ( sqrt(10.0)/4.0); // 0.790569983984 + + static inline void sh_rotx90_3(REAL y[],REAL yr[]) + { + yr[ 0] = fx_3_001*y[ 3]+fx_3_002*y[ 5]; + yr[ 1] = fx_3_003*y[ 1]; + yr[ 2] = fx_3_004*y[ 3]+fx_3_001*y[ 5]; + yr[ 3] = fx_3_008*y[ 0]+fx_3_002*y[ 2]; + yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; + yr[ 5] = fx_3_004*y[ 0]-fx_3_001*y[ 2]; + yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; + }; + + static inline void sh_rotx90_inv_3(REAL y[],REAL yr[]) + { + yr[ 0] = fx_3_008*y[ 3]+fx_3_004*y[ 5]; + yr[ 1] = fx_3_003*y[ 1]; + yr[ 2] = fx_3_002*y[ 3]-fx_3_001*y[ 5]; + yr[ 3] = fx_3_001*y[ 0]+fx_3_004*y[ 2]; + yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; + yr[ 5] = fx_3_002*y[ 0]+fx_3_001*y[ 2]; + yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; + } + + static REAL fx_4_001 = (REAL) (-sqrt(56.0)/8.0); // -0.935414346694 + static REAL fx_4_002 = (REAL) ( sqrt(8.0)/8.0); // 0.353553390593 + static REAL fx_4_003 = (REAL) (-sqrt(36.0)/8.0); // -0.75 + static REAL fx_4_004 = (REAL) ( sqrt(28.0)/8.0); // 0.661437827766 + static REAL fx_4_005 = (REAL) (-sqrt(8.0)/8.0); // -0.353553390593 + static REAL fx_4_006 = (REAL) ( sqrt(36.0)/8.0); // 0.749999999999 + static REAL fx_4_007 = (REAL) ( sqrt(9.0)/8.0); // 0.37500034698 + static REAL fx_4_008 = (REAL) ( sqrt(20.0)/8.0); // 0.559017511622 + static REAL fx_4_009 = (REAL) ( sqrt(35.0)/8.0); // 0.739510657141 + static REAL fx_4_010 = (REAL) ( sqrt(16.0)/8.0); // 0.5 + static REAL fx_4_011 = (REAL) (-sqrt(28.0)/8.0); // -0.661437827766 + static REAL fx_4_012 = (REAL) ( sqrt(1.0)/8.0); // 0.125 + static REAL fx_4_013 = (REAL) ( sqrt(56.0)/8.0); // 0.935414346692 + + static inline void sh_rotx90_4(REAL y[],REAL yr[]) + { + yr[ 0] = fx_4_001*y[ 5]+fx_4_002*y[ 7]; + yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; + yr[ 2] = fx_4_005*y[ 5]+fx_4_001*y[ 7]; + yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; + yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; + yr[ 5] = fx_4_013*y[ 0]+fx_4_002*y[ 2]; + yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; + yr[ 7] = fx_4_005*y[ 0]-fx_4_001*y[ 2]; + yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 
6]+fx_4_012*y[ 8]; + }; + + static inline void sh_rotx90_inv_4(REAL y[],REAL yr[]) + { + yr[ 0] = fx_4_013*y[ 5]+fx_4_005*y[ 7]; + yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; + yr[ 2] = fx_4_002*y[ 5]-fx_4_001*y[ 7]; + yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; + yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; + yr[ 5] = fx_4_001*y[ 0]+fx_4_005*y[ 2]; + yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; + yr[ 7] = fx_4_002*y[ 0]+fx_4_001*y[ 2]; + yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 6]+fx_4_012*y[ 8]; + } + + static REAL fx_5_001 = (REAL) ( sqrt(126.0)/16.0); // 0.70156076002 + static REAL fx_5_002 = (REAL) (-sqrt(120.0)/16.0); // -0.684653196882 + static REAL fx_5_003 = (REAL) ( sqrt(10.0)/16.0); // 0.197642353761 + static REAL fx_5_004 = (REAL) (-sqrt(64.0)/16.0); // -0.5 + static REAL fx_5_005 = (REAL) ( sqrt(192.0)/16.0); // 0.866025403784 + static REAL fx_5_006 = (REAL) ( sqrt(70.0)/16.0); // 0.522912516584 + static REAL fx_5_007 = (REAL) ( sqrt(24.0)/16.0); // 0.306186217848 + static REAL fx_5_008 = (REAL) (-sqrt(162.0)/16.0); // -0.795495128835 + static REAL fx_5_009 = (REAL) ( sqrt(64.0)/16.0); // 0.5 + static REAL fx_5_010 = (REAL) ( sqrt(60.0)/16.0); // 0.484122918274 + static REAL fx_5_011 = (REAL) ( sqrt(112.0)/16.0); // 0.661437827763 + static REAL fx_5_012 = (REAL) ( sqrt(84.0)/16.0); // 0.572821961867 + static REAL fx_5_013 = (REAL) ( sqrt(4.0)/16.0); // 0.125 + static REAL fx_5_014 = (REAL) ( sqrt(42.0)/16.0); // 0.405046293649 + static REAL fx_5_015 = (REAL) ( sqrt(210.0)/16.0); // 0.905711046633 + static REAL fx_5_016 = (REAL) ( sqrt(169.0)/16.0); // 0.8125 + static REAL fx_5_017 = (REAL) (-sqrt(45.0)/16.0); // -0.419262745781 + static REAL fx_5_018 = (REAL) ( sqrt(1.0)/16.0); // 0.0625 + static REAL fx_5_019 = (REAL) (-sqrt(126.0)/16.0); // -0.701561553415 + static REAL fx_5_020 = (REAL) ( sqrt(120.0)/16.0); // 0.684653196881 + static REAL fx_5_021 = (REAL) (-sqrt(10.0)/16.0); // -0.197642353761 + static REAL fx_5_022 = (REAL) (-sqrt(70.0)/16.0); // -0.522913107945 + static REAL fx_5_023 = (REAL) (-sqrt(60.0)/16.0); // -0.48412346577 + + static inline void sh_rotx90_5(REAL y[],REAL yr[]) + { + yr[ 0] = fx_5_001*y[ 5]+fx_5_002*y[ 7]+fx_5_003*y[ 9]; + yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; + yr[ 2] = fx_5_006*y[ 5]+fx_5_007*y[ 7]+fx_5_008*y[ 9]; + yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; + yr[ 4] = fx_5_010*y[ 5]+fx_5_011*y[ 7]+fx_5_012*y[ 9]; + yr[ 5] = fx_5_019*y[ 0]+fx_5_022*y[ 2]+fx_5_023*y[ 4]; + yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; + yr[ 7] = fx_5_020*y[ 0]-fx_5_007*y[ 2]-fx_5_011*y[ 4]; + yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; + yr[ 9] = fx_5_021*y[ 0]-fx_5_008*y[ 2]-fx_5_012*y[ 4]; + yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; + }; + + static inline void sh_rotx90_inv_5(REAL y[],REAL yr[]) + { + yr[ 0] = fx_5_019*y[ 5]+fx_5_020*y[ 7]+fx_5_021*y[ 9]; + yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; + yr[ 2] = fx_5_022*y[ 5]-fx_5_007*y[ 7]-fx_5_008*y[ 9]; + yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; + yr[ 4] = fx_5_023*y[ 5]-fx_5_011*y[ 7]-fx_5_012*y[ 9]; + yr[ 5] = fx_5_001*y[ 0]+fx_5_006*y[ 2]+fx_5_010*y[ 4]; + yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; + yr[ 7] = fx_5_002*y[ 0]+fx_5_007*y[ 2]+fx_5_011*y[ 4]; + yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; + yr[ 9] = fx_5_003*y[ 0]+fx_5_008*y[ 2]+fx_5_012*y[ 4]; + yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; + } + + static inline void sh_rot_1(REAL m[3*3],REAL y[NL1],REAL yr[NL1]) + { + REAL yr0 = m[4]*y[0] - m[5]*y[1] + m[3]*y[2]; + REAL 
yr1 = m[8]*y[1] - m[7]*y[0] - m[6]*y[2]; + REAL yr2 = m[1]*y[0] - m[2]*y[1] + m[0]*y[2]; + + yr[0] = yr0; + yr[1] = yr1; + yr[2] = yr2; + } + + static inline void sh_roty_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) + { + yr[0] = y[0]; + rot_inv(ctm[0],stm[0],y[1],y[2],yr[1],yr[2]); + } + + static inline void sh_roty_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotx90_2(y,yr); + sh_rotz_2(ctm,stm,yr,ytmp); + sh_rotx90_inv_2(ytmp,yr); + } + + static inline void sh_roty_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotx90_3(y,yr); + sh_rotz_3(ctm,stm,yr,ytmp); + sh_rotx90_inv_3(ytmp,yr); + } + + static inline void sh_roty_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotx90_4(y,yr); + sh_rotz_4(ctm,stm,yr,ytmp); + sh_rotx90_inv_4(ytmp,yr); + } + + static inline void sh_roty_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotx90_5(y,yr); + sh_rotz_5(ctm,stm,yr,ytmp); + sh_rotx90_inv_5(ytmp,yr); + } + + #define ROT_TOL CONSTANT(1e-4) + + /* + Finds cosine,sine pairs for zyz rotation (i.e. rotation R_z2 R_y R_z1 v). + The rotation is one which maps mx to (1,0,0) and mz to (0,0,1). + */ + static inline void zyz(REAL m[3*3],REAL &zc1,REAL &zs1,REAL &yc,REAL &ys,REAL &zc2,REAL &zs2) + { + REAL cz = m[8]; + + // rotate so that (cx,cy,0) aligns to (1,0,0) + REAL cxylen = (REAL) sqrtf(1.0f - cz*cz); + if (cxylen >= ROT_TOL) + { + // if above is a NaN, will do the correct thing + yc = cz; + ys = cxylen; + REAL len67inv = 1.0f/sqrtf(m[6]*m[6] + m[7]*m[7]); + zc1 = -m[6]*len67inv; + zs1 = m[7]*len67inv; + REAL len25inv = 1.0f/sqrtf(m[2]*m[2] + m[5]*m[5]); + zc2 = m[2]*len25inv; + zs2 = m[5]*len25inv; + } else { // m[6],m[7],m[8] already aligned to (0,0,1) + zc1 = 1.0; zs1 = 0.0; // identity + yc = cz; ys = 0.0; // identity + zc2 = m[0]*cz; zs2 = -m[1]; // align x axis (mx[0],mx[1],0) to (1,0,0) + } + } + + static inline void sh_rotzyz_2(REAL zc1m[2],REAL zs1m[2],REAL ycm[2],REAL ysm[2],REAL zc2m[2],REAL zs2m[2],REAL y[NL2],REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotz_2(zc1m,zs1m,y,yr); + sh_roty_2(ycm,ysm,yr,ytmp); + sh_rotz_2(zc2m,zs2m,ytmp,yr); + } + + static inline void sh_rotzyz_3(REAL zc1m[3],REAL zs1m[3],REAL ycm[3],REAL ysm[3],REAL zc2m[3],REAL zs2m[3],REAL y[NL3],REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotz_3(zc1m,zs1m,y,yr); + sh_roty_3(ycm,ysm,yr,ytmp); + sh_rotz_3(zc2m,zs2m,ytmp,yr); + } + + static inline void sh_rotzyz_4(REAL zc1m[4],REAL zs1m[4],REAL ycm[4],REAL ysm[4],REAL zc2m[4],REAL zs2m[4],REAL y[NL4],REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotz_4(zc1m,zs1m,y,yr); + sh_roty_4(ycm,ysm,yr,ytmp); + sh_rotz_4(zc2m,zs2m,ytmp,yr); + } + + static inline void sh_rotzyz_5(REAL zc1m[5],REAL zs1m[5],REAL ycm[5],REAL ysm[5],REAL zc2m[5],REAL zs2m[5],REAL y[NL5],REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotz_5(zc1m,zs1m,y,yr); + sh_roty_5(ycm,ysm,yr,ytmp); + sh_rotz_5(zc2m,zs2m,ytmp,yr); + } + + static inline void sh3_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH3],REAL yr[NSH3]) + { + REAL zc1m[3],zs1m[3]; + rot_3(zc1,zs1,zc1m,zs1m); + REAL ycm[3],ysm[3]; + rot_3(yc,ys,ycm,ysm); + REAL zc2m[3],zs2m[3]; + rot_3(zc2,zs2,zc2m,zs2m); + + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); + sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); + } + + static inline void sh4_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH4],REAL yr[NSH4]) + { + REAL 
zc1m[4],zs1m[4]; + rot_4(zc1,zs1,zc1m,zs1m); + REAL ycm[4],ysm[4]; + rot_4(yc,ys,ycm,ysm); + REAL zc2m[4],zs2m[4]; + rot_4(zc2,zs2,zc2m,zs2m); + + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); + sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); + sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); + } + + static inline void sh5_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH5],REAL yr[NSH5]) + { + REAL zc1m[5],zs1m[5]; + rot_5(zc1,zs1,zc1m,zs1m); + REAL ycm[5],ysm[5]; + rot_5(yc,ys,ycm,ysm); + REAL zc2m[5],zs2m[5]; + rot_5(zc2,zs2,zc2m,zs2m); + + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); + sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); + sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); + sh_rotzyz_5(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH4,yr+NSH4); + } + + inline void sh1_rot(REAL m[3*3],REAL y[NSH1],REAL yr[NSH1]) + { + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + } + + inline void sh3_rot(REAL m[3*3],REAL y[NSH3],REAL yr[NSH3]) + { + REAL zc1,zs1,yc,ys,zc2,zs2; + zyz(m,zc1,zs1,yc,ys,zc2,zs2); + sh3_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); + } + + inline void sh4_rot(REAL m[3*3],REAL y[NSH4],REAL yr[NSH4]) + { + REAL zc1,zs1,yc,ys,zc2,zs2; + zyz(m,zc1,zs1,yc,ys,zc2,zs2); + sh4_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); + } + + inline void sh5_rot(REAL m[3*3],REAL y[NSH5],REAL yr[NSH5]) + { + REAL zc1,zs1,yc,ys,zc2,zs2; + zyz(m,zc1,zs1,yc,ys,zc2,zs2); + sh5_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); + } + + // simple matrix vector multiply for a square matrix (only used by ZRotation) + static inline void SimpMatMul(size_t dim, const float *matrix, const float *input, float *result) + { + for(size_t iR=0; iR < dim; ++iR) + { + result[iR + 0] = matrix[iR*dim + 0] * input[0]; + for(size_t iC=1; iC < dim; ++iC) + { + result[iR] += matrix[iR*dim+ iC] * input[iC]; + } + } + } + +}; // anonymous namespace + + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Evaluates the Spherical Harmonic basis functions +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205448.aspx +//------------------------------------------------------------------------------------- +float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_ FXMVECTOR dir ) +{ + if ( !result ) + return nullptr; + + XMFLOAT4A dv; + XMStoreFloat4A( &dv, dir ); + + const float fX = dv.x; + const float fY = dv.y; + const float fZ = dv.z; + + switch( order ) + { + case 2: + sh_eval_basis_1(fX,fY,fZ,result); + break; + + case 3: + sh_eval_basis_2(fX,fY,fZ,result); + break; + + case 4: + sh_eval_basis_3(fX,fY,fZ,result); + break; + + case 5: + sh_eval_basis_4(fX,fY,fZ,result); + break; + + case 6: + sh_eval_basis_5(fX,fY,fZ,result); + break; + + default: + assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates SH vector by a rotation matrix +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204992.aspx +//------------------------------------------------------------------------------------- +float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_ FXMMATRIX rotMatrix, + _In_reads_(order*order) const float *input ) +{ + if ( !result || !input ) + return nullptr; + 
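// XMSHRotate cannot run in place: the band rotations read 'input' repeatedly
// while 'result' is being written, so aliased buffers are rejected just below.
// A minimal usage sketch (hypothetical caller code; order 3 and a Y-axis
// rotation are assumptions for illustration):
//
//   float sh[9], shRot[9];
//   XMSHEvalDirection( sh, 3, XMVectorSet( 0.f, 0.f, 1.f, 0.f ) );
//   XMSHRotate( shRot, 3, XMMatrixRotationY( XM_PIDIV2 ), sh );  // distinct buffers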
+ + if( result == input ) + return nullptr; + + XMFLOAT3X3 mat; + XMStoreFloat3x3( &mat, rotMatrix ); + + float mRot[3*3]; + const float r00 = mRot[0*3 +0] = mat._11; + const float r10 = mRot[1*3 +0] = mat._12; + const float r20 = mRot[2*3 +0] = mat._13; + + const float r01 = mRot[0*3 +1] = mat._21; + const float r11 = mRot[1*3 +1] = mat._22; + const float r21 = mRot[2*3 +1] = mat._23; + + const float r02 = mRot[0*3 +2] = mat._31; + const float r12 = mRot[1*3 +2] = mat._32; + const float r22 = mRot[2*3 +2] = mat._33; + + result[0] = input[0]; // rotate the constant term + + switch (order) + { + case 2: + { + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] -r02*input[2] + r00*input[3]; + } + break; + + case 3: + { + float R[25]; + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] -r02*input[2] + r00*input[3]; + + // direct code for quadratics is faster than ZYZ recurrence relations + + const float t41 = r01 * r00; + const float t43 = r11 * r10; + const float t48 = r11 * r12; + const float t50 = r01 * r02; + const float t55 = r02 * r02; + const float t57 = r22 * r22; + const float t58 = r12 * r12; + const float t61 = r00 * r02; + const float t63 = r10 * r12; + const float t68 = r10 * r10; + const float t70 = r01 * r01; + const float t72 = r11 * r11; + const float t74 = r00 * r00; + const float t76 = r21 * r21; + const float t78 = r20 * r20; + + const float v173 = 0.1732050808e1f; + const float v577 = 0.5773502693e0f; + const float v115 = 0.1154700539e1f; + const float v288 = 0.2886751347e0f; + const float v866 = 0.8660254040e0f; + + R[0] = r11 * r00 + r01 * r10; + R[1] = - r01 * r12 - r11 * r02; + R[2] = v173 * r02 * r12; + R[3] = - r10 * r02 - r00 * r12; + R[4] = r00 * r10 - r01 * r11; + R[5] = - r11 * r20 - r21 * r10; + R[6] = r11 * r22 + r21 * r12; + R[7] = -v173 * r22 * r12; + R[8] = r20 * r12 + r10 * r22; + R[9] = - r10 * r20 + r11 * r21; + R[10] = - v577* (t41 + t43) + v115 * r21 * r20; + R[11] = v577* (t48 + t50) - v115 * r21 * r22; + R[12] = -0.5000000000e0f * (t55 + t58) + t57; + R[13] = v577 * (t61 + t63) - v115 * r20 * r22; + R[14] = v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78); + R[15] = - r01 * r20 - r21 * r00; + R[16] = r01 * r22 + r21 * r02; + R[17] = -v173 * r22 * r02; + R[18] = r00 * r22 + r20 * r02; + R[19] = - r00 * r20 + r01 * r21; + R[20] = t41 - t43; + R[21] = - t50 + t48; + R[22] = v866 * (t55 - t58); + R[23] = t63 - t61; + R[24] = 0.5000000000e0f *( t74 - t68 - t70 + t72); + + // blow the matrix multiply out by hand, looping is inefficient on a P4...
+ for(unsigned int iR=0; iR<5;iR++) + { + const unsigned int uBase = iR*5; + result[4 + iR] = R[uBase + 0]*input[4] + R[uBase + 1]*input[5] + R[uBase + 2]*input[6] + R[uBase + 3]*input[7] + R[uBase + 4]*input[8]; + } + } + break; + + case 4: + sh3_rot(mRot,const_cast<float *>(input),result); + break; + + case 5: + sh4_rot(mRot,const_cast<float *>(input),result); + break; + + case 6: + sh5_rot(mRot,const_cast<float *>(input),result); + break; + + default: + assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates the SH vector in the Z axis by an angle +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205461.aspx +//------------------------------------------------------------------------------------- +float* XMSHRotateZ( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_ float angle, + _In_reads_(order*order) const float *input ) +{ + if ( !result || !input ) + return nullptr; + + if( result == input ) + return nullptr; + + if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) + return nullptr; + + float R[(2*(XM_SH_MAXORDER-1) + 1)*(2* (XM_SH_MAXORDER-1) + 1)]; // used to store rotation matrices... + + // these are actually very sparse matrices, most of the entries are zeros... + + const float ca = cosf(angle); + const float sa = sinf(angle); + + const float t1 = ca; + const float t2 = sa; + R[0] = t1; + R[1] = 0.0f; + R[2] = t2; + R[3] = 0.0f; + R[4] = 1.0f; + R[5] = 0.0f; + R[6] = -t2; + R[7] = 0.0f; + R[8] = t1; + + result[0] = input[0]; + SimpMatMul(3,R,input+1,result+1); + + if (order > 2) + { + for(int j=0;j<5*5;j++) R[j] = 0.0f; + const float t1 = sa; + const float t2 = t1*t1; + const float t3 = ca; + const float t4 = t3*t3; + const float t5 = -t2+t4; + const float t7 = 2.0f*t3*t1; + R[0] = t5; + R[4] = t7; + R[6] = t3; + R[8] = t1; + R[12] = 1.0f; + R[16] = -t1; + R[18] = t3; + R[20] = -t7; + R[24] = t5; + + SimpMatMul(5,R,input+4,result+4); // un-roll matrix/vector multiply + if (order > 3) + { + for(int j=0;j<7*7;j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t4 = sa; + const float t5 = t4*t4; + const float t8 = t2*t1-3.0f*t1*t5; + const float t12 = 3.0f*t4*t2-t5*t4; + const float t13 = -t5+t2; + const float t15 = 2.0f*t1*t4; + R[0] = t8; + R[6] = t12; + R[8] = t13; + R[12] = t15; + R[16] = t1; + R[18] = t4; + R[24] = 1.0f; + R[30] = -t4; + R[32] = t1; + R[36] = -t15; + R[40] = t13; + R[42] = -t12; + R[48] = t8; + SimpMatMul(7,R,input+9,result+9); + if (order > 4) + { + for(int j=0;j<=9*9;j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t3 = t2*t2; + const float t4 = sa; + const float t5 = t4*t4; + const float t6 = t5*t5; + const float t9 = t3+t6-6.0f*t5*t2; + const float t10 = t5*t4; + const float t12 = t2*t1; + const float t14 = -t10*t1+t4*t12; + const float t17 = t12-3.0f*t1*t5; + const float t20 = 3.0f*t4*t2-t10; + const float t21 = -t5+t2; + const float t23 = 2.0f*t1*t4; + R[0] = t9; + R[8] = 4.0f*t14; + R[10] = t17; + R[16] = t20; + R[20] = t21; + R[24] = t23; + R[30] = t1; + R[32] = t4; + R[40] = 1.0f; + R[48] = -t4; + R[50] = t1; + R[56] = -t23; + R[60] = t21; + R[64] = -t20; + R[70] = t17; + R[72] = -4.0f*t14; + R[80] = t9; + + SimpMatMul(9,R,input+16,result+16); + if (order > 5) + { + for(int j=0;j<11*11;j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = sa; + const float t3 = t2*t2; + const float t4 = t3*t3; + const float t7 = t1*t1; +
const float t8 = t7*t1; + const float t11 = t7*t7; + const float t13 = 5.0f*t1*t4-10.0f*t3*t8+t11*t1; + const float t14 = t3*t2; + const float t20 = -10.0f*t14*t7+5.0f*t2*t11+t4*t2; + const float t23 = t11+t4-6.0f*t3*t7; + const float t26 = -t14*t1+t2*t8; + const float t29 = t8-3.0f*t1*t3; + const float t32 = 3.0f*t2*t7-t14; + const float t33 = -t3+t7; + const float t35 = 2.0f*t1*t2; + R[0] = t13; + R[10] = t20; + R[12] = t23; + R[20] = 4.0f*t26; + R[24] = t29; + R[30] = t32; + R[36] = t33; + R[40] = t35; + R[48] = t1; + R[50] = t2; + R[60] = 1.0f; + R[70] = -t2; + R[72] = t1; + R[80] = -t35; + R[84] = t33; + R[90] = -t32; + R[96] = t29; + R[100] = -4.0f*t26; + R[108] = t23; + R[110] = -t20; + R[120] = t13; + SimpMatMul(11,R,input+25,result+25); + } + } + } + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Adds two SH vectors, result[i] = inputA[i] + inputB[i]; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205438.aspx +//------------------------------------------------------------------------------------- +float* XMSHAdd( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_reads_(order*order) const float *inputA, + _In_reads_(order*order) const float *inputB ) +{ + if ( !result || !inputA || !inputB ) + return nullptr; + + const size_t numcoeff = order*order; + + for( size_t i=0; i < numcoeff; ++i ) + { + result[i] = inputA[i] + inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Scales a SH vector, result[i] = input[i] * scale; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204994.aspx +//------------------------------------------------------------------------------------- +float* XMSHScale( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_reads_(order*order) const float *input, + _In_ float scale ) +{ + if ( !result || !input ) + return nullptr; + + const size_t numcoeff = order*order; + + for( size_t i=0; i < numcoeff; ++i ) + { + result[i] = scale * input[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the dot product of two SH vectors +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205446.aspx +//------------------------------------------------------------------------------------- +float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ) +{ + if ( !inputA || !inputB ) + return 0.f; + + float result = inputA[0] * inputB[0]; + + const size_t numcoeff = order*order; + + for( size_t i=1; i < numcoeff; ++i ) + { + result += inputA[i] * inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the product of two functions represented using SH (f and g), where: +// result[i] = int(y_i(s) * f(s) * g(s)), where y_i(s) is the ith SH basis +// function, f(s) and g(s) are SH functions (sum_i(y_i(s)*c_i)). The order O +// determines the lengths of the arrays, where there should always be O^2 +// coefficients. In general the product of two SH functions of order O generates +// an SH function of order 2*O - 1, but we truncate the result. This means +// that the product commutes (f*g == g*f) but doesn't associate +// (f*(g*h) != (f*g)*h).
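// A concrete consequence of the truncation: the product of two order-3
// (9 coefficient) functions is in general an order-5 (25 coefficient)
// function, and XMSHMultiply3 keeps only the first 9 projections. The
// generated tables in the bodies below enumerate, for each pair [i,j], the
// nonzero triple-product integrals int(y_i * y_j * y_k) -- the Gaunt
// coefficients. A minimal usage sketch (hypothetical caller code):
//
//   float f[9], g[9], h[9];
//   // ... fill f and g with order-3 SH coefficients ...
//   XMSHMultiply3( h, f, g );   // h is f*g truncated back to order 3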
+//------------------------------------------------------------------------------------- +float* XMSHMultiply( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_reads_(order*order) const float *inputF, + _In_reads_(order*order) const float *inputG ) +{ + switch( order ) + { + case 2: + return XMSHMultiply2( result, inputF, inputG ); + + case 3: + return XMSHMultiply3( result, inputF, inputG ); + + case 4: + return XMSHMultiply4( result, inputF, inputG ); + + case 5: + return XMSHMultiply5( result, inputF, inputG ); + + case 6: + return XMSHMultiply6( result, inputF, inputG ); + + default: + assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); + return nullptr; + } +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205454.aspx +//------------------------------------------------------------------------------------- +float* XMSHMultiply2( _Out_writes_(4) float *y, + _In_reads_(4) const float *f, + _In_reads_(4) const float *g ) +{ + if ( !y || !f || !g ) + return nullptr; + + REAL tf,tg,t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; + + // [1,1]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[1] = tf*g[1]+tg*f[1]; + t = f[1]*g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // [2,2]: 0, + tf = CONSTANT(0.282094795249000000)*f[0]; + tg = CONSTANT(0.282094795249000000)*g[0]; + y[2] = tf*g[2]+tg*f[2]; + t = f[2]*g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + + // [3,3]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[3] = tf*g[3]+tg*f[3]; + t = f[3]*g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // multiply count=20 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232906.aspx +//------------------------------------------------------------------------------------- +float* XMSHMultiply3( _Out_writes_(9) float *y, + _In_reads_(9) const float *f, + _In_reads_(9) const float *g ) +{ + if ( !y || !f || !g ) + return nullptr; + + REAL tf,tg,t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1]+tg*f[1]; + t = f[1]*g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,2]: 5, + tf = CONSTANT(0.218509686118000010)*f[5]; + tg = CONSTANT(0.218509686118000010)*g[5]; + y[1] += tf*g[2]+tg*f[2]; + y[2] = tf*g[1]+tg*f[1]; + t = f[1]*g[2]+f[2]*g[1]; + y[5] = CONSTANT(0.218509686118000010)*t; + + // [1,3]: 4, + tf = CONSTANT(0.218509686114999990)*f[4]; + tg = CONSTANT(0.218509686114999990)*g[4]; + y[1] += tf*g[3]+tg*f[3]; + y[3] = tf*g[1]+tg*f[1]; + t = f[1]*g[3]+f[3]*g[1]; + y[4] = CONSTANT(0.218509686114999990)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2]+tg*f[2]; + t = f[2]*g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,3]: 7, + tf = 
CONSTANT(0.218509686118000010)*f[7]; + tg = CONSTANT(0.218509686118000010)*g[7]; + y[2] += tf*g[3]+tg*f[3]; + y[3] += tf*g[2]+tg*f[2]; + t = f[2]*g[3]+f[3]*g[2]; + y[7] = CONSTANT(0.218509686118000010)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3]+tg*f[3]; + t = f[3]*g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [4,4]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; + y[4] += tf*g[4]+tg*f[4]; + t = f[4]*g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [4,5]: 7, + tf = CONSTANT(0.156078347226000000)*f[7]; + tg = CONSTANT(0.156078347226000000)*g[7]; + y[4] += tf*g[5]+tg*f[5]; + y[5] += tf*g[4]+tg*f[4]; + t = f[4]*g[5]+f[5]*g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + + // [5,5]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]; + y[5] += tf*g[5]+tg*f[5]; + t = f[5]*g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + + // [6,6]: 0,6, + tf = CONSTANT(0.282094797560000000)*f[0]; + tg = CONSTANT(0.282094797560000000)*g[0]; + y[6] += tf*g[6]+tg*f[6]; + t = f[6]*g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + + // [7,7]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.156078347227999990)*g[8]; + y[7] += tf*g[7]+tg*f[7]; + t = f[7]*g[7]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + + // [8,8]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; + y[8] += tf*g[8]+tg*f[8]; + t = f[8]*g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // multiply count=120 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx +//------------------------------------------------------------------------------------- +float* XMSHMultiply4( _Out_writes_(16) float *y, + _In_reads_(16) const float *f, + _In_reads_(16) const float *g ) +{ + if ( !y || !f || !g ) + return nullptr; + + REAL tf,tg,t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1]+tg*f[1]; + t = f[1]*g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = 
+
+
+//-------------------------------------------------------------------------------------
+// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx
+//-------------------------------------------------------------------------------------
+float* XMSHMultiply4( _Out_writes_(16) float *y,
+                      _In_reads_(16) const float *f,
+                      _In_reads_(16) const float *g )
+{
+    if ( !y || !f || !g )
+        return nullptr;
+
+    REAL tf,tg,t;
+    // [0,0]: 0,
+    y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0];
+
+    // [1,1]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8];
+    y[1] = tf*g[1]+tg*f[1];
+    t = f[1]*g[1];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] = CONSTANT(-0.126156626101000010)*t;
+    y[8] = CONSTANT(-0.218509686119999990)*t;
+
+    // [1,4]: 3,13,15,
+    tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15];
+    tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15];
+    y[1] += tf*g[4]+tg*f[4];
+    y[4] = tf*g[1]+tg*f[1];
+    t = f[1]*g[4]+f[4]*g[1];
+    y[3] = CONSTANT(0.218509686114999990)*t;
+    y[13] = CONSTANT(-0.058399170082300000)*t;
+    y[15] = CONSTANT(-0.226179013157999990)*t;
+
+    // [1,5]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14];
+    y[1] += tf*g[5]+tg*f[5];
+    y[5] = tf*g[1]+tg*f[1];
+    t = f[1]*g[5]+f[5]*g[1];
+    y[2] = CONSTANT(0.218509686118000010)*t;
+    y[12] = CONSTANT(-0.143048168103000000)*t;
+    y[14] = CONSTANT(-0.184674390923000000)*t;
+
+    // [1,6]: 11,
+    tf = CONSTANT(0.202300659402999990)*f[11];
+    tg = CONSTANT(0.202300659402999990)*g[11];
+    y[1] += tf*g[6]+tg*f[6];
+    y[6] += tf*g[1]+tg*f[1];
+    t = f[1]*g[6]+f[6]*g[1];
+    y[11] = CONSTANT(0.202300659402999990)*t;
+
+    // [1,8]: 9,11,
+    tf = CONSTANT(0.226179013155000000)*f[9]+CONSTANT(0.058399170081799998)*f[11];
+    tg = CONSTANT(0.226179013155000000)*g[9]+CONSTANT(0.058399170081799998)*g[11];
+    y[1] += tf*g[8]+tg*f[8];
+    y[8] += tf*g[1]+tg*f[1];
+    t = f[1]*g[8]+f[8]*g[1];
+    y[9] = CONSTANT(0.226179013155000000)*t;
+    y[11] += CONSTANT(0.058399170081799998)*t;
+
+    // [2,2]: 0,6,
+    tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6];
+    tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6];
+    y[2] += tf*g[2]+tg*f[2];
+    t = f[2]*g[2];
+    y[0] += CONSTANT(0.282094795249000000)*t;
+    y[6] += CONSTANT(0.252313259986999990)*t;
+
+    // [2,6]: 12,
+    tf = CONSTANT(0.247766706973999990)*f[12];
+    tg = CONSTANT(0.247766706973999990)*g[12];
+    y[2] += tf*g[6]+tg*f[6];
+    y[6] += tf*g[2]+tg*f[2];
+    t = f[2]*g[6]+f[6]*g[2];
+    y[12] += CONSTANT(0.247766706973999990)*t;
+
+    // [3,3]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8];
+    y[3] += tf*g[3]+tg*f[3];
+    t = f[3]*g[3];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] += CONSTANT(-0.126156626101000010)*t;
+    y[8] += CONSTANT(0.218509686119999990)*t;
+
+    // [3,6]: 13,
+    tf = CONSTANT(0.202300659402999990)*f[13];
+    tg = CONSTANT(0.202300659402999990)*g[13];
+    y[3] += tf*g[6]+tg*f[6];
+    y[6] += tf*g[3]+tg*f[3];
+    t = f[3]*g[6]+f[6]*g[3];
+    y[13] += CONSTANT(0.202300659402999990)*t;
+
+    // [3,7]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14];
+    y[3] += tf*g[7]+tg*f[7];
+    y[7] = tf*g[3]+tg*f[3];
+    t = f[3]*g[7]+f[7]*g[3];
+    y[2] += CONSTANT(0.218509686118000010)*t;
+    y[12] += CONSTANT(-0.143048168103000000)*t;
+    y[14] += CONSTANT(0.184674390923000000)*t;
+
+    // [3,8]: 13,15,
+    tf = CONSTANT(-0.058399170081799998)*f[13]+CONSTANT(0.226179013155000000)*f[15];
+    tg = CONSTANT(-0.058399170081799998)*g[13]+CONSTANT(0.226179013155000000)*g[15];
+    y[3] += tf*g[8]+tg*f[8];
+    y[8] += tf*g[3]+tg*f[3];
+    t = f[3]*g[8]+f[8]*g[3];
+    y[13] += CONSTANT(-0.058399170081799998)*t;
+    y[15] += CONSTANT(0.226179013155000000)*t;
+
+    // [4,4]: 0,6,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6];
+    y[4] += tf*g[4]+tg*f[4];
+    t = f[4]*g[4];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+
+    // [4,5]: 7,
+    tf = CONSTANT(0.156078347226000000)*f[7];
+    tg = CONSTANT(0.156078347226000000)*g[7];
+    y[4] += tf*g[5]+tg*f[5];
+    y[5] += tf*g[4]+tg*f[4];
+    t = f[4]*g[5]+f[5]*g[4];
+    y[7] += CONSTANT(0.156078347226000000)*t;
+
+    // [4,9]: 3,13,
+    tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13];
+    tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13];
+    y[4] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[4]+tg*f[4];
+    t = f[4]*g[9]+f[9]*g[4];
+    y[3] += CONSTANT(0.226179013157999990)*t;
+    y[13] += CONSTANT(-0.094031597258400004)*t;
+
+    // [4,10]: 2,12,
+    tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12];
+    tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12];
+    y[4] += tf*g[10]+tg*f[10];
+    y[10] = tf*g[4]+tg*f[4];
+    t = f[4]*g[10]+f[10]*g[4];
+    y[2] += CONSTANT(0.184674390919999990)*t;
+    y[12] += CONSTANT(-0.188063194517999990)*t;
+
+    // [4,11]: 3,13,15,
+    tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15];
+    tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15];
+    y[4] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[4]+tg*f[4];
+    t = f[4]*g[11]+f[11]*g[4];
+    y[3] += CONSTANT(-0.058399170082300000)*t;
+    y[13] += CONSTANT(0.145673124078000010)*t;
+    y[15] += CONSTANT(0.094031597258400004)*t;
+
+    // [5,5]: 0,6,8,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8];
+    y[5] += tf*g[5]+tg*f[5];
+    t = f[5]*g[5];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[8] += CONSTANT(-0.156078347227999990)*t;
+
+    // [5,9]: 14,
+    tf = CONSTANT(0.148677009677999990)*f[14];
+    tg = CONSTANT(0.148677009677999990)*g[14];
+    y[5] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[5]+tg*f[5];
+    t = f[5]*g[9]+f[9]*g[5];
+    y[14] += CONSTANT(0.148677009677999990)*t;
+
+    // [5,10]: 3,13,15,
+    tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15];
+    tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15];
+    y[5] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[5]+tg*f[5];
+    t = f[5]*g[10]+f[10]*g[5];
+    y[3] += CONSTANT(0.184674390919999990)*t;
+    y[13] += CONSTANT(0.115164716490000000)*t;
+    y[15] += CONSTANT(-0.148677009678999990)*t;
+
+    // [5,11]: 2,12,14,
+    tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14];
+    tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14];
+    y[5] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[5]+tg*f[5];
+    t = f[5]*g[11]+f[11]*g[5];
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[14] += CONSTANT(-0.115164716491000000)*t;
+
+    // [6,6]: 0,6,
+    tf = CONSTANT(0.282094797560000000)*f[0];
+    tg = CONSTANT(0.282094797560000000)*g[0];
+    y[6] += tf*g[6]+tg*f[6];
+    t = f[6]*g[6];
+    y[0] += CONSTANT(0.282094797560000000)*t;
+    y[6] += CONSTANT(0.180223764527000010)*t;
+
+    // [7,7]: 6,0,8,
+    tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8];
+    tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8];
+    y[7] += tf*g[7]+tg*f[7];
+    t = f[7]*g[7];
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.156078347227999990)*t;
+
+    // [7,10]: 9,1,11,
+    tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11];
+    tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11];
+    y[7] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[7]+tg*f[7];
+    t = f[7]*g[10]+f[10]*g[7];
+    y[9] += CONSTANT(0.148677009678999990)*t;
+    y[1] += CONSTANT(0.184674390919999990)*t;
+    y[11] += CONSTANT(0.115164716490000000)*t;
+
+    // [7,13]: 12,2,14,
+    tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14];
+    tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14];
+    y[7] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[7]+tg*f[7];
+    t = f[7]*g[13]+f[13]*g[7];
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[14] += CONSTANT(0.115164716491000000)*t;
+
+    // [7,14]: 15,
+    tf = CONSTANT(0.148677009677999990)*f[15];
+    tg = CONSTANT(0.148677009677999990)*g[15];
+    y[7] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[7]+tg*f[7];
+    t = f[7]*g[14]+f[14]*g[7];
+    y[15] += CONSTANT(0.148677009677999990)*t;
+
+    // [8,8]: 0,6,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6];
+    y[8] += tf*g[8]+tg*f[8];
+    t = f[8]*g[8];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+
+    // [8,9]: 11,
+    tf = CONSTANT(-0.094031597259499999)*f[11];
+    tg = CONSTANT(-0.094031597259499999)*g[11];
+    y[8] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[8]+tg*f[8];
+    t = f[8]*g[9]+f[9]*g[8];
+    y[11] += CONSTANT(-0.094031597259499999)*t;
+
+    // [8,13]: 15,
+    tf = CONSTANT(-0.094031597259499999)*f[15];
+    tg = CONSTANT(-0.094031597259499999)*g[15];
+    y[8] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[8]+tg*f[8];
+    t = f[8]*g[13]+f[13]*g[8];
+    y[15] += CONSTANT(-0.094031597259499999)*t;
+
+    // [8,14]: 2,12,
+    tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12];
+    tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12];
+    y[8] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[8]+tg*f[8];
+    t = f[8]*g[14]+f[14]*g[8];
+    y[2] += CONSTANT(0.184674390919999990)*t;
+    y[12] += CONSTANT(-0.188063194517999990)*t;
+
+    // [9,9]: 6,0,
+    tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0];
+    tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0];
+    y[9] += tf*g[9]+tg*f[9];
+    t = f[9]*g[9];
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[0] += CONSTANT(0.282094791766999970)*t;
+
+    // [10,10]: 0,
+    tf = CONSTANT(0.282094791771999980)*f[0];
+    tg = CONSTANT(0.282094791771999980)*g[0];
+    y[10] += tf*g[10]+tg*f[10];
+    t = f[10]*g[10];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+
+    // [11,11]: 0,6,8,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8];
+    y[11] += tf*g[11]+tg*f[11];
+    t = f[11]*g[11];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[8] += CONSTANT(-0.145673124078999990)*t;
+
+    // [12,12]: 0,6,
+    tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6];
+    tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6];
+    y[12] += tf*g[12]+tg*f[12];
+    t = f[12]*g[12];
+    y[0] += CONSTANT(0.282094799871999980)*t;
+    y[6] += CONSTANT(0.168208852954000010)*t;
+
+    // [13,13]: 0,8,6,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6];
+    y[13] += tf*g[13]+tg*f[13];
+    t = f[13]*g[13];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.145673124078999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+
+    // [14,14]: 0,
+    tf = CONSTANT(0.282094791771999980)*f[0];
+    tg = CONSTANT(0.282094791771999980)*g[0];
+    y[14] += tf*g[14]+tg*f[14];
+    t = f[14]*g[14];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+
+    // [15,15]: 0,6,
+    tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6];
+    tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6];
+    y[15] += tf*g[15]+tg*f[15];
+    t = f[15]*g[15];
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+
+    // multiply count=399
+
+    return y;
+}
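+
+// Minimal usage sketch for XMSHMultiply4 (illustrative only -- how f and g get
+// filled is up to the caller and is not part of this file):
+//
+//     float f[16], g[16], y[16];
+//     // ... project two spherical functions into f and g elsewhere ...
+//     if ( !XMSHMultiply4( y, f, g ) )
+//     {
+//         // a null argument was passed; y was not written
+//     }
+//
+// On success the return value is simply y, so calls can be nested or chained.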
+
+
+//-------------------------------------------------------------------------------------
+// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232908.aspx
+//-------------------------------------------------------------------------------------
+float* XMSHMultiply5( _Out_writes_(25) float *y,
+                      _In_reads_(25) const float *f,
+                      _In_reads_(25) const float *g )
+{
+    if ( !y || !f || !g )
+        return nullptr;
+
+    REAL tf,tg,t;
+    // [0,0]: 0,
+    y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0];
+
+    // [1,1]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8];
+    y[1] = tf*g[1]+tg*f[1];
+    t = f[1]*g[1];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] = CONSTANT(-0.126156626101000010)*t;
+    y[8] = CONSTANT(-0.218509686119999990)*t;
+
+    // [1,4]: 3,13,15,
+    tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15];
+    tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15];
+    y[1] += tf*g[4]+tg*f[4];
+    y[4] = tf*g[1]+tg*f[1];
+    t = f[1]*g[4]+f[4]*g[1];
+    y[3] = CONSTANT(0.218509686114999990)*t;
+    y[13] = CONSTANT(-0.058399170082300000)*t;
+    y[15] = CONSTANT(-0.226179013157999990)*t;
+
+    // [1,5]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14];
+    y[1] += tf*g[5]+tg*f[5];
+    y[5] = tf*g[1]+tg*f[1];
+    t = f[1]*g[5]+f[5]*g[1];
+    y[2] = CONSTANT(0.218509686118000010)*t;
+    y[12] = CONSTANT(-0.143048168103000000)*t;
+    y[14] = CONSTANT(-0.184674390923000000)*t;
+
+    // [1,9]: 8,22,24,
+    tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(-0.230329432978999990)*f[24];
+    tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(-0.230329432978999990)*g[24];
+    y[1] += tf*g[9]+tg*f[9];
+    y[9] = tf*g[1]+tg*f[1];
+    t = f[1]*g[9]+f[9]*g[1];
+    y[8] += CONSTANT(0.226179013155000000)*t;
+    y[22] = CONSTANT(-0.043528171378199997)*t;
+    y[24] = CONSTANT(-0.230329432978999990)*t;
+
+    // [1,10]: 7,21,23,
+    tf = CONSTANT(0.184674390919999990)*f[7]+CONSTANT(-0.075393004386799994)*f[21]+CONSTANT(-0.199471140200000010)*f[23];
+    tg = CONSTANT(0.184674390919999990)*g[7]+CONSTANT(-0.075393004386799994)*g[21]+CONSTANT(-0.199471140200000010)*g[23];
+    y[1] += tf*g[10]+tg*f[10];
+    y[10] = tf*g[1]+tg*f[1];
+    t = f[1]*g[10]+f[10]*g[1];
+    y[7] = CONSTANT(0.184674390919999990)*t;
+    y[21] = CONSTANT(-0.075393004386799994)*t;
+    y[23] = CONSTANT(-0.199471140200000010)*t;
+
+    // [1,11]: 6,8,20,22,
+    tf = CONSTANT(0.202300659402999990)*f[6]+CONSTANT(0.058399170081799998)*f[8]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(-0.168583882836999990)*f[22];
+    tg = CONSTANT(0.202300659402999990)*g[6]+CONSTANT(0.058399170081799998)*g[8]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(-0.168583882836999990)*g[22];
+    y[1] += tf*g[11]+tg*f[11];
+    y[11] = tf*g[1]+tg*f[1];
+    t = f[1]*g[11]+f[11]*g[1];
+    y[6] += CONSTANT(0.202300659402999990)*t;
+    y[8] += CONSTANT(0.058399170081799998)*t;
+    y[20] = CONSTANT(-0.150786008773000000)*t;
+    y[22] += CONSTANT(-0.168583882836999990)*t;
+
+    // [1,12]: 19,
+    tf = CONSTANT(0.194663900273000010)*f[19];
+    tg = CONSTANT(0.194663900273000010)*g[19];
+    y[1] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[1]+tg*f[1];
+    t = f[1]*g[12]+f[12]*g[1];
+    y[19] = CONSTANT(0.194663900273000010)*t;
+
+    // [1,13]: 18,
+    tf = CONSTANT(0.168583882834000000)*f[18];
+    tg = CONSTANT(0.168583882834000000)*g[18];
+    y[1] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[1]+tg*f[1];
+    t = f[1]*g[13]+f[13]*g[1];
+    y[18] = CONSTANT(0.168583882834000000)*t;
+
+    // [1,14]: 17,19,
+    tf = CONSTANT(0.199471140196999990)*f[17]+CONSTANT(0.075393004386399995)*f[19];
+    tg = CONSTANT(0.199471140196999990)*g[17]+CONSTANT(0.075393004386399995)*g[19];
+    y[1] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[1]+tg*f[1];
+    t = f[1]*g[14]+f[14]*g[1];
+    y[17] = CONSTANT(0.199471140196999990)*t;
+    y[19] += CONSTANT(0.075393004386399995)*t;
+
+    // [1,15]: 16,18,
+    tf = CONSTANT(0.230329432973999990)*f[16]+CONSTANT(0.043528171377799997)*f[18];
+    tg = CONSTANT(0.230329432973999990)*g[16]+CONSTANT(0.043528171377799997)*g[18];
+    y[1] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[1]+tg*f[1];
+    t = f[1]*g[15]+f[15]*g[1];
+    y[16] = CONSTANT(0.230329432973999990)*t;
+    y[18] += CONSTANT(0.043528171377799997)*t;
+
+    // [2,2]: 0,6,
+    tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6];
+    tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6];
+    y[2] += tf*g[2]+tg*f[2];
+    t = f[2]*g[2];
+    y[0] += CONSTANT(0.282094795249000000)*t;
+    y[6] += CONSTANT(0.252313259986999990)*t;
+
+    // [2,10]: 4,18,
+    tf = CONSTANT(0.184674390919999990)*f[4]+CONSTANT(0.213243618621000000)*f[18];
+    tg = CONSTANT(0.184674390919999990)*g[4]+CONSTANT(0.213243618621000000)*g[18];
+    y[2] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[2]+tg*f[2];
+    t = f[2]*g[10]+f[10]*g[2];
+    y[4] += CONSTANT(0.184674390919999990)*t;
+    y[18] += CONSTANT(0.213243618621000000)*t;
+
+    // [2,12]: 6,20,
+    tf = CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20];
+    tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20];
+    y[2] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[2]+tg*f[2];
+    t = f[2]*g[12]+f[12]*g[2];
+    y[6] += CONSTANT(0.247766706973999990)*t;
+    y[20] += CONSTANT(0.246232537174000010)*t;
+
+    // [2,14]: 8,22,
+    tf = CONSTANT(0.184674390919999990)*f[8]+CONSTANT(0.213243618621000000)*f[22];
+    tg = CONSTANT(0.184674390919999990)*g[8]+CONSTANT(0.213243618621000000)*g[22];
+    y[2] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[2]+tg*f[2];
+    t = f[2]*g[14]+f[14]*g[2];
+    y[8] += CONSTANT(0.184674390919999990)*t;
+    y[22] += CONSTANT(0.213243618621000000)*t;
+
+    // [3,3]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8];
+    y[3] += tf*g[3]+tg*f[3];
+    t = f[3]*g[3];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] += CONSTANT(-0.126156626101000010)*t;
+    y[8] += CONSTANT(0.218509686119999990)*t;
+
+    // [3,7]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14];
+    y[3] += tf*g[7]+tg*f[7];
+    y[7] += tf*g[3]+tg*f[3];
+    t = f[3]*g[7]+f[7]*g[3];
+    y[2] += CONSTANT(0.218509686118000010)*t;
+    y[12] += CONSTANT(-0.143048168103000000)*t;
+    y[14] += CONSTANT(0.184674390923000000)*t;
+
+    // [3,9]: 4,16,18,
+    tf = CONSTANT(0.226179013157999990)*f[4]+CONSTANT(0.230329432973999990)*f[16]+CONSTANT(-0.043528171377799997)*f[18];
+    tg = CONSTANT(0.226179013157999990)*g[4]+CONSTANT(0.230329432973999990)*g[16]+CONSTANT(-0.043528171377799997)*g[18];
+    y[3] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[3]+tg*f[3];
+    t = f[3]*g[9]+f[9]*g[3];
+    y[4] += CONSTANT(0.226179013157999990)*t;
+    y[16] += CONSTANT(0.230329432973999990)*t;
+    y[18] += CONSTANT(-0.043528171377799997)*t;
+
+    // [3,10]: 5,17,19,
+    tf = CONSTANT(0.184674390919999990)*f[5]+CONSTANT(0.199471140200000010)*f[17]+CONSTANT(-0.075393004386799994)*f[19];
+    tg = CONSTANT(0.184674390919999990)*g[5]+CONSTANT(0.199471140200000010)*g[17]+CONSTANT(-0.075393004386799994)*g[19];
+    y[3] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[3]+tg*f[3];
+    t = f[3]*g[10]+f[10]*g[3];
+    y[5] += CONSTANT(0.184674390919999990)*t;
+    y[17] += CONSTANT(0.199471140200000010)*t;
+    y[19] += CONSTANT(-0.075393004386799994)*t;
+
+    // [3,12]: 21,
+    tf = CONSTANT(0.194663900273000010)*f[21];
+    tg = CONSTANT(0.194663900273000010)*g[21];
+    y[3] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[3]+tg*f[3];
+    t = f[3]*g[12]+f[12]*g[3];
+    y[21] += CONSTANT(0.194663900273000010)*t;
+
+    // [3,13]: 8,6,20,22,
+    tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22];
+    tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22];
+    y[3] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[3]+tg*f[3];
+    t = f[3]*g[13]+f[13]*g[3];
+    y[8] += CONSTANT(-0.058399170081799998)*t;
+    y[6] += CONSTANT(0.202300659402999990)*t;
+    y[20] += CONSTANT(-0.150786008773000000)*t;
+    y[22] += CONSTANT(0.168583882836999990)*t;
+
+    // [3,14]: 21,23,
+    tf = CONSTANT(-0.075393004386399995)*f[21]+CONSTANT(0.199471140196999990)*f[23];
+    tg = CONSTANT(-0.075393004386399995)*g[21]+CONSTANT(0.199471140196999990)*g[23];
+    y[3] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[3]+tg*f[3];
+    t = f[3]*g[14]+f[14]*g[3];
+    y[21] += CONSTANT(-0.075393004386399995)*t;
+    y[23] += CONSTANT(0.199471140196999990)*t;
+
+    // [3,15]: 8,22,24,
+    tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(0.230329432978999990)*f[24];
+    tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(0.230329432978999990)*g[24];
+    y[3] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[3]+tg*f[3];
+    t = f[3]*g[15]+f[15]*g[3];
+    y[8] += CONSTANT(0.226179013155000000)*t;
+    y[22] += CONSTANT(-0.043528171378199997)*t;
+    y[24] += CONSTANT(0.230329432978999990)*t;
+
+    // [4,4]: 0,6,20,24,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24];
+    y[4] += tf*g[4]+tg*f[4];
+    t = f[4]*g[4];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+    y[20] += CONSTANT(0.040299255967500003)*t;
+    y[24] += CONSTANT(-0.238413613505999990)*t;
+
+    // [4,5]: 7,21,23,
+    tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23];
+    tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23];
+    y[4] += tf*g[5]+tg*f[5];
+    y[5] += tf*g[4]+tg*f[4];
+    t = f[4]*g[5]+f[5]*g[4];
+    y[7] += CONSTANT(0.156078347226000000)*t;
+    y[21] += CONSTANT(-0.063718718434399996)*t;
+    y[23] += CONSTANT(-0.168583882835000000)*t;
+
+    // [4,11]: 3,13,15,
+    tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15];
+    tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15];
+    y[4] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[4]+tg*f[4];
+    t = f[4]*g[11]+f[11]*g[4];
+    y[3] += CONSTANT(-0.058399170082300000)*t;
+    y[13] += CONSTANT(0.145673124078000010)*t;
+    y[15] += CONSTANT(0.094031597258400004)*t;
+
+    // [4,16]: 8,22,
+    tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22];
+    tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22];
+    y[4] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[4]+tg*f[4];
+    t = f[4]*g[16]+f[16]*g[4];
+    y[8] += CONSTANT(0.238413613494000000)*t;
+    y[22] += CONSTANT(-0.075080816693699995)*t;
+
+    // [4,18]: 6,20,24,
+    tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24];
+    tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24];
+    y[4] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[4]+tg*f[4];
+    t = f[4]*g[18]+f[18]*g[4];
+    y[6] += CONSTANT(0.156078347226000000)*t;
+    y[20] += CONSTANT(-0.190364615029000010)*t;
+    y[24] += CONSTANT(0.075080816691500005)*t;
+
+    // [4,19]: 7,21,23,
+    tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23];
+    tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23];
+    y[4] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[4]+tg*f[4];
+    t = f[4]*g[19]+f[19]*g[4];
+    y[7] += CONSTANT(-0.063718718434399996)*t;
+    y[21] += CONSTANT(0.141889406569999990)*t;
+    y[23] += CONSTANT(0.112621225039000000)*t;
+
+    // [5,5]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22];
+    y[5] += tf*g[5]+tg*f[5];
+    t = f[5]*g[5];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[8] += CONSTANT(-0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(-0.180223751574000000)*t;
+
+    // [5,11]: 2,12,14,
+    tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14];
+    tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14];
+    y[5] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[5]+tg*f[5];
+    t = f[5]*g[11]+f[11]*g[5];
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[14] += CONSTANT(-0.115164716491000000)*t;
+
+    // [5,17]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24];
+    y[5] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[5]+tg*f[5];
+    t = f[5]*g[17]+f[17]*g[5];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(-0.140463346189000000)*t;
+
+    // [5,18]: 7,21,23,
+    tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23];
+    tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23];
+    y[5] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[5]+tg*f[5];
+    t = f[5]*g[18]+f[18]*g[5];
+    y[7] += CONSTANT(0.180223751571000010)*t;
+    y[21] += CONSTANT(0.090297865407399994)*t;
+    y[23] += CONSTANT(-0.132725386549000010)*t;
+
+    // [5,19]: 6,8,20,22,
+    tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22];
+    tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22];
+    y[5] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[5]+tg*f[5];
+    t = f[5]*g[19]+f[19]*g[5];
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[8] += CONSTANT(0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[22] += CONSTANT(-0.090297865408399999)*t;
+
+    // [6,6]: 0,6,20,
+    tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20];
+    tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20];
+    y[6] += tf*g[6]+tg*f[6];
+    t = f[6]*g[6];
+    y[0] += CONSTANT(0.282094797560000000)*t;
+    y[6] += CONSTANT(0.180223764527000010)*t;
+    y[20] += CONSTANT(0.241795553185999990)*t;
+
+    // [7,7]: 6,0,8,20,22,
+    tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22];
+    tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22];
+    y[7] += tf*g[7]+tg*f[7];
+    t = f[7]*g[7];
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(0.180223751574000000)*t;
+
+    // [7,13]: 12,2,14,
+    tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14];
+    tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14];
+    y[7] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[7]+tg*f[7];
+    t = f[7]*g[13]+f[13]*g[7];
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[14] += CONSTANT(0.115164716491000000)*t;
+
+    // [7,17]: 16,4,18,
+    tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18];
+    tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18];
+    y[7] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[7]+tg*f[7];
+    t = f[7]*g[17]+f[17]*g[7];
+    y[16] += CONSTANT(0.140463346187999990)*t;
+    y[4] += CONSTANT(0.168583882835000000)*t;
+    y[18] += CONSTANT(0.132725386549000010)*t;
+
+    // [7,21]: 8,20,6,22,
+    tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22];
+    tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22];
+    y[7] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[7]+tg*f[7];
+    t = f[7]*g[21]+f[21]*g[7];
+    y[8] += CONSTANT(-0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[22] += CONSTANT(0.090297865408399999)*t;
+
+    // [7,23]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24];
+    y[7] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[7]+tg*f[7];
+    t = f[7]*g[23]+f[23]*g[7];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(0.140463346189000000)*t;
+
+    // [8,8]: 0,6,20,24,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24];
+    y[8] += tf*g[8]+tg*f[8];
+    t = f[8]*g[8];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+    y[20] += CONSTANT(0.040299255967500003)*t;
+    y[24] += CONSTANT(0.238413613505999990)*t;
+
+    // [8,22]: 6,20,24,
+    tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24];
+    tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24];
+    y[8] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[8]+tg*f[8];
+    t = f[8]*g[22]+f[22]*g[8];
+    y[6] += CONSTANT(0.156078347226000000)*t;
+    y[20] += CONSTANT(-0.190364615029000010)*t;
+    y[24] += CONSTANT(-0.075080816691500005)*t;
+
+    // [9,9]: 6,0,20,
+    tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20];
+    y[9] += tf*g[9]+tg*f[9];
+    t = f[9]*g[9];
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [9,10]: 7,21,
+    tf = CONSTANT(0.148677009678999990)*f[7]+CONSTANT(-0.099322584599600000)*f[21];
+    tg = CONSTANT(0.148677009678999990)*g[7]+CONSTANT(-0.099322584599600000)*g[21];
+    y[9] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[9]+tg*f[9];
+    t = f[9]*g[10]+f[10]*g[9];
+    y[7] += CONSTANT(0.148677009678999990)*t;
+    y[21] += CONSTANT(-0.099322584599600000)*t;
+
+    // [9,11]: 8,22,24,
+    tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(0.117520066950999990)*f[24];
+    tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(0.117520066950999990)*g[24];
+    y[9] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[9]+tg*f[9];
+    t = f[9]*g[11]+f[11]*g[9];
+    y[8] += CONSTANT(-0.094031597259499999)*t;
+    y[22] += CONSTANT(0.133255230518000010)*t;
+    y[24] += CONSTANT(0.117520066950999990)*t;
+
+    // [9,13]: 4,16,18,
+    tf = CONSTANT(-0.094031597258400004)*f[4]+CONSTANT(-0.117520066953000000)*f[16]+CONSTANT(0.133255230519000010)*f[18];
+    tg = CONSTANT(-0.094031597258400004)*g[4]+CONSTANT(-0.117520066953000000)*g[16]+CONSTANT(0.133255230519000010)*g[18];
+    y[9] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[9]+tg*f[9];
+    t = f[9]*g[13]+f[13]*g[9];
+    y[4] += CONSTANT(-0.094031597258400004)*t;
+    y[16] += CONSTANT(-0.117520066953000000)*t;
+    y[18] += CONSTANT(0.133255230519000010)*t;
+
+    // [9,14]: 5,19,
+    tf = CONSTANT(0.148677009677999990)*f[5]+CONSTANT(-0.099322584600699995)*f[19];
+    tg = CONSTANT(0.148677009677999990)*g[5]+CONSTANT(-0.099322584600699995)*g[19];
+    y[9] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[9]+tg*f[9];
+    t = f[9]*g[14]+f[14]*g[9];
+    y[5] += CONSTANT(0.148677009677999990)*t;
+    y[19] += CONSTANT(-0.099322584600699995)*t;
+
+    // [9,17]: 2,12,
+    tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12];
+    tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12];
+    y[9] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[9]+tg*f[9];
+    t = f[9]*g[17]+f[17]*g[9];
+    y[2] += CONSTANT(0.162867503964999990)*t;
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+
+    // [10,10]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24];
+    y[10] += tf*g[10]+tg*f[10];
+    t = f[10]*g[10];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(-0.151717754049000010)*t;
+
+    // [10,11]: 7,21,23,
+    tf = CONSTANT(0.115164716490000000)*f[7]+CONSTANT(0.102579924281000000)*f[21]+CONSTANT(-0.067850242288900006)*f[23];
+    tg = CONSTANT(0.115164716490000000)*g[7]+CONSTANT(0.102579924281000000)*g[21]+CONSTANT(-0.067850242288900006)*g[23];
+    y[10] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[10]+tg*f[10];
+    t = f[10]*g[11]+f[11]*g[10];
+    y[7] += CONSTANT(0.115164716490000000)*t;
+    y[21] += CONSTANT(0.102579924281000000)*t;
+    y[23] += CONSTANT(-0.067850242288900006)*t;
+
+    // [10,12]: 4,18,
+    tf = CONSTANT(-0.188063194517999990)*f[4]+CONSTANT(-0.044418410173299998)*f[18];
+    tg = CONSTANT(-0.188063194517999990)*g[4]+CONSTANT(-0.044418410173299998)*g[18];
+    y[10] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[10]+tg*f[10];
+    t = f[10]*g[12]+f[12]*g[10];
+    y[4] += CONSTANT(-0.188063194517999990)*t;
+    y[18] += CONSTANT(-0.044418410173299998)*t;
+
+    // [10,13]: 5,17,19,
+    tf = CONSTANT(0.115164716490000000)*f[5]+CONSTANT(0.067850242288900006)*f[17]+CONSTANT(0.102579924281000000)*f[19];
+    tg = CONSTANT(0.115164716490000000)*g[5]+CONSTANT(0.067850242288900006)*g[17]+CONSTANT(0.102579924281000000)*g[19];
+    y[10] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[10]+tg*f[10];
+    t = f[10]*g[13]+f[13]*g[10];
+    y[5] += CONSTANT(0.115164716490000000)*t;
+    y[17] += CONSTANT(0.067850242288900006)*t;
+    y[19] += CONSTANT(0.102579924281000000)*t;
+
+    // [10,14]: 16,
+    tf = CONSTANT(0.151717754044999990)*f[16];
+    tg = CONSTANT(0.151717754044999990)*g[16];
+    y[10] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[10]+tg*f[10];
+    t = f[10]*g[14]+f[14]*g[10];
+    y[16] += CONSTANT(0.151717754044999990)*t;
+
+    // [10,15]: 5,19,
+    tf = CONSTANT(-0.148677009678999990)*f[5]+CONSTANT(0.099322584599600000)*f[19];
+    tg = CONSTANT(-0.148677009678999990)*g[5]+CONSTANT(0.099322584599600000)*g[19];
+    y[10] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[10]+tg*f[10];
+    t = f[10]*g[15]+f[15]*g[10];
+    y[5] += CONSTANT(-0.148677009678999990)*t;
+    y[19] += CONSTANT(0.099322584599600000)*t;
+
+    // [11,11]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22];
+    y[11] += tf*g[11]+tg*f[11];
+    t = f[11]*g[11];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[8] += CONSTANT(-0.145673124078999990)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(-0.114687841910000000)*t;
+
+    // [11,14]: 17,
+    tf = CONSTANT(0.067850242288500007)*f[17];
+    tg = CONSTANT(0.067850242288500007)*g[17];
+    y[11] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[11]+tg*f[11];
+    t = f[11]*g[14]+f[14]*g[11];
+    y[17] += CONSTANT(0.067850242288500007)*t;
+
+    // [11,15]: 16,
+    tf = CONSTANT(-0.117520066953000000)*f[16];
+    tg = CONSTANT(-0.117520066953000000)*g[16];
+    y[11] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[11]+tg*f[11];
+    t = f[11]*g[15]+f[15]*g[11];
+    y[16] += CONSTANT(-0.117520066953000000)*t;
+
+    // [11,18]: 3,13,15,
+    tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15];
+    tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15];
+    y[11] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[11]+tg*f[11];
+    t = f[11]*g[18]+f[18]*g[11];
+    y[3] += CONSTANT(0.168583882834000000)*t;
+    y[13] += CONSTANT(0.114687841909000000)*t;
+    y[15] += CONSTANT(-0.133255230519000010)*t;
+
+    // [11,19]: 2,14,12,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12];
+    y[11] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[11]+tg*f[11];
+    t = f[11]*g[19]+f[19]*g[11];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[14] += CONSTANT(-0.102579924282000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+
+    // [12,12]: 0,6,20,
+    tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20];
+    tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20];
+    y[12] += tf*g[12]+tg*f[12];
+    t = f[12]*g[12];
+    y[0] += CONSTANT(0.282094799871999980)*t;
+    y[6] += CONSTANT(0.168208852954000010)*t;
+    y[20] += CONSTANT(0.153869910786000010)*t;
+
+    // [12,14]: 8,22,
+    tf = CONSTANT(-0.188063194517999990)*f[8]+CONSTANT(-0.044418410173299998)*f[22];
+    tg = CONSTANT(-0.188063194517999990)*g[8]+CONSTANT(-0.044418410173299998)*g[22];
+    y[12] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[12]+tg*f[12];
+    t = f[12]*g[14]+f[14]*g[12];
+    y[8] += CONSTANT(-0.188063194517999990)*t;
+    y[22] += CONSTANT(-0.044418410173299998)*t;
+
+    // [13,13]: 0,8,6,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22];
+    y[13] += tf*g[13]+tg*f[13];
+    t = f[13]*g[13];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.145673124078999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(0.114687841910000000)*t;
+
+    // [13,14]: 23,
+    tf = CONSTANT(0.067850242288500007)*f[23];
+    tg = CONSTANT(0.067850242288500007)*g[23];
+    y[13] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[13]+tg*f[13];
+    t = f[13]*g[14]+f[14]*g[13];
+    y[23] += CONSTANT(0.067850242288500007)*t;
+
+    // [13,15]: 8,22,24,
+    tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(-0.117520066950999990)*f[24];
+    tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(-0.117520066950999990)*g[24];
+    y[13] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[13]+tg*f[13];
+    t = f[13]*g[15]+f[15]*g[13];
+    y[8] += CONSTANT(-0.094031597259499999)*t;
+    y[22] += CONSTANT(0.133255230518000010)*t;
+    y[24] += CONSTANT(-0.117520066950999990)*t;
+
+    // [13,21]: 2,12,14,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14];
+    y[13] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[13]+tg*f[13];
+    t = f[13]*g[21]+f[21]*g[13];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+    y[14] += CONSTANT(0.102579924282000000)*t;
+
+    // [14,14]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24];
+    y[14] += tf*g[14]+tg*f[14];
+    t = f[14]*g[14];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(0.151717754049000010)*t;
+
+    // [14,15]: 7,21,
+    tf = CONSTANT(0.148677009677999990)*f[7]+CONSTANT(-0.099322584600699995)*f[21];
+    tg = CONSTANT(0.148677009677999990)*g[7]+CONSTANT(-0.099322584600699995)*g[21];
+    y[14] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[14]+tg*f[14];
+    t = f[14]*g[15]+f[15]*g[14];
+    y[7] += CONSTANT(0.148677009677999990)*t;
+    y[21] += CONSTANT(-0.099322584600699995)*t;
+
+    // [15,15]: 0,6,20,
+    tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20];
+    y[15] += tf*g[15]+tg*f[15];
+    t = f[15]*g[15];
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [15,23]: 12,2,
+    tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2];
+    tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2];
+    y[15] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[15]+tg*f[15];
+    t = f[15]*g[23]+f[23]*g[15];
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+    y[2] += CONSTANT(0.162867503964999990)*t;
+
+    // [16,16]: 0,6,20,
+    tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20];
+    tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20];
+    y[16] += tf*g[16]+tg*f[16];
+    t = f[16]*g[16];
+    y[0] += CONSTANT(0.282094791763999990)*t;
+    y[6] += CONSTANT(-0.229375683829000000)*t;
+    y[20] += CONSTANT(0.106525305981000000)*t;
+
+    // [16,18]: 8,22,
+    tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22];
+    tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22];
+    y[16] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[16]+tg*f[16];
+    t = f[16]*g[18]+f[18]*g[16];
+    y[8] += CONSTANT(-0.075080816693699995)*t;
+    y[22] += CONSTANT(0.135045473380000000)*t;
+
+    // [16,23]: 19,5,
+    tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5];
+    tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5];
+    y[16] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[16]+tg*f[16];
+    t = f[16]*g[23]+f[23]*g[16];
+    y[19] += CONSTANT(-0.119098912754999990)*t;
+    y[5] += CONSTANT(0.140463346187999990)*t;
+
+    // [17,17]: 0,6,20,
+    tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20];
+    tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20];
+    y[17] += tf*g[17]+tg*f[17];
+    t = f[17]*g[17];
+    y[0] += CONSTANT(0.282094791768999990)*t;
+    y[6] += CONSTANT(-0.057343920955899998)*t;
+    y[20] += CONSTANT(-0.159787958979000000)*t;
+
+    // [17,19]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24];
+    y[17] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[17]+tg*f[17];
+    t = f[17]*g[19]+f[19]*g[17];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(0.119098912753000000)*t;
+
+    // [17,21]: 16,4,18,
+    tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18];
+    tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18];
+    y[17] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[17]+tg*f[17];
+    t = f[17]*g[21]+f[21]*g[17];
+    y[16] += CONSTANT(-0.119098912754999990)*t;
+    y[4] += CONSTANT(-0.112621225039000000)*t;
+    y[18] += CONSTANT(0.045015157794399997)*t;
+
+    // [18,18]: 6,0,20,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24];
+    y[18] += tf*g[18]+tg*f[18];
+    t = f[18]*g[18];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[24] += CONSTANT(-0.135045473384000000)*t;
+
+    // [18,19]: 7,21,23,
+    tf = CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23];
+    tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23];
+    y[18] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[18]+tg*f[18];
+    t = f[18]*g[19]+f[19]*g[18];
+    y[7] += CONSTANT(0.090297865407399994)*t;
+    y[21] += CONSTANT(0.102084782359000000)*t;
+    y[23] += CONSTANT(-0.045015157794399997)*t;
+
+    // [19,19]: 6,8,0,20,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22];
+    y[19] += tf*g[19]+tg*f[19];
+    t = f[19]*g[19];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[8] += CONSTANT(-0.141889406570999990)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[22] += CONSTANT(-0.102084782360000000)*t;
+
+    // [20,20]: 6,0,20,
+    tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0];
+    tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0];
+    y[20] += tf*g[20]+tg*f[20];
+    t = f[20]*g[20];
+    y[6] += CONSTANT(0.163839797503000010)*t;
+    y[0] += CONSTANT(0.282094802232000010)*t;
+    y[20] += CONSTANT(0.136961139005999990)*t;
+
+    // [21,21]: 6,20,0,8,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22];
+    y[21] += tf*g[21]+tg*f[21];
+    t = f[21]*g[21];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.141889406570999990)*t;
+    y[22] += CONSTANT(0.102084782360000000)*t;
+
+    // [21,23]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24];
+    y[21] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[21]+tg*f[21];
+    t = f[21]*g[23]+f[23]*g[21];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(-0.119098912753000000)*t;
+
+    // [22,22]: 6,20,0,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24];
+    y[22] += tf*g[22]+tg*f[22];
+    t = f[22]*g[22];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[24] += CONSTANT(0.135045473384000000)*t;
+
+    // [23,23]: 6,20,0,
+    tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0];
+    tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0];
+    y[23] += tf*g[23]+tg*f[23];
+    t = f[23]*g[23];
+    y[6] += CONSTANT(-0.057343920955899998)*t;
+    y[20] += CONSTANT(-0.159787958979000000)*t;
+    y[0] += CONSTANT(0.282094791768999990)*t;
+
+    // [24,24]: 6,0,20,
+    tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20];
+    tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20];
+    y[24] += tf*g[24]+tg*f[24];
+    t = f[24]*g[24];
+    y[6] += CONSTANT(-0.229375683829000000)*t;
+    y[0] += CONSTANT(0.282094791763999990)*t;
+    y[20] += CONSTANT(0.106525305981000000)*t;
+
+    // multiply count=1135
+
+    return y;
+}
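+
+// The kernels grow quickly with order: the emitted scalar multiply counts are
+// 120 for XMSHMultiply3, 399 for XMSHMultiply4, and 1135 for XMSHMultiply5
+// (see the "multiply count" comments), so callers doing many products per
+// frame may prefer the lowest order that meets their quality needs.
+// XMSHMultiply6 below follows the same pattern over 36 coefficients.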
CONSTANT(0.230329432973999990)*f[15]+CONSTANT(-0.034723468517399998)*f[33]+CONSTANT(-0.232932108051999990)*f[35]; + tg = CONSTANT(0.230329432973999990)*g[15]+CONSTANT(-0.034723468517399998)*g[33]+CONSTANT(-0.232932108051999990)*g[35]; + y[1] += tf*g[16]+tg*f[16]; + y[16] = tf*g[1]+tg*f[1]; + t = f[1]*g[16]+f[16]*g[1]; + y[15] += CONSTANT(0.230329432973999990)*t; + y[33] = CONSTANT(-0.034723468517399998)*t; + y[35] = CONSTANT(-0.232932108051999990)*t; + + // [1,18]: 15,13,31,33, + tf = CONSTANT(0.043528171377799997)*f[15]+CONSTANT(0.168583882834000000)*f[13]+CONSTANT(-0.085054779966799998)*f[31]+CONSTANT(-0.183739324705999990)*f[33]; + tg = CONSTANT(0.043528171377799997)*g[15]+CONSTANT(0.168583882834000000)*g[13]+CONSTANT(-0.085054779966799998)*g[31]+CONSTANT(-0.183739324705999990)*g[33]; + y[1] += tf*g[18]+tg*f[18]; + y[18] = tf*g[1]+tg*f[1]; + t = f[1]*g[18]+f[18]*g[1]; + y[15] += CONSTANT(0.043528171377799997)*t; + y[13] += CONSTANT(0.168583882834000000)*t; + y[31] = CONSTANT(-0.085054779966799998)*t; + y[33] += CONSTANT(-0.183739324705999990)*t; + + // [1,19]: 14,12,30,32, + tf = CONSTANT(0.075393004386399995)*f[14]+CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(-0.159122922869999990)*f[32]; + tg = CONSTANT(0.075393004386399995)*g[14]+CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(-0.159122922869999990)*g[32]; + y[1] += tf*g[19]+tg*f[19]; + y[19] = tf*g[1]+tg*f[1]; + t = f[1]*g[19]+f[19]*g[1]; + y[14] = CONSTANT(0.075393004386399995)*t; + y[12] += CONSTANT(0.194663900273000010)*t; + y[30] = CONSTANT(-0.155288072037000010)*t; + y[32] = CONSTANT(-0.159122922869999990)*t; + + // [1,24]: 9,25,27, + tf = CONSTANT(-0.230329432978999990)*f[9]+CONSTANT(0.232932108049000000)*f[25]+CONSTANT(0.034723468517100002)*f[27]; + tg = CONSTANT(-0.230329432978999990)*g[9]+CONSTANT(0.232932108049000000)*g[25]+CONSTANT(0.034723468517100002)*g[27]; + y[1] += tf*g[24]+tg*f[24]; + y[24] = tf*g[1]+tg*f[1]; + t = f[1]*g[24]+f[24]*g[1]; + y[9] = CONSTANT(-0.230329432978999990)*t; + y[25] = CONSTANT(0.232932108049000000)*t; + y[27] = CONSTANT(0.034723468517100002)*t; + + // [1,29]: 22,20, + tf = CONSTANT(0.085054779965999999)*f[22]+CONSTANT(0.190188269815000010)*f[20]; + tg = CONSTANT(0.085054779965999999)*g[22]+CONSTANT(0.190188269815000010)*g[20]; + y[1] += tf*g[29]+tg*f[29]; + y[29] = tf*g[1]+tg*f[1]; + t = f[1]*g[29]+f[29]*g[1]; + y[22] += CONSTANT(0.085054779965999999)*t; + y[20] += CONSTANT(0.190188269815000010)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2]+tg*f[2]; + t = f[2]*g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,12]: 6,20, + tf = CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20]; + tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20]; + y[2] += tf*g[12]+tg*f[12]; + y[12] += tf*g[2]+tg*f[2]; + t = f[2]*g[12]+f[12]*g[2]; + y[6] += CONSTANT(0.247766706973999990)*t; + y[20] += CONSTANT(0.246232537174000010)*t; + + // [2,20]: 30, + tf = CONSTANT(0.245532020560000010)*f[30]; + tg = CONSTANT(0.245532020560000010)*g[30]; + y[2] += tf*g[20]+tg*f[20]; + y[20] += tf*g[2]+tg*f[2]; + t = f[2]*g[20]+f[20]*g[2]; + y[30] += CONSTANT(0.245532020560000010)*t; + + // [3,3]: 0,6,8, + tf = 
CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3]+tg*f[3]; + t = f[3]*g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,7]: 2,12, + tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]; + tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]; + y[3] += tf*g[7]+tg*f[7]; + y[7] = tf*g[3]+tg*f[3]; + t = f[3]*g[7]+f[7]*g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + + // [3,13]: 8,6,20,22, + tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22]; + tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22]; + y[3] += tf*g[13]+tg*f[13]; + y[13] += tf*g[3]+tg*f[3]; + t = f[3]*g[13]+f[13]*g[3]; + y[8] += CONSTANT(-0.058399170081799998)*t; + y[6] += CONSTANT(0.202300659402999990)*t; + y[20] += CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(0.168583882836999990)*t; + + // [3,16]: 9,25,27, + tf = CONSTANT(0.230329432973999990)*f[9]+CONSTANT(0.232932108051999990)*f[25]+CONSTANT(-0.034723468517399998)*f[27]; + tg = CONSTANT(0.230329432973999990)*g[9]+CONSTANT(0.232932108051999990)*g[25]+CONSTANT(-0.034723468517399998)*g[27]; + y[3] += tf*g[16]+tg*f[16]; + y[16] += tf*g[3]+tg*f[3]; + t = f[3]*g[16]+f[16]*g[3]; + y[9] += CONSTANT(0.230329432973999990)*t; + y[25] += CONSTANT(0.232932108051999990)*t; + y[27] += CONSTANT(-0.034723468517399998)*t; + + // [3,21]: 12,14,30,32, + tf = CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.075393004386399995)*f[14]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(0.159122922869999990)*f[32]; + tg = CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.075393004386399995)*g[14]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(0.159122922869999990)*g[32]; + y[3] += tf*g[21]+tg*f[21]; + y[21] = tf*g[3]+tg*f[3]; + t = f[3]*g[21]+f[21]*g[3]; + y[12] += CONSTANT(0.194663900273000010)*t; + y[14] += CONSTANT(-0.075393004386399995)*t; + y[30] += CONSTANT(-0.155288072037000010)*t; + y[32] += CONSTANT(0.159122922869999990)*t; + + // [3,24]: 15,33,35, + tf = CONSTANT(0.230329432978999990)*f[15]+CONSTANT(-0.034723468517100002)*f[33]+CONSTANT(0.232932108049000000)*f[35]; + tg = CONSTANT(0.230329432978999990)*g[15]+CONSTANT(-0.034723468517100002)*g[33]+CONSTANT(0.232932108049000000)*g[35]; + y[3] += tf*g[24]+tg*f[24]; + y[24] += tf*g[3]+tg*f[3]; + t = f[3]*g[24]+f[24]*g[3]; + y[15] += CONSTANT(0.230329432978999990)*t; + y[33] += CONSTANT(-0.034723468517100002)*t; + y[35] += CONSTANT(0.232932108049000000)*t; + + // [3,31]: 20,22, + tf = CONSTANT(0.190188269815000010)*f[20]+CONSTANT(-0.085054779965999999)*f[22]; + tg = CONSTANT(0.190188269815000010)*g[20]+CONSTANT(-0.085054779965999999)*g[22]; + y[3] += tf*g[31]+tg*f[31]; + y[31] += tf*g[3]+tg*f[3]; + t = f[3]*g[31]+f[31]*g[3]; + y[20] += CONSTANT(0.190188269815000010)*t; + y[22] += CONSTANT(-0.085054779965999999)*t; + + // [4,4]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24]; + tg = 
CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24]; + y[4] += tf*g[4]+tg*f[4]; + t = f[4]*g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(-0.238413613505999990)*t; + + // [4,5]: 7,21,23, + tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23]; + tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23]; + y[4] += tf*g[5]+tg*f[5]; + y[5] += tf*g[4]+tg*f[4]; + t = f[4]*g[5]+f[5]*g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + y[21] += CONSTANT(-0.063718718434399996)*t; + y[23] = CONSTANT(-0.168583882835000000)*t; + + // [4,9]: 3,13,31,35, + tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13]+CONSTANT(0.016943317729299998)*f[31]+CONSTANT(-0.245532000542000000)*f[35]; + tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13]+CONSTANT(0.016943317729299998)*g[31]+CONSTANT(-0.245532000542000000)*g[35]; + y[4] += tf*g[9]+tg*f[9]; + y[9] += tf*g[4]+tg*f[4]; + t = f[4]*g[9]+f[9]*g[4]; + y[3] += CONSTANT(0.226179013157999990)*t; + y[13] += CONSTANT(-0.094031597258400004)*t; + y[31] += CONSTANT(0.016943317729299998)*t; + y[35] += CONSTANT(-0.245532000542000000)*t; + + // [4,10]: 2,12,30,34, + tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(-0.190188269816000010)*f[34]; + tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(-0.190188269816000010)*g[34]; + y[4] += tf*g[10]+tg*f[10]; + y[10] = tf*g[4]+tg*f[4]; + t = f[4]*g[10]+f[10]*g[4]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + y[30] += CONSTANT(0.053579475144400000)*t; + y[34] = CONSTANT(-0.190188269816000010)*t; + + // [4,11]: 3,13,15,31,33, + tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]+CONSTANT(-0.065621187395699998)*f[31]+CONSTANT(-0.141757966610000010)*f[33]; + tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]+CONSTANT(-0.065621187395699998)*g[31]+CONSTANT(-0.141757966610000010)*g[33]; + y[4] += tf*g[11]+tg*f[11]; + y[11] += tf*g[4]+tg*f[4]; + t = f[4]*g[11]+f[11]*g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + y[31] += CONSTANT(-0.065621187395699998)*t; + y[33] += CONSTANT(-0.141757966610000010)*t; + + // [4,16]: 8,22, + tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22]; + tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22]; + y[4] += tf*g[16]+tg*f[16]; + y[16] += tf*g[4]+tg*f[4]; + t = f[4]*g[16]+f[16]*g[4]; + y[8] += CONSTANT(0.238413613494000000)*t; + y[22] += CONSTANT(-0.075080816693699995)*t; + + // [4,18]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24]; + y[4] += tf*g[18]+tg*f[18]; + y[18] += tf*g[4]+tg*f[4]; + t = f[4]*g[18]+f[18]*g[4]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] 
+    y[24] += CONSTANT(0.075080816691500005)*t;
+
+    // [4,19]: 7,21,23,
+    tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23];
+    tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23];
+    y[4] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[4]+tg*f[4];
+    t = f[4]*g[19]+f[19]*g[4];
+    y[7] += CONSTANT(-0.063718718434399996)*t;
+    y[21] += CONSTANT(0.141889406569999990)*t;
+    y[23] += CONSTANT(0.112621225039000000)*t;
+
+    // [4,25]: 15,33,
+    tf = CONSTANT(0.245532000542000000)*f[15]+CONSTANT(-0.062641347680800000)*f[33];
+    tg = CONSTANT(0.245532000542000000)*g[15]+CONSTANT(-0.062641347680800000)*g[33];
+    y[4] += tf*g[25]+tg*f[25];
+    y[25] += tf*g[4]+tg*f[4];
+    t = f[4]*g[25]+f[25]*g[4];
+    y[15] += CONSTANT(0.245532000542000000)*t;
+    y[33] += CONSTANT(-0.062641347680800000)*t;
+
+    // [4,26]: 14,32,
+    tf = CONSTANT(0.190188269806999990)*f[14]+CONSTANT(-0.097043558542400002)*f[32];
+    tg = CONSTANT(0.190188269806999990)*g[14]+CONSTANT(-0.097043558542400002)*g[32];
+    y[4] += tf*g[26]+tg*f[26];
+    y[26] = tf*g[4]+tg*f[4];
+    t = f[4]*g[26]+f[26]*g[4];
+    y[14] += CONSTANT(0.190188269806999990)*t;
+    y[32] += CONSTANT(-0.097043558542400002)*t;
+
+    // [4,27]: 13,31,35,
+    tf = CONSTANT(0.141757966610000010)*f[13]+CONSTANT(-0.121034582549000000)*f[31]+CONSTANT(0.062641347680800000)*f[35];
+    tg = CONSTANT(0.141757966610000010)*g[13]+CONSTANT(-0.121034582549000000)*g[31]+CONSTANT(0.062641347680800000)*g[35];
+    y[4] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[4]+tg*f[4];
+    t = f[4]*g[27]+f[27]*g[4];
+    y[13] += CONSTANT(0.141757966610000010)*t;
+    y[31] += CONSTANT(-0.121034582549000000)*t;
+    y[35] += CONSTANT(0.062641347680800000)*t;
+
+    // [4,28]: 12,30,34,
+    tf = CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.097043558538899996)*f[34];
+    tg = CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.097043558538899996)*g[34];
+    y[4] += tf*g[28]+tg*f[28];
+    y[28] = tf*g[4]+tg*f[4];
+    t = f[4]*g[28]+f[28]*g[4];
+    y[12] += CONSTANT(0.141757966609000000)*t;
+    y[30] += CONSTANT(-0.191372478254000000)*t;
+    y[34] += CONSTANT(0.097043558538899996)*t;
+
+    // [4,29]: 13,15,31,33,
+    tf = CONSTANT(-0.065621187395699998)*f[13]+CONSTANT(-0.016943317729299998)*f[15]+CONSTANT(0.140070311613999990)*f[31]+CONSTANT(0.121034582549000000)*f[33];
+    tg = CONSTANT(-0.065621187395699998)*g[13]+CONSTANT(-0.016943317729299998)*g[15]+CONSTANT(0.140070311613999990)*g[31]+CONSTANT(0.121034582549000000)*g[33];
+    y[4] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[4]+tg*f[4];
+    t = f[4]*g[29]+f[29]*g[4];
+    y[13] += CONSTANT(-0.065621187395699998)*t;
+    y[15] += CONSTANT(-0.016943317729299998)*t;
+    y[31] += CONSTANT(0.140070311613999990)*t;
+    y[33] += CONSTANT(0.121034582549000000)*t;
+
+    // [5,5]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22];
+    y[5] += tf*g[5]+tg*f[5];
+    t = f[5]*g[5];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[8] += CONSTANT(-0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(-0.180223751574000000)*t;
+
+    // [5,10]: 3,13,15,31,33,
+    tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15]+CONSTANT(-0.083004965974099995)*f[31]+CONSTANT(-0.179311220383999990)*f[33];
+    tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15]+CONSTANT(-0.083004965974099995)*g[31]+CONSTANT(-0.179311220383999990)*g[33];
+    y[5] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[5]+tg*f[5];
+    t = f[5]*g[10]+f[10]*g[5];
+    y[3] += CONSTANT(0.184674390919999990)*t;
+    y[13] += CONSTANT(0.115164716490000000)*t;
+    y[15] += CONSTANT(-0.148677009678999990)*t;
+    y[31] += CONSTANT(-0.083004965974099995)*t;
+    y[33] += CONSTANT(-0.179311220383999990)*t;
+
+    // [5,11]: 2,12,14,30,32,
+    tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(-0.173617342585000000)*f[32];
+    tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(-0.173617342585000000)*g[32];
+    y[5] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[5]+tg*f[5];
+    t = f[5]*g[11]+f[11]*g[5];
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[14] += CONSTANT(-0.115164716491000000)*t;
+    y[30] += CONSTANT(-0.169433177294000010)*t;
+    y[32] += CONSTANT(-0.173617342585000000)*t;
+
+    // [5,14]: 9,1,27,29,
+    tf = CONSTANT(0.148677009677999990)*f[9]+CONSTANT(-0.184674390923000000)*f[1]+CONSTANT(0.179311220382000010)*f[27]+CONSTANT(0.083004965973399999)*f[29];
+    tg = CONSTANT(0.148677009677999990)*g[9]+CONSTANT(-0.184674390923000000)*g[1]+CONSTANT(0.179311220382000010)*g[27]+CONSTANT(0.083004965973399999)*g[29];
+    y[5] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[5]+tg*f[5];
+    t = f[5]*g[14]+f[14]*g[5];
+    y[9] += CONSTANT(0.148677009677999990)*t;
+    y[1] += CONSTANT(-0.184674390923000000)*t;
+    y[27] += CONSTANT(0.179311220382000010)*t;
+    y[29] += CONSTANT(0.083004965973399999)*t;
+
+    // [5,17]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24];
+    y[5] += tf*g[17]+tg*f[17];
+    y[17] = tf*g[5]+tg*f[5];
+    t = f[5]*g[17]+f[17]*g[5];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(-0.140463346189000000)*t;
+
+    // [5,18]: 7,21,23,
+    tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23];
+    tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23];
+    y[5] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[5]+tg*f[5];
+    t = f[5]*g[18]+f[18]*g[5];
+    y[7] += CONSTANT(0.180223751571000010)*t;
+    y[21] += CONSTANT(0.090297865407399994)*t;
+    y[23] += CONSTANT(-0.132725386549000010)*t;
+
+    // [5,19]: 6,8,20,22,
+    tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22];
+    tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22];
+    y[5] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[5]+tg*f[5];
+    t = f[5]*g[19]+f[19]*g[5];
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[8] += CONSTANT(0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[22] += CONSTANT(-0.090297865408399999)*t;
+
+    // [5,26]: 15,33,35,
+    tf = CONSTANT(0.155288072035000000)*f[15]+CONSTANT(0.138662534056999990)*f[33]+CONSTANT(-0.132882365179999990)*f[35];
+    tg = CONSTANT(0.155288072035000000)*g[15]+CONSTANT(0.138662534056999990)*g[33]+CONSTANT(-0.132882365179999990)*g[35];
+    y[5] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[5]+tg*f[5];
+    t = f[5]*g[26]+f[26]*g[5];
+    y[15] += CONSTANT(0.155288072035000000)*t;
+    y[33] += CONSTANT(0.138662534056999990)*t;
+    y[35] += CONSTANT(-0.132882365179999990)*t;
+
+    // [5,28]: 15,13,31,33,
+    tf = CONSTANT(0.044827805096399997)*f[15]+CONSTANT(0.173617342584000000)*f[13]+CONSTANT(0.074118242118699995)*f[31]+CONSTANT(-0.114366930522000000)*f[33];
+    tg = CONSTANT(0.044827805096399997)*g[15]+CONSTANT(0.173617342584000000)*g[13]+CONSTANT(0.074118242118699995)*g[31]+CONSTANT(-0.114366930522000000)*g[33];
+    y[5] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[5]+tg*f[5];
+    t = f[5]*g[28]+f[28]*g[5];
+    y[15] += CONSTANT(0.044827805096399997)*t;
+    y[13] += CONSTANT(0.173617342584000000)*t;
+    y[31] += CONSTANT(0.074118242118699995)*t;
+    y[33] += CONSTANT(-0.114366930522000000)*t;
+
+    // [5,29]: 12,30,32,
+    tf = CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.036165998945399999)*f[30]+CONSTANT(-0.074118242119099995)*f[32];
+    tg = CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.036165998945399999)*g[30]+CONSTANT(-0.074118242119099995)*g[32];
+    y[5] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[5]+tg*f[5];
+    t = f[5]*g[29]+f[29]*g[5];
+    y[12] += CONSTANT(0.214317900578999990)*t;
+    y[30] += CONSTANT(0.036165998945399999)*t;
+    y[32] += CONSTANT(-0.074118242119099995)*t;
+
+    // [5,32]: 9,27,
+    tf = CONSTANT(-0.044827805096799997)*f[9]+CONSTANT(0.114366930522000000)*f[27];
+    tg = CONSTANT(-0.044827805096799997)*g[9]+CONSTANT(0.114366930522000000)*g[27];
+    y[5] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[5]+tg*f[5];
+    t = f[5]*g[32]+f[32]*g[5];
+    y[9] += CONSTANT(-0.044827805096799997)*t;
+    y[27] += CONSTANT(0.114366930522000000)*t;
+
+    // [5,34]: 9,27,25,
+    tf = CONSTANT(-0.155288072036000010)*f[9]+CONSTANT(-0.138662534059000000)*f[27]+CONSTANT(0.132882365179000010)*f[25];
+    tg = CONSTANT(-0.155288072036000010)*g[9]+CONSTANT(-0.138662534059000000)*g[27]+CONSTANT(0.132882365179000010)*g[25];
+    y[5] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[5]+tg*f[5];
+    t = f[5]*g[34]+f[34]*g[5];
+    y[9] += CONSTANT(-0.155288072036000010)*t;
+    y[27] += CONSTANT(-0.138662534059000000)*t;
+    y[25] += CONSTANT(0.132882365179000010)*t;
+
+    // [6,6]: 0,6,20,
+    tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20];
+    tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20];
+    y[6] += tf*g[6]+tg*f[6];
+    t = f[6]*g[6];
+    y[0] += CONSTANT(0.282094797560000000)*t;
+    y[6] += CONSTANT(0.180223764527000010)*t;
+    y[20] += CONSTANT(0.241795553185999990)*t;
+
+    // [7,7]: 6,0,8,20,22,
+    tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22];
+    tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22];
+    y[7] += tf*g[7]+tg*f[7];
+    t = f[7]*g[7];
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(0.180223751574000000)*t;
+
+    // [7,10]: 9,1,11,27,29,
+    tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11]+CONSTANT(0.179311220383999990)*f[27]+CONSTANT(-0.083004965974099995)*f[29];
+    tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11]+CONSTANT(0.179311220383999990)*g[27]+CONSTANT(-0.083004965974099995)*g[29];
+    y[7] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[7]+tg*f[7];
+    t = f[7]*g[10]+f[10]*g[7];
+    y[9] += CONSTANT(0.148677009678999990)*t;
+    y[1] += CONSTANT(0.184674390919999990)*t;
+    y[11] += CONSTANT(0.115164716490000000)*t;
+    y[27] += CONSTANT(0.179311220383999990)*t;
+    y[29] += CONSTANT(-0.083004965974099995)*t;
+
+    // [7,13]: 12,2,14,30,32,
+    tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(0.173617342585000000)*f[32];
+    tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(0.173617342585000000)*g[32];
+    y[7] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[7]+tg*f[7];
+    t = f[7]*g[13]+f[13]*g[7];
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[14] += CONSTANT(0.115164716491000000)*t;
+    y[30] += CONSTANT(-0.169433177294000010)*t;
+    y[32] += CONSTANT(0.173617342585000000)*t;
+
+    // [7,14]: 3,15,31,33,
+    tf = CONSTANT(0.184674390923000000)*f[3]+CONSTANT(0.148677009677999990)*f[15]+CONSTANT(-0.083004965973399999)*f[31]+CONSTANT(0.179311220382000010)*f[33];
+    tg = CONSTANT(0.184674390923000000)*g[3]+CONSTANT(0.148677009677999990)*g[15]+CONSTANT(-0.083004965973399999)*g[31]+CONSTANT(0.179311220382000010)*g[33];
+    y[7] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[7]+tg*f[7];
+    t = f[7]*g[14]+f[14]*g[7];
+    y[3] += CONSTANT(0.184674390923000000)*t;
+    y[15] += CONSTANT(0.148677009677999990)*t;
+    y[31] += CONSTANT(-0.083004965973399999)*t;
+    y[33] += CONSTANT(0.179311220382000010)*t;
+
+    // [7,17]: 16,4,18,
+    tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18];
+    tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18];
+    y[7] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[7]+tg*f[7];
+    t = f[7]*g[17]+f[17]*g[7];
+    y[16] += CONSTANT(0.140463346187999990)*t;
+    y[4] += CONSTANT(0.168583882835000000)*t;
+    y[18] += CONSTANT(0.132725386549000010)*t;
+
+    // [7,21]: 8,20,6,22,
+    tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22];
+    tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22];
+    y[7] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[7]+tg*f[7];
+    t = f[7]*g[21]+f[21]*g[7];
+    y[8] += CONSTANT(-0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[22] += CONSTANT(0.090297865408399999)*t;
+
+    // [7,23]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24];
+    y[7] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[7]+tg*f[7];
+    t = f[7]*g[23]+f[23]*g[7];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(0.140463346189000000)*t;
+
+    // [7,26]: 9,25,27,
+    tf = CONSTANT(0.155288072035000000)*f[9]+CONSTANT(0.132882365179999990)*f[25]+CONSTANT(0.138662534056999990)*f[27];
+    tg = CONSTANT(0.155288072035000000)*g[9]+CONSTANT(0.132882365179999990)*g[25]+CONSTANT(0.138662534056999990)*g[27];
+    y[7] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[7]+tg*f[7];
+    t = f[7]*g[26]+f[26]*g[7];
+    y[9] += CONSTANT(0.155288072035000000)*t;
+    y[25] += CONSTANT(0.132882365179999990)*t;
+    y[27] += CONSTANT(0.138662534056999990)*t;
+
+    // [7,28]: 27,11,9,29,
+    tf = CONSTANT(0.114366930522000000)*f[27]+CONSTANT(0.173617342584000000)*f[11]+CONSTANT(-0.044827805096399997)*f[9]+CONSTANT(0.074118242118699995)*f[29];
+    tg = CONSTANT(0.114366930522000000)*g[27]+CONSTANT(0.173617342584000000)*g[11]+CONSTANT(-0.044827805096399997)*g[9]+CONSTANT(0.074118242118699995)*g[29];
+    y[7] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[7]+tg*f[7];
+    t = f[7]*g[28]+f[28]*g[7];
+    y[27] += CONSTANT(0.114366930522000000)*t;
+    y[11] += CONSTANT(0.173617342584000000)*t;
+    y[9] += CONSTANT(-0.044827805096399997)*t;
+    y[29] += CONSTANT(0.074118242118699995)*t;
+
+    // [7,31]: 30,12,32,
+    tf = CONSTANT(0.036165998945399999)*f[30]+CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.074118242119099995)*f[32];
+    tg = CONSTANT(0.036165998945399999)*g[30]+CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.074118242119099995)*g[32];
+    y[7] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[7]+tg*f[7];
+    t = f[7]*g[31]+f[31]*g[7];
+    y[30] += CONSTANT(0.036165998945399999)*t;
+    y[12] += CONSTANT(0.214317900578999990)*t;
+    y[32] += CONSTANT(0.074118242119099995)*t;
+
+    // [7,32]: 15,33,
+    tf = CONSTANT(-0.044827805096799997)*f[15]+CONSTANT(0.114366930522000000)*f[33];
+    tg = CONSTANT(-0.044827805096799997)*g[15]+CONSTANT(0.114366930522000000)*g[33];
+    y[7] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[7]+tg*f[7];
+    t = f[7]*g[32]+f[32]*g[7];
+    y[15] += CONSTANT(-0.044827805096799997)*t;
+    y[33] += CONSTANT(0.114366930522000000)*t;
+
+    // [7,34]: 15,33,35,
+    tf = CONSTANT(0.155288072036000010)*f[15]+CONSTANT(0.138662534059000000)*f[33]+CONSTANT(0.132882365179000010)*f[35];
+    tg = CONSTANT(0.155288072036000010)*g[15]+CONSTANT(0.138662534059000000)*g[33]+CONSTANT(0.132882365179000010)*g[35];
+    y[7] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[7]+tg*f[7];
+    t = f[7]*g[34]+f[34]*g[7];
+    y[15] += CONSTANT(0.155288072036000010)*t;
+    y[33] += CONSTANT(0.138662534059000000)*t;
+    y[35] += CONSTANT(0.132882365179000010)*t;
+
+    // [8,8]: 0,6,20,24,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24];
+    y[8] += tf*g[8]+tg*f[8];
+    t = f[8]*g[8];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+    y[20] += CONSTANT(0.040299255967500003)*t;
+    y[24] += CONSTANT(0.238413613505999990)*t;
+
+    // [8,9]: 1,11,25,29,
+    tf = CONSTANT(0.226179013155000000)*f[1]+CONSTANT(-0.094031597259499999)*f[11]+CONSTANT(0.245532000541000000)*f[25]+CONSTANT(0.016943317729199998)*f[29];
+    tg = CONSTANT(0.226179013155000000)*g[1]+CONSTANT(-0.094031597259499999)*g[11]+CONSTANT(0.245532000541000000)*g[25]+CONSTANT(0.016943317729199998)*g[29];
+    y[8] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[8]+tg*f[8];
+    t = f[8]*g[9]+f[9]*g[8];
+    y[1] += CONSTANT(0.226179013155000000)*t;
+    y[11] += CONSTANT(-0.094031597259499999)*t;
+    y[25] += CONSTANT(0.245532000541000000)*t;
+    y[29] += CONSTANT(0.016943317729199998)*t;
+
+    // [8,14]: 2,12,30,34,
+    tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(0.190188269816000010)*f[34];
+    tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(0.190188269816000010)*g[34];
+    y[8] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[8]+tg*f[8];
+    t = f[8]*g[14]+f[14]*g[8];
+    y[2] += CONSTANT(0.184674390919999990)*t;
+    y[12] += CONSTANT(-0.188063194517999990)*t;
+    y[30] += CONSTANT(0.053579475144400000)*t;
+    y[34] += CONSTANT(0.190188269816000010)*t;
+
+    // [8,15]: 13,3,31,35,
+    tf = CONSTANT(-0.094031597259499999)*f[13]+CONSTANT(0.226179013155000000)*f[3]+CONSTANT(0.016943317729199998)*f[31]+CONSTANT(0.245532000541000000)*f[35];
+    tg = CONSTANT(-0.094031597259499999)*g[13]+CONSTANT(0.226179013155000000)*g[3]+CONSTANT(0.016943317729199998)*g[31]+CONSTANT(0.245532000541000000)*g[35];
+    y[8] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[8]+tg*f[8];
+    t = f[8]*g[15]+f[15]*g[8];
+    y[13] += CONSTANT(-0.094031597259499999)*t;
+    y[3] += CONSTANT(0.226179013155000000)*t;
+    y[31] += CONSTANT(0.016943317729199998)*t;
+    y[35] += CONSTANT(0.245532000541000000)*t;
+
+    // [8,22]: 6,20,24,
+    tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24];
+    tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24];
+    y[8] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[8]+tg*f[8];
+    t = f[8]*g[22]+f[22]*g[8];
+    y[6] += CONSTANT(0.156078347226000000)*t;
+    y[20] += CONSTANT(-0.190364615029000010)*t;
+    y[24] += CONSTANT(-0.075080816691500005)*t;
+
+    // [8,26]: 10,28,
+    tf = CONSTANT(0.190188269806999990)*f[10]+CONSTANT(-0.097043558542400002)*f[28];
+    tg = CONSTANT(0.190188269806999990)*g[10]+CONSTANT(-0.097043558542400002)*g[28];
+    y[8] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[8]+tg*f[8];
+    t = f[8]*g[26]+f[26]*g[8];
+    y[10] += CONSTANT(0.190188269806999990)*t;
+    y[28] += CONSTANT(-0.097043558542400002)*t;
+
+    // [8,27]: 25,11,29,
+    tf = CONSTANT(-0.062641347680800000)*f[25]+CONSTANT(0.141757966609000000)*f[11]+CONSTANT(-0.121034582550000010)*f[29];
+    tg = CONSTANT(-0.062641347680800000)*g[25]+CONSTANT(0.141757966609000000)*g[11]+CONSTANT(-0.121034582550000010)*g[29];
+    y[8] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[8]+tg*f[8];
+    t = f[8]*g[27]+f[27]*g[8];
+    y[25] += CONSTANT(-0.062641347680800000)*t;
+    y[11] += CONSTANT(0.141757966609000000)*t;
+    y[29] += CONSTANT(-0.121034582550000010)*t;
+
+    // [8,32]: 30,12,34,
+    tf = CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.097043558538899996)*f[34];
+    tg = CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.097043558538899996)*g[34];
+    y[8] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[8]+tg*f[8];
+    t = f[8]*g[32]+f[32]*g[8];
+    y[30] += CONSTANT(-0.191372478254000000)*t;
+    y[12] += CONSTANT(0.141757966609000000)*t;
+    y[34] += CONSTANT(-0.097043558538899996)*t;
+
+    // [8,33]: 13,31,35,
+    tf = CONSTANT(0.141757966609000000)*f[13]+CONSTANT(-0.121034582550000010)*f[31]+CONSTANT(-0.062641347680800000)*f[35];
+    tg = CONSTANT(0.141757966609000000)*g[13]+CONSTANT(-0.121034582550000010)*g[31]+CONSTANT(-0.062641347680800000)*g[35];
+    y[8] += tf*g[33]+tg*f[33];
+    y[33] += tf*g[8]+tg*f[8];
+    t = f[8]*g[33]+f[33]*g[8];
+    y[13] += CONSTANT(0.141757966609000000)*t;
+    y[31] += CONSTANT(-0.121034582550000010)*t;
+    y[35] += CONSTANT(-0.062641347680800000)*t;
+
+    // [9,9]: 6,0,20,
+    tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20];
+    y[9] += tf*g[9]+tg*f[9];
+    t = f[9]*g[9];
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [9,17]: 2,12,30,
+    tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.098140130728100003)*f[30];
+    tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.098140130728100003)*g[30];
+    y[9] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[9]+tg*f[9];
+    t = f[9]*g[17]+f[17]*g[9];
+    y[2] += CONSTANT(0.162867503964999990)*t;
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+    y[30] += CONSTANT(0.098140130728100003)*t;
+
+    // [9,18]: 3,13,31,35,
+    tf = CONSTANT(-0.043528171377799997)*f[3]+CONSTANT(0.133255230519000010)*f[13]+CONSTANT(-0.101584686310000010)*f[31]+CONSTANT(0.098140130731999994)*f[35];
+    tg = CONSTANT(-0.043528171377799997)*g[3]+CONSTANT(0.133255230519000010)*g[13]+CONSTANT(-0.101584686310000010)*g[31]+CONSTANT(0.098140130731999994)*g[35];
+    y[9] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[9]+tg*f[9];
+    t = f[9]*g[18]+f[18]*g[9];
+    y[3] += CONSTANT(-0.043528171377799997)*t;
+    y[13] += CONSTANT(0.133255230519000010)*t;
+    y[31] += CONSTANT(-0.101584686310000010)*t;
+    y[35] += CONSTANT(0.098140130731999994)*t;
+
+    // [9,19]: 14,32,34,
+    tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(0.131668802180999990)*f[34];
+    tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(0.131668802180999990)*g[34];
+    y[9] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[9]+tg*f[9];
+    t = f[9]*g[19]+f[19]*g[9];
+    y[14] += CONSTANT(-0.099322584600699995)*t;
+    y[32] += CONSTANT(0.126698363970000010)*t;
+    y[34] += CONSTANT(0.131668802180999990)*t;
+
+    // [9,22]: 1,11,25,29,
+    tf = CONSTANT(-0.043528171378199997)*f[1]+CONSTANT(0.133255230518000010)*f[11]+CONSTANT(-0.098140130732499997)*f[25]+CONSTANT(-0.101584686311000000)*f[29];
+    tg = CONSTANT(-0.043528171378199997)*g[1]+CONSTANT(0.133255230518000010)*g[11]+CONSTANT(-0.098140130732499997)*g[25]+CONSTANT(-0.101584686311000000)*g[29];
+    y[9] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[9]+tg*f[9];
+    t = f[9]*g[22]+f[22]*g[9];
+    y[1] += CONSTANT(-0.043528171378199997)*t;
+    y[11] += CONSTANT(0.133255230518000010)*t;
+    y[25] += CONSTANT(-0.098140130732499997)*t;
+    y[29] += CONSTANT(-0.101584686311000000)*t;
+
+    // [9,27]: 6,20,
+    tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20];
+    tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20];
+    y[9] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[9]+tg*f[9];
+    t = f[9]*g[27]+f[27]*g[9];
+    y[6] += CONSTANT(0.126792179874999990)*t;
+    y[20] += CONSTANT(-0.196280261464999990)*t;
+
+    // [10,10]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24];
+    y[10] += tf*g[10]+tg*f[10];
+    t = f[10]*g[10];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(-0.151717754049000010)*t;
+
+    // [10,16]: 14,32,
+    tf = CONSTANT(0.151717754044999990)*f[14]+CONSTANT(-0.077413979111300005)*f[32];
+    tg = CONSTANT(0.151717754044999990)*g[14]+CONSTANT(-0.077413979111300005)*g[32];
+    y[10] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[10]+tg*f[10];
+    t = f[10]*g[16]+f[16]*g[10];
+    y[14] += CONSTANT(0.151717754044999990)*t;
+    y[32] += CONSTANT(-0.077413979111300005)*t;
+
+    // [10,17]: 13,3,31,35,
+    tf = CONSTANT(0.067850242288900006)*f[13]+CONSTANT(0.199471140200000010)*f[3]+CONSTANT(-0.113793659091000000)*f[31]+CONSTANT(-0.149911525925999990)*f[35];
+    tg = CONSTANT(0.067850242288900006)*g[13]+CONSTANT(0.199471140200000010)*g[3]+CONSTANT(-0.113793659091000000)*g[31]+CONSTANT(-0.149911525925999990)*g[35];
+    y[10] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[10]+tg*f[10];
+    t = f[10]*g[17]+f[17]*g[10];
+    y[13] += CONSTANT(0.067850242288900006)*t;
+    y[3] += CONSTANT(0.199471140200000010)*t;
+    y[31] += CONSTANT(-0.113793659091000000)*t;
+    y[35] += CONSTANT(-0.149911525925999990)*t;
+
+    // [10,18]: 12,2,30,34,
+    tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(-0.101358691177000000)*f[34];
+    tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(-0.101358691177000000)*g[34];
+    y[10] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[10]+tg*f[10];
+    t = f[10]*g[18]+f[18]*g[10];
+    y[12] += CONSTANT(-0.044418410173299998)*t;
+    y[2] += CONSTANT(0.213243618621000000)*t;
+    y[30] += CONSTANT(-0.171327458205000000)*t;
+    y[34] += CONSTANT(-0.101358691177000000)*t;
+
+    // [10,19]: 3,15,13,31,33,
+    tf = CONSTANT(-0.075393004386799994)*f[3]+CONSTANT(0.099322584599600000)*f[15]+CONSTANT(0.102579924281000000)*f[13]+CONSTANT(0.097749909976500002)*f[31]+CONSTANT(-0.025339672794100002)*f[33];
+    tg = CONSTANT(-0.075393004386799994)*g[3]+CONSTANT(0.099322584599600000)*g[15]+CONSTANT(0.102579924281000000)*g[13]+CONSTANT(0.097749909976500002)*g[31]+CONSTANT(-0.025339672794100002)*g[33];
+    y[10] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[10]+tg*f[10];
+    t = f[10]*g[19]+f[19]*g[10];
+    y[3] += CONSTANT(-0.075393004386799994)*t;
+    y[15] += CONSTANT(0.099322584599600000)*t;
+    y[13] += CONSTANT(0.102579924281000000)*t;
+    y[31] += CONSTANT(0.097749909976500002)*t;
+    y[33] += CONSTANT(-0.025339672794100002)*t;
+
+    // [10,21]: 11,1,9,27,29,
+    tf = CONSTANT(0.102579924281000000)*f[11]+CONSTANT(-0.075393004386799994)*f[1]+CONSTANT(-0.099322584599600000)*f[9]+CONSTANT(0.025339672794100002)*f[27]+CONSTANT(0.097749909976500002)*f[29];
+    tg = CONSTANT(0.102579924281000000)*g[11]+CONSTANT(-0.075393004386799994)*g[1]+CONSTANT(-0.099322584599600000)*g[9]+CONSTANT(0.025339672794100002)*g[27]+CONSTANT(0.097749909976500002)*g[29];
+    y[10] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[10]+tg*f[10];
+    t = f[10]*g[21]+f[21]*g[10];
+    y[11] += CONSTANT(0.102579924281000000)*t;
+    y[1] += CONSTANT(-0.075393004386799994)*t;
+    y[9] += CONSTANT(-0.099322584599600000)*t;
+    y[27] += CONSTANT(0.025339672794100002)*t;
+    y[29] += CONSTANT(0.097749909976500002)*t;
+
+    // [10,23]: 11,1,25,29,
+    tf = CONSTANT(-0.067850242288900006)*f[11]+CONSTANT(-0.199471140200000010)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(0.113793659091000000)*f[29];
+    tg = CONSTANT(-0.067850242288900006)*g[11]+CONSTANT(-0.199471140200000010)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(0.113793659091000000)*g[29];
+    y[10] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[10]+tg*f[10];
+    t = f[10]*g[23]+f[23]*g[10];
+    y[11] += CONSTANT(-0.067850242288900006)*t;
+    y[1] += CONSTANT(-0.199471140200000010)*t;
+    y[25] += CONSTANT(0.149911525925999990)*t;
+    y[29] += CONSTANT(0.113793659091000000)*t;
+
+    // [10,28]: 6,20,24,
+    tf = CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.077413979109600004)*f[24];
+    tg = CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.077413979109600004)*g[24];
+    y[10] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[10]+tg*f[10];
+    t = f[10]*g[28]+f[28]*g[10];
+    y[6] += CONSTANT(0.190188269814000000)*t;
+    y[20] += CONSTANT(-0.065426753820500005)*t;
+    y[24] += CONSTANT(0.077413979109600004)*t;
+
+    // [11,11]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22];
+    y[11] += tf*g[11]+tg*f[11];
+    t = f[11]*g[11];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[8] += CONSTANT(-0.145673124078999990)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(-0.114687841910000000)*t;
+
+    // [11,16]: 15,33,35,
+    tf = CONSTANT(-0.117520066953000000)*f[15]+CONSTANT(0.119929220739999990)*f[33]+CONSTANT(0.134084945035999990)*f[35];
+    tg = CONSTANT(-0.117520066953000000)*g[15]+CONSTANT(0.119929220739999990)*g[33]+CONSTANT(0.134084945035999990)*g[35];
+    y[11] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[11]+tg*f[11];
+    t = f[11]*g[16]+f[16]*g[11];
+    y[15] += CONSTANT(-0.117520066953000000)*t;
+    y[33] += CONSTANT(0.119929220739999990)*t;
+    y[35] += CONSTANT(0.134084945035999990)*t;
+
+    // [11,18]: 3,13,15,31,33,
+    tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15]+CONSTANT(0.075189952564900006)*f[31]+CONSTANT(-0.101990215611000000)*f[33];
+    tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15]+CONSTANT(0.075189952564900006)*g[31]+CONSTANT(-0.101990215611000000)*g[33];
+    y[11] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[11]+tg*f[11];
+    t = f[11]*g[18]+f[18]*g[11];
+    y[3] += CONSTANT(0.168583882834000000)*t;
+    y[13] += CONSTANT(0.114687841909000000)*t;
+    y[15] += CONSTANT(-0.133255230519000010)*t;
+    y[31] += CONSTANT(0.075189952564900006)*t;
+    y[33] += CONSTANT(-0.101990215611000000)*t;
+
+    // [11,19]: 2,14,12,30,32,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(-0.104682806112000000)*f[32];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(-0.104682806112000000)*g[32];
+    y[11] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[11]+tg*f[11];
+    t = f[11]*g[19]+f[19]*g[11];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[14] += CONSTANT(-0.102579924282000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+    y[30] += CONSTANT(0.009577496073830001)*t;
+    y[32] += CONSTANT(-0.104682806112000000)*t;
+
+    // [11,24]: 9,25,27,
+    tf = CONSTANT(0.117520066950999990)*f[9]+CONSTANT(-0.134084945037000000)*f[25]+CONSTANT(-0.119929220742000010)*f[27];
+    tg = CONSTANT(0.117520066950999990)*g[9]+CONSTANT(-0.134084945037000000)*g[25]+CONSTANT(-0.119929220742000010)*g[27];
+    y[11] += tf*g[24]+tg*f[24];
+    y[24] += tf*g[11]+tg*f[11];
+    t = f[11]*g[24]+f[24]*g[11];
+    y[9] += CONSTANT(0.117520066950999990)*t;
+    y[25] += CONSTANT(-0.134084945037000000)*t;
+    y[27] += CONSTANT(-0.119929220742000010)*t;
+
+    // [11,29]: 6,20,22,8,
+    tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.075189952565200002)*f[22]+CONSTANT(0.065621187395299999)*f[8];
+    tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.075189952565200002)*g[22]+CONSTANT(0.065621187395299999)*g[8];
+    y[11] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[11]+tg*f[11];
+    t = f[11]*g[29]+f[29]*g[11];
+    y[6] += CONSTANT(0.227318461243000010)*t;
+    y[20] += CONSTANT(0.086019920779800002)*t;
+    y[22] += CONSTANT(-0.075189952565200002)*t;
+    y[8] += CONSTANT(0.065621187395299999)*t;
+
+    // [12,12]: 0,6,20,
+    tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20];
+    tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20];
+    y[12] += tf*g[12]+tg*f[12];
+    t = f[12]*g[12];
+    y[0] += CONSTANT(0.282094799871999980)*t;
+    y[6] += CONSTANT(0.168208852954000010)*t;
+    y[20] += CONSTANT(0.153869910786000010)*t;
+
+    // [12,30]: 20,6,
+    tf = CONSTANT(0.148373961712999990)*f[20]+CONSTANT(0.239614719999000000)*f[6];
+    tg = CONSTANT(0.148373961712999990)*g[20]+CONSTANT(0.239614719999000000)*g[6];
+    y[12] += tf*g[30]+tg*f[30];
+    y[30] += tf*g[12]+tg*f[12];
+    t = f[12]*g[30]+f[30]*g[12];
+    y[20] += CONSTANT(0.148373961712999990)*t;
+    y[6] += CONSTANT(0.239614719999000000)*t;
+
+    // [13,13]: 0,8,6,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22];
+    y[13] += tf*g[13]+tg*f[13];
+    t = f[13]*g[13];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.145673124078999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(0.114687841910000000)*t;
+
+    // [13,16]: 9,25,27,
+    tf = CONSTANT(-0.117520066953000000)*f[9]+CONSTANT(-0.134084945035999990)*f[25]+CONSTANT(0.119929220739999990)*f[27];
+    tg = CONSTANT(-0.117520066953000000)*g[9]+CONSTANT(-0.134084945035999990)*g[25]+CONSTANT(0.119929220739999990)*g[27];
+    y[13] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[13]+tg*f[13];
+    t = f[13]*g[16]+f[16]*g[13];
+    y[9] += CONSTANT(-0.117520066953000000)*t;
+    y[25] += CONSTANT(-0.134084945035999990)*t;
+    y[27] += CONSTANT(0.119929220739999990)*t;
+
+    // [13,21]: 2,12,14,30,32,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(0.104682806112000000)*f[32];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(0.104682806112000000)*g[32];
+    y[13] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[13]+tg*f[13];
+    t = f[13]*g[21]+f[21]*g[13];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+    y[14] += CONSTANT(0.102579924282000000)*t;
+    y[30] += CONSTANT(0.009577496073830001)*t;
+    y[32] += CONSTANT(0.104682806112000000)*t;
+
+    // [13,24]: 15,33,35,
+    tf = CONSTANT(-0.117520066950999990)*f[15]+CONSTANT(0.119929220742000010)*f[33]+CONSTANT(-0.134084945037000000)*f[35];
+    tg = CONSTANT(-0.117520066950999990)*g[15]+CONSTANT(0.119929220742000010)*g[33]+CONSTANT(-0.134084945037000000)*g[35];
+    y[13] += tf*g[24]+tg*f[24];
+    y[24] += tf*g[13]+tg*f[13];
+    t = f[13]*g[24]+f[24]*g[13];
+    y[15] += CONSTANT(-0.117520066950999990)*t;
+    y[33] += CONSTANT(0.119929220742000010)*t;
+    y[35] += CONSTANT(-0.134084945037000000)*t;
+
+    // [13,31]: 6,22,20,8,
+    tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.075189952565200002)*f[22]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.065621187395299999)*f[8];
+    tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.075189952565200002)*g[22]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.065621187395299999)*g[8];
+    y[13] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[13]+tg*f[13];
+    t = f[13]*g[31]+f[31]*g[13];
+    y[6] += CONSTANT(0.227318461243000010)*t;
+    y[22] += CONSTANT(0.075189952565200002)*t;
+    y[20] += CONSTANT(0.086019920779800002)*t;
+    y[8] += CONSTANT(-0.065621187395299999)*t;
+
+    // [14,14]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24];
+    y[14] += tf*g[14]+tg*f[14];
+    t = f[14]*g[14];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(0.151717754049000010)*t;
+
+    // [14,17]: 11,1,25,29,
+    tf = CONSTANT(0.067850242288500007)*f[11]+CONSTANT(0.199471140196999990)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(-0.113793659092000000)*f[29];
+    tg = CONSTANT(0.067850242288500007)*g[11]+CONSTANT(0.199471140196999990)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(-0.113793659092000000)*g[29];
+    y[14] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[14]+tg*f[14];
+    t = f[14]*g[17]+f[17]*g[14];
+    y[11] += CONSTANT(0.067850242288500007)*t;
+    y[1] += CONSTANT(0.199471140196999990)*t;
+    y[25] += CONSTANT(0.149911525925999990)*t;
+    y[29] += CONSTANT(-0.113793659092000000)*t;
+
+    // [14,22]: 12,2,30,34,
+    tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(0.101358691177000000)*f[34];
+    tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(0.101358691177000000)*g[34];
+    y[14] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[14]+tg*f[14];
+    t = f[14]*g[22]+f[22]*g[14];
+    y[12] += CONSTANT(-0.044418410173299998)*t;
+    y[2] += CONSTANT(0.213243618621000000)*t;
+    y[30] += CONSTANT(-0.171327458205000000)*t;
+    y[34] += CONSTANT(0.101358691177000000)*t;
+
+    // [14,23]: 13,3,31,35,
+    tf = CONSTANT(0.067850242288500007)*f[13]+CONSTANT(0.199471140196999990)*f[3]+CONSTANT(-0.113793659092000000)*f[31]+CONSTANT(0.149911525925999990)*f[35];
+    tg = CONSTANT(0.067850242288500007)*g[13]+CONSTANT(0.199471140196999990)*g[3]+CONSTANT(-0.113793659092000000)*g[31]+CONSTANT(0.149911525925999990)*g[35];
+    y[14] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[14]+tg*f[14];
+    t = f[14]*g[23]+f[23]*g[14];
+    y[13] += CONSTANT(0.067850242288500007)*t;
+    y[3] += CONSTANT(0.199471140196999990)*t;
+    y[31] += CONSTANT(-0.113793659092000000)*t;
+    y[35] += CONSTANT(0.149911525925999990)*t;
+
+    // [14,32]: 20,6,24,
+    tf = CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.077413979109600004)*f[24];
+    tg = CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.077413979109600004)*g[24];
+    y[14] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[14]+tg*f[14];
+    t = f[14]*g[32]+f[32]*g[14];
+    y[20] += CONSTANT(-0.065426753820500005)*t;
+    y[6] += CONSTANT(0.190188269814000000)*t;
+    y[24] += CONSTANT(-0.077413979109600004)*t;
+
+    // [15,15]: 0,6,20,
+    tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20];
+    y[15] += tf*g[15]+tg*f[15];
+    t = f[15]*g[15];
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [15,21]: 14,32,34,
+    tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(-0.131668802180999990)*f[34];
+    tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(-0.131668802180999990)*g[34];
+    y[15] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[15]+tg*f[15];
+    t = f[15]*g[21]+f[21]*g[15];
+    y[14] += CONSTANT(-0.099322584600699995)*t;
+    y[32] += CONSTANT(0.126698363970000010)*t;
+    y[34] += CONSTANT(-0.131668802180999990)*t;
+
+    // [15,22]: 13,3,31,35,
+    tf = CONSTANT(0.133255230518000010)*f[13]+CONSTANT(-0.043528171378199997)*f[3]+CONSTANT(-0.101584686311000000)*f[31]+CONSTANT(-0.098140130732499997)*f[35];
+    tg = CONSTANT(0.133255230518000010)*g[13]+CONSTANT(-0.043528171378199997)*g[3]+CONSTANT(-0.101584686311000000)*g[31]+CONSTANT(-0.098140130732499997)*g[35];
+    y[15] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[15]+tg*f[15];
+    t = f[15]*g[22]+f[22]*g[15];
+    y[13] += CONSTANT(0.133255230518000010)*t;
+    y[3] += CONSTANT(-0.043528171378199997)*t;
+    y[31] += CONSTANT(-0.101584686311000000)*t;
+    y[35] += CONSTANT(-0.098140130732499997)*t;
+
+    // [15,23]: 12,2,30,
+    tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2]+CONSTANT(0.098140130728100003)*f[30];
+    tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2]+CONSTANT(0.098140130728100003)*g[30];
+    y[15] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[15]+tg*f[15];
+    t = f[15]*g[23]+f[23]*g[15];
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+    y[2] += CONSTANT(0.162867503964999990)*t;
+    y[30] += CONSTANT(0.098140130728100003)*t;
+
+    // [15,33]: 6,20,
+    tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20];
+    tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20];
+    y[15] += tf*g[33]+tg*f[33];
+    y[33] += tf*g[15]+tg*f[15];
+    t = f[15]*g[33]+f[33]*g[15];
+    y[6] += CONSTANT(0.126792179874999990)*t;
+    y[20] += CONSTANT(-0.196280261464999990)*t;
+
+    // [16,16]: 0,6,20,
+    tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20];
+    tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20];
+    y[16] += tf*g[16]+tg*f[16];
+    t = f[16]*g[16];
+    y[0] += CONSTANT(0.282094791763999990)*t;
+    y[6] += CONSTANT(-0.229375683829000000)*t;
+    y[20] += CONSTANT(0.106525305981000000)*t;
+
+    // [16,18]: 8,22,
+    tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22];
+    tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22];
+    y[16] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[16]+tg*f[16];
+    t = f[16]*g[18]+f[18]*g[16];
+    y[8] += CONSTANT(-0.075080816693699995)*t;
+    y[22] += CONSTANT(0.135045473380000000)*t;
+
+    // [16,23]: 19,5,
+    tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5];
+    tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5];
+    y[16] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[16]+tg*f[16];
+    t = f[16]*g[23]+f[23]*g[16];
+    y[19] += CONSTANT(-0.119098912754999990)*t;
+    y[5] += CONSTANT(0.140463346187999990)*t;
+
+    // [16,26]: 12,2,30,
+    tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.147319200325000010)*f[2]+CONSTANT(0.130197596199999990)*f[30];
+    tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.147319200325000010)*g[2]+CONSTANT(0.130197596199999990)*g[30];
+    y[16] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[16]+tg*f[16];
+    t = f[16]*g[26]+f[26]*g[16];
+    y[12] += CONSTANT(-0.207723503645000000)*t;
+    y[2] += CONSTANT(0.147319200325000010)*t;
+    y[30] += CONSTANT(0.130197596199999990)*t;
+
+    // [16,28]: 14,32,
+    tf = CONSTANT(-0.077413979111300005)*f[14]+CONSTANT(0.128376561115000010)*f[32];
+    tg = CONSTANT(-0.077413979111300005)*g[14]+CONSTANT(0.128376561115000010)*g[32];
+    y[16] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[16]+tg*f[16];
+    t = f[16]*g[28]+f[28]*g[16];
+    y[14] += CONSTANT(-0.077413979111300005)*t;
+    y[32] += CONSTANT(0.128376561115000010)*t;
+
+    // [16,29]: 15,33,35,
+    tf = CONSTANT(0.035835708931099997)*f[15]+CONSTANT(-0.118853600623999990)*f[33]+CONSTANT(-0.053152946071899999)*f[35];
+    tg = CONSTANT(0.035835708931099997)*g[15]+CONSTANT(-0.118853600623999990)*g[33]+CONSTANT(-0.053152946071899999)*g[35];
+    y[16] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[16]+tg*f[16];
+    t = f[16]*g[29]+f[29]*g[16];
+    y[15] += CONSTANT(0.035835708931099997)*t;
+    y[33] += CONSTANT(-0.118853600623999990)*t;
+    y[35] += CONSTANT(-0.053152946071899999)*t;
+
+    // [16,31]: 27,9,25,
+    tf = CONSTANT(-0.118853600623999990)*f[27]+CONSTANT(0.035835708931099997)*f[9]+CONSTANT(0.053152946071899999)*f[25];
+    tg = CONSTANT(-0.118853600623999990)*g[27]+CONSTANT(0.035835708931099997)*g[9]+CONSTANT(0.053152946071899999)*g[25];
+    y[16] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[16]+tg*f[16];
+    t = f[16]*g[31]+f[31]*g[16];
+    y[27] += CONSTANT(-0.118853600623999990)*t;
+    y[9] += CONSTANT(0.035835708931099997)*t;
+    y[25] += CONSTANT(0.053152946071899999)*t;
+
+    // [17,17]: 0,6,20,
+    tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20];
+    tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20];
+    y[17] += tf*g[17]+tg*f[17];
+    t = f[17]*g[17];
+    y[0] += CONSTANT(0.282094791768999990)*t;
+    y[6] += CONSTANT(-0.057343920955899998)*t;
+    y[20] += CONSTANT(-0.159787958979000000)*t;
+
+    // [17,19]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24];
+    y[17] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[17]+tg*f[17];
+    t = f[17]*g[19]+f[19]*g[17];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(0.119098912753000000)*t;
+
+    // [17,21]: 16,4,18,
+    tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18];
+    tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18];
+    y[17] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[17]+tg*f[17];
+    t = f[17]*g[21]+f[21]*g[17];
+    y[16] += CONSTANT(-0.119098912754999990)*t;
+    y[4] += CONSTANT(-0.112621225039000000)*t;
+    y[18] += CONSTANT(0.045015157794399997)*t;
+
+    // [17,26]: 3,13,31,
+    tf = CONSTANT(0.208340811096000000)*f[3]+CONSTANT(0.029982305185199998)*f[13]+CONSTANT(-0.118853600623999990)*f[31];
+    tg = CONSTANT(0.208340811096000000)*g[3]+CONSTANT(0.029982305185199998)*g[13]+CONSTANT(-0.118853600623999990)*g[31];
+    y[17] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[17]+tg*f[17];
+    t = f[17]*g[26]+f[26]*g[17];
+    y[3] += CONSTANT(0.208340811096000000)*t;
+    y[13] += CONSTANT(0.029982305185199998)*t;
+    y[31] += CONSTANT(-0.118853600623999990)*t;
+
+    // [17,27]: 12,2,30,
+    tf = CONSTANT(-0.103861751821000010)*f[12]+CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30];
+    tg = CONSTANT(-0.103861751821000010)*g[12]+CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30];
+    y[17] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[17]+tg*f[17];
+    t = f[17]*g[27]+f[27]*g[17];
+    y[12] += CONSTANT(-0.103861751821000010)*t;
+    y[2] += CONSTANT(0.196425600433000000)*t;
+    y[30] += CONSTANT(-0.130197596204999990)*t;
+
+    // [17,28]: 13,3,31,35,
+    tf = CONSTANT(0.121172043789000000)*f[13]+CONSTANT(-0.060142811686500000)*f[3]+CONSTANT(0.034310079156700000)*f[31]+CONSTANT(0.099440056652200001)*f[35];
+    tg = CONSTANT(0.121172043789000000)*g[13]+CONSTANT(-0.060142811686500000)*g[3]+CONSTANT(0.034310079156700000)*g[31]+CONSTANT(0.099440056652200001)*g[35];
+    y[17] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[17]+tg*f[17];
+    t = f[17]*g[28]+f[28]*g[17];
+    y[13] += CONSTANT(0.121172043789000000)*t;
+    y[3] += CONSTANT(-0.060142811686500000)*t;
+    y[31] += CONSTANT(0.034310079156700000)*t;
+    y[35] += CONSTANT(0.099440056652200001)*t;
+
+    // [17,32]: 11,1,25,29,
+    tf = CONSTANT(0.121172043788000010)*f[11]+CONSTANT(-0.060142811686900000)*f[1]+CONSTANT(-0.099440056652700004)*f[25]+CONSTANT(0.034310079156599997)*f[29];
+    tg = CONSTANT(0.121172043788000010)*g[11]+CONSTANT(-0.060142811686900000)*g[1]+CONSTANT(-0.099440056652700004)*g[25]+CONSTANT(0.034310079156599997)*g[29];
+    y[17] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[17]+tg*f[17];
+    t = f[17]*g[32]+f[32]*g[17];
+    y[11] += CONSTANT(0.121172043788000010)*t;
+    y[1] += CONSTANT(-0.060142811686900000)*t;
+    y[25] += CONSTANT(-0.099440056652700004)*t;
+    y[29] += CONSTANT(0.034310079156599997)*t;
+
+    // [17,34]: 29,11,1,
+    tf = CONSTANT(0.118853600623000000)*f[29]+CONSTANT(-0.029982305185400002)*f[11]+CONSTANT(-0.208340811100000000)*f[1];
+    tg = CONSTANT(0.118853600623000000)*g[29]+CONSTANT(-0.029982305185400002)*g[11]+CONSTANT(-0.208340811100000000)*g[1];
+    y[17] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[17]+tg*f[17];
+    t = f[17]*g[34]+f[34]*g[17];
+    y[29] += CONSTANT(0.118853600623000000)*t;
+    y[11] += CONSTANT(-0.029982305185400002)*t;
+    y[1] += CONSTANT(-0.208340811100000000)*t;
+
+    // [18,18]: 6,0,20,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24];
+    y[18] += tf*g[18]+tg*f[18];
+    t = f[18]*g[18];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[24] += CONSTANT(-0.135045473384000000)*t;
+
+    // [18,19]: 7,21,23,
+    tf = CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23];
+    tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23];
+    y[18] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[18]+tg*f[18];
+    t = f[18]*g[19]+f[19]*g[18];
+    y[7] += CONSTANT(0.090297865407399994)*t;
+    y[21] += CONSTANT(0.102084782359000000)*t;
+    y[23] += CONSTANT(-0.045015157794399997)*t;
+
+    // [18,25]: 15,33,
+    tf = CONSTANT(-0.098140130731999994)*f[15]+CONSTANT(0.130197596202000000)*f[33];
+    tg = CONSTANT(-0.098140130731999994)*g[15]+CONSTANT(0.130197596202000000)*g[33];
+    y[18] += tf*g[25]+tg*f[25];
+    y[25] += tf*g[18]+tg*f[18];
+    t = f[18]*g[25]+f[25]*g[18];
+    y[15] += CONSTANT(-0.098140130731999994)*t;
+    y[33] += CONSTANT(0.130197596202000000)*t;
+
+    // [18,26]: 14,32,
+    tf = CONSTANT(0.101358691174000000)*f[14]+CONSTANT(0.084042186965900004)*f[32];
+    tg = CONSTANT(0.101358691174000000)*g[14]+CONSTANT(0.084042186965900004)*g[32];
+    y[18] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[18]+tg*f[18];
+    t = f[18]*g[26]+f[26]*g[18];
+    y[14] += CONSTANT(0.101358691174000000)*t;
+    y[32] += CONSTANT(0.084042186965900004)*t;
+
+    // [18,27]: 13,3,35,
+    tf = CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.183739324705999990)*f[3]+CONSTANT(-0.130197596202000000)*f[35];
+    tg = CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.183739324705999990)*g[3]+CONSTANT(-0.130197596202000000)*g[35];
+    y[18] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[18]+tg*f[18];
+    t = f[18]*g[27]+f[27]*g[18];
+    y[13] += CONSTANT(0.101990215611000000)*t;
+    y[3] += CONSTANT(0.183739324705999990)*t;
+    y[35] += CONSTANT(-0.130197596202000000)*t;
+
+    // [18,28]: 2,12,30,34,
+    tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(-0.084042186968800003)*f[34];
+    tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(-0.084042186968800003)*g[34];
+    y[18] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[18]+tg*f[18];
+    t = f[18]*g[28]+f[28]*g[18];
+    y[2] += CONSTANT(0.225033795606000010)*t;
+    y[12] += CONSTANT(0.022664492358099999)*t;
+    y[30] += CONSTANT(-0.099440056651100006)*t;
+    y[34] += CONSTANT(-0.084042186968800003)*t;
+
+    // [18,29]: 3,13,15,31,
+    tf = CONSTANT(-0.085054779966799998)*f[3]+CONSTANT(0.075189952564900006)*f[13]+CONSTANT(0.101584686310000010)*f[15]+CONSTANT(0.097043558538999999)*f[31];
+    tg = CONSTANT(-0.085054779966799998)*g[3]+CONSTANT(0.075189952564900006)*g[13]+CONSTANT(0.101584686310000010)*g[15]+CONSTANT(0.097043558538999999)*g[31];
+    y[18] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[18]+tg*f[18];
+    t = f[18]*g[29]+f[29]*g[18];
+    y[3] += CONSTANT(-0.085054779966799998)*t;
+    y[13] += CONSTANT(0.075189952564900006)*t;
+    y[15] += CONSTANT(0.101584686310000010)*t;
+    y[31] += CONSTANT(0.097043558538999999)*t;
+
+    // [19,19]: 6,8,0,20,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22];
+    y[19] += tf*g[19]+tg*f[19];
+    t = f[19]*g[19];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[8] += CONSTANT(-0.141889406570999990)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[22] += CONSTANT(-0.102084782360000000)*t;
+
+    // [19,25]: 34,
+    tf = CONSTANT(-0.130197596205999990)*f[34];
+    tg = CONSTANT(-0.130197596205999990)*g[34];
+    y[19] += tf*g[25]+tg*f[25];
+    y[25] += tf*g[19]+tg*f[19];
+    t = f[19]*g[25]+f[25]*g[19];
+    y[34] += CONSTANT(-0.130197596205999990)*t;
+
+    // [19,26]: 15,35,
+    tf = CONSTANT(-0.131668802182000000)*f[15]+CONSTANT(0.130197596204999990)*f[35];
+    tg = CONSTANT(-0.131668802182000000)*g[15]+CONSTANT(0.130197596204999990)*g[35];
+    y[19] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[19]+tg*f[19];
+    t = f[19]*g[26]+f[26]*g[19];
+    y[15] += CONSTANT(-0.131668802182000000)*t;
+    y[35] += CONSTANT(0.130197596204999990)*t;
+
+    // [19,27]: 14,32,
+    tf = CONSTANT(0.025339672793899998)*f[14]+CONSTANT(0.084042186967699994)*f[32];
+    tg = CONSTANT(0.025339672793899998)*g[14]+CONSTANT(0.084042186967699994)*g[32];
+    y[19] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[19]+tg*f[19];
+    t = f[19]*g[27]+f[27]*g[19];
+    y[14] += CONSTANT(0.025339672793899998)*t;
+    y[32] += CONSTANT(0.084042186967699994)*t;
+
+    // [19,28]: 13,3,15,31,33,
+    tf = CONSTANT(0.104682806111000000)*f[13]+CONSTANT(0.159122922869999990)*f[3]+CONSTANT(-0.126698363970000010)*f[15]+CONSTANT(0.090775936911399999)*f[31]+CONSTANT(-0.084042186968400004)*f[33];
+    tg = CONSTANT(0.104682806111000000)*g[13]+CONSTANT(0.159122922869999990)*g[3]+CONSTANT(-0.126698363970000010)*g[15]+CONSTANT(0.090775936911399999)*g[31]+CONSTANT(-0.084042186968400004)*g[33];
+    y[19] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[19]+tg*f[19];
+    t = f[19]*g[28]+f[28]*g[19];
+    y[13] += CONSTANT(0.104682806111000000)*t;
+    y[3] += CONSTANT(0.159122922869999990)*t;
+    y[15] += CONSTANT(-0.126698363970000010)*t;
+    y[31] += CONSTANT(0.090775936911399999)*t;
+    y[33] += CONSTANT(-0.084042186968400004)*t;
+
+    // [19,29]: 12,14,2,30,32,
+    tf = CONSTANT(0.115089467124000010)*f[12]+CONSTANT(-0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(-0.090775936912099994)*f[32];
+    tg = CONSTANT(0.115089467124000010)*g[12]+CONSTANT(-0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(-0.090775936912099994)*g[32];
+    y[19] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[19]+tg*f[19];
+    t = f[19]*g[29]+f[29]*g[19];
+    y[12] += CONSTANT(0.115089467124000010)*t;
+    y[14] += CONSTANT(-0.097749909977199997)*t;
+    y[2] += CONSTANT(0.240571246744999990)*t;
+    y[30] += CONSTANT(0.053152946072499999)*t;
+    y[32] += CONSTANT(-0.090775936912099994)*t;
+
+    // [20,20]: 6,0,20,
+    tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0];
+    tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0];
+    y[20] += tf*g[20]+tg*f[20];
+    t = f[20]*g[20];
+    y[6] += CONSTANT(0.163839797503000010)*t;
+    y[0] += CONSTANT(0.282094802232000010)*t;
+    y[20] += CONSTANT(0.136961139005999990)*t;
+
+    // [21,21]: 6,20,0,8,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22];
+    y[21] += tf*g[21]+tg*f[21];
+    t = f[21]*g[21];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.141889406570999990)*t;
+    y[22] += CONSTANT(0.102084782360000000)*t;
+
+    // [21,23]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24];
+    y[21] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[21]+tg*f[21];
+    t = f[21]*g[23]+f[23]*g[21];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(-0.119098912753000000)*t;
+
+    // [21,26]: 9,25,
+    tf = CONSTANT(-0.131668802182000000)*f[9]+CONSTANT(-0.130197596204999990)*f[25];
+    tg = CONSTANT(-0.131668802182000000)*g[9]+CONSTANT(-0.130197596204999990)*g[25];
+    y[21] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[21]+tg*f[21];
+    t = f[21]*g[26]+f[26]*g[21];
+    y[9] += CONSTANT(-0.131668802182000000)*t;
+    y[25] += CONSTANT(-0.130197596204999990)*t;
+
+    // [21,28]: 27,1,11,9,29,
+    tf = CONSTANT(0.084042186968400004)*f[27]+CONSTANT(0.159122922869999990)*f[1]+CONSTANT(0.104682806111000000)*f[11]+CONSTANT(0.126698363970000010)*f[9]+CONSTANT(0.090775936911399999)*f[29];
+    tg = CONSTANT(0.084042186968400004)*g[27]+CONSTANT(0.159122922869999990)*g[1]+CONSTANT(0.104682806111000000)*g[11]+CONSTANT(0.126698363970000010)*g[9]+CONSTANT(0.090775936911399999)*g[29];
+    y[21] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[21]+tg*f[21];
+    t = f[21]*g[28]+f[28]*g[21];
+    y[27] += CONSTANT(0.084042186968400004)*t;
+    y[1] += CONSTANT(0.159122922869999990)*t;
+    y[11] += CONSTANT(0.104682806111000000)*t;
+    y[9] += CONSTANT(0.126698363970000010)*t;
+    y[29] += CONSTANT(0.090775936911399999)*t;
+
+    // [21,31]: 14,2,30,12,32,
+    tf = CONSTANT(0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(0.115089467124000010)*f[12]+CONSTANT(0.090775936912099994)*f[32];
+    tg = CONSTANT(0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(0.115089467124000010)*g[12]+CONSTANT(0.090775936912099994)*g[32];
+    y[21] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[21]+tg*f[21];
+    t = f[21]*g[31]+f[31]*g[21];
+    y[14] += CONSTANT(0.097749909977199997)*t;
+    y[2] += CONSTANT(0.240571246744999990)*t;
+    y[30] += CONSTANT(0.053152946072499999)*t;
+    y[12] += CONSTANT(0.115089467124000010)*t;
+    y[32] += CONSTANT(0.090775936912099994)*t;
+
+    // [21,33]: 32,14,
+    tf = CONSTANT(0.084042186967699994)*f[32]+CONSTANT(0.025339672793899998)*f[14];
+    tg = CONSTANT(0.084042186967699994)*g[32]+CONSTANT(0.025339672793899998)*g[14];
+    y[21] += tf*g[33]+tg*f[33];
+    y[33] += tf*g[21]+tg*f[21];
+    t = f[21]*g[33]+f[33]*g[21];
+    y[32] += CONSTANT(0.084042186967699994)*t;
+    y[14] += CONSTANT(0.025339672793899998)*t;
+
+    // [21,34]: 35,
+    tf = CONSTANT(-0.130197596205999990)*f[35];
+    tg = CONSTANT(-0.130197596205999990)*g[35];
+    y[21] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[21]+tg*f[21];
+    t = f[21]*g[34]+f[34]*g[21];
+    y[35] += CONSTANT(-0.130197596205999990)*t;
+
+    // [22,22]: 6,20,0,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24];
+    y[22] += tf*g[22]+tg*f[22];
+    t = f[22]*g[22];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[24] += CONSTANT(0.135045473384000000)*t;
+
+    // [22,26]: 10,28,
CONSTANT(0.101358691174000000)*f[10]+CONSTANT(0.084042186965900004)*f[28]; + tg = CONSTANT(0.101358691174000000)*g[10]+CONSTANT(0.084042186965900004)*g[28]; + y[22] += tf*g[26]+tg*f[26]; + y[26] += tf*g[22]+tg*f[22]; + t = f[22]*g[26]+f[26]*g[22]; + y[10] += CONSTANT(0.101358691174000000)*t; + y[28] += CONSTANT(0.084042186965900004)*t; + + // [22,27]: 1,11,25, + tf = CONSTANT(0.183739324704000010)*f[1]+CONSTANT(0.101990215611000000)*f[11]+CONSTANT(0.130197596200999990)*f[25]; + tg = CONSTANT(0.183739324704000010)*g[1]+CONSTANT(0.101990215611000000)*g[11]+CONSTANT(0.130197596200999990)*g[25]; + y[22] += tf*g[27]+tg*f[27]; + y[27] += tf*g[22]+tg*f[22]; + t = f[22]*g[27]+f[27]*g[22]; + y[1] += CONSTANT(0.183739324704000010)*t; + y[11] += CONSTANT(0.101990215611000000)*t; + y[25] += CONSTANT(0.130197596200999990)*t; + + // [22,32]: 2,30,12,34, + tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(0.084042186968800003)*f[34]; + tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(0.084042186968800003)*g[34]; + y[22] += tf*g[32]+tg*f[32]; + y[32] += tf*g[22]+tg*f[22]; + t = f[22]*g[32]+f[32]*g[22]; + y[2] += CONSTANT(0.225033795606000010)*t; + y[30] += CONSTANT(-0.099440056651100006)*t; + y[12] += CONSTANT(0.022664492358099999)*t; + y[34] += CONSTANT(0.084042186968800003)*t; + + // [22,33]: 3,13,35, + tf = CONSTANT(0.183739324704000010)*f[3]+CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.130197596200999990)*f[35]; + tg = CONSTANT(0.183739324704000010)*g[3]+CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.130197596200999990)*g[35]; + y[22] += tf*g[33]+tg*f[33]; + y[33] += tf*g[22]+tg*f[22]; + t = f[22]*g[33]+f[33]*g[22]; + y[3] += CONSTANT(0.183739324704000010)*t; + y[13] += CONSTANT(0.101990215611000000)*t; + y[35] += CONSTANT(0.130197596200999990)*t; + + // [23,23]: 6,20,0, + tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0]; + tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0]; + y[23] += tf*g[23]+tg*f[23]; + t = f[23]*g[23]; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + y[0] += CONSTANT(0.282094791768999990)*t; + + // [23,26]: 1,11,29, + tf = CONSTANT(0.208340811096000000)*f[1]+CONSTANT(0.029982305185199998)*f[11]+CONSTANT(-0.118853600623999990)*f[29]; + tg = CONSTANT(0.208340811096000000)*g[1]+CONSTANT(0.029982305185199998)*g[11]+CONSTANT(-0.118853600623999990)*g[29]; + y[23] += tf*g[26]+tg*f[26]; + y[26] += tf*g[23]+tg*f[23]; + t = f[23]*g[26]+f[26]*g[23]; + y[1] += CONSTANT(0.208340811096000000)*t; + y[11] += CONSTANT(0.029982305185199998)*t; + y[29] += CONSTANT(-0.118853600623999990)*t; + + // [23,28]: 25,11,1,29, + tf = CONSTANT(-0.099440056652200001)*f[25]+CONSTANT(-0.121172043789000000)*f[11]+CONSTANT(0.060142811686500000)*f[1]+CONSTANT(-0.034310079156700000)*f[29]; + tg = CONSTANT(-0.099440056652200001)*g[25]+CONSTANT(-0.121172043789000000)*g[11]+CONSTANT(0.060142811686500000)*g[1]+CONSTANT(-0.034310079156700000)*g[29]; + y[23] += tf*g[28]+tg*f[28]; + y[28] += tf*g[23]+tg*f[23]; + t = f[23]*g[28]+f[28]*g[23]; + y[25] += CONSTANT(-0.099440056652200001)*t; + y[11] += CONSTANT(-0.121172043789000000)*t; + y[1] += CONSTANT(0.060142811686500000)*t; + y[29] += CONSTANT(-0.034310079156700000)*t; + + // [23,32]: 31,13,3,35, + tf = 
CONSTANT(0.034310079156599997)*f[31]+CONSTANT(0.121172043788000010)*f[13]+CONSTANT(-0.060142811686900000)*f[3]+CONSTANT(-0.099440056652700004)*f[35]; + tg = CONSTANT(0.034310079156599997)*g[31]+CONSTANT(0.121172043788000010)*g[13]+CONSTANT(-0.060142811686900000)*g[3]+CONSTANT(-0.099440056652700004)*g[35]; + y[23] += tf*g[32]+tg*f[32]; + y[32] += tf*g[23]+tg*f[23]; + t = f[23]*g[32]+f[32]*g[23]; + y[31] += CONSTANT(0.034310079156599997)*t; + y[13] += CONSTANT(0.121172043788000010)*t; + y[3] += CONSTANT(-0.060142811686900000)*t; + y[35] += CONSTANT(-0.099440056652700004)*t; + + // [23,33]: 2,30,12, + tf = CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30]+CONSTANT(-0.103861751821000010)*f[12]; + tg = CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30]+CONSTANT(-0.103861751821000010)*g[12]; + y[23] += tf*g[33]+tg*f[33]; + y[33] += tf*g[23]+tg*f[23]; + t = f[23]*g[33]+f[33]*g[23]; + y[2] += CONSTANT(0.196425600433000000)*t; + y[30] += CONSTANT(-0.130197596204999990)*t; + y[12] += CONSTANT(-0.103861751821000010)*t; + + // [23,34]: 3,13,31, + tf = CONSTANT(0.208340811100000000)*f[3]+CONSTANT(0.029982305185400002)*f[13]+CONSTANT(-0.118853600623000000)*f[31]; + tg = CONSTANT(0.208340811100000000)*g[3]+CONSTANT(0.029982305185400002)*g[13]+CONSTANT(-0.118853600623000000)*g[31]; + y[23] += tf*g[34]+tg*f[34]; + y[34] += tf*g[23]+tg*f[23]; + t = f[23]*g[34]+f[34]*g[23]; + y[3] += CONSTANT(0.208340811100000000)*t; + y[13] += CONSTANT(0.029982305185400002)*t; + y[31] += CONSTANT(-0.118853600623000000)*t; + + // [24,24]: 6,0,20, + tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20]; + y[24] += tf*g[24]+tg*f[24]; + t = f[24]*g[24]; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[0] += CONSTANT(0.282094791763999990)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [24,29]: 9,27,25, + tf = CONSTANT(-0.035835708931400000)*f[9]+CONSTANT(0.118853600623000000)*f[27]+CONSTANT(0.053152946071199997)*f[25]; + tg = CONSTANT(-0.035835708931400000)*g[9]+CONSTANT(0.118853600623000000)*g[27]+CONSTANT(0.053152946071199997)*g[25]; + y[24] += tf*g[29]+tg*f[29]; + y[29] += tf*g[24]+tg*f[24]; + t = f[24]*g[29]+f[29]*g[24]; + y[9] += CONSTANT(-0.035835708931400000)*t; + y[27] += CONSTANT(0.118853600623000000)*t; + y[25] += CONSTANT(0.053152946071199997)*t; + + // [24,31]: 15,33,35, + tf = CONSTANT(0.035835708931400000)*f[15]+CONSTANT(-0.118853600623000000)*f[33]+CONSTANT(0.053152946071199997)*f[35]; + tg = CONSTANT(0.035835708931400000)*g[15]+CONSTANT(-0.118853600623000000)*g[33]+CONSTANT(0.053152946071199997)*g[35]; + y[24] += tf*g[31]+tg*f[31]; + y[31] += tf*g[24]+tg*f[24]; + t = f[24]*g[31]+f[31]*g[24]; + y[15] += CONSTANT(0.035835708931400000)*t; + y[33] += CONSTANT(-0.118853600623000000)*t; + y[35] += CONSTANT(0.053152946071199997)*t; + + // [24,34]: 12,30,2, + tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.130197596199999990)*f[30]+CONSTANT(0.147319200325000010)*f[2]; + tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.130197596199999990)*g[30]+CONSTANT(0.147319200325000010)*g[2]; + y[24] += tf*g[34]+tg*f[34]; + y[34] += tf*g[24]+tg*f[24]; + t = f[24]*g[34]+f[34]*g[24]; + y[12] += CONSTANT(-0.207723503645000000)*t; + y[30] += CONSTANT(0.130197596199999990)*t; + y[2] += CONSTANT(0.147319200325000010)*t; + + // [25,25]: 0,6,20, + tf = 
CONSTANT(0.282094791761999970)*f[0]+CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(0.282094791761999970)*g[0]+CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.130197596198000000)*g[20]; + y[25] += tf*g[25]+tg*f[25]; + t = f[25]*g[25]; + y[0] += CONSTANT(0.282094791761999970)*t; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // [26,26]: 6,20,0, + tf = CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(0.282094791766000000)*g[0]; + y[26] += tf*g[26]+tg*f[26]; + t = f[26]*g[26]; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [27,27]: 0,20,6, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.016173926423100001)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.016173926423100001)*g[6]; + y[27] += tf*g[27]+tg*f[27]; + t = f[27]*g[27]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[6] += CONSTANT(0.016173926423100001)*t; + + // [28,28]: 6,0,20,24, + tf = CONSTANT(0.097043558538800007)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(-0.128376561118000000)*f[24]; + tg = CONSTANT(0.097043558538800007)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(-0.128376561118000000)*g[24]; + y[28] += tf*g[28]+tg*f[28]; + t = f[28]*g[28]; + y[6] += CONSTANT(0.097043558538800007)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[24] += CONSTANT(-0.128376561118000000)*t; + + // [29,29]: 20,6,0,22,8, + tf = CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(-0.097043558539500002)*f[22]+CONSTANT(-0.140070311615000000)*f[8]; + tg = CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(-0.097043558539500002)*g[22]+CONSTANT(-0.140070311615000000)*g[8]; + y[29] += tf*g[29]+tg*f[29]; + t = f[29]*g[29]; + y[20] += CONSTANT(0.086798397468799998)*t; + y[6] += CONSTANT(0.145565337808999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[22] += CONSTANT(-0.097043558539500002)*t; + y[8] += CONSTANT(-0.140070311615000000)*t; + + // [30,30]: 0,20,6, + tf = CONSTANT(0.282094804531000000)*f[0]+CONSTANT(0.130197634486000000)*f[20]+CONSTANT(0.161739292769000010)*f[6]; + tg = CONSTANT(0.282094804531000000)*g[0]+CONSTANT(0.130197634486000000)*g[20]+CONSTANT(0.161739292769000010)*g[6]; + y[30] += tf*g[30]+tg*f[30]; + t = f[30]*g[30]; + y[0] += CONSTANT(0.282094804531000000)*t; + y[20] += CONSTANT(0.130197634486000000)*t; + y[6] += CONSTANT(0.161739292769000010)*t; + + // [31,31]: 6,8,20,22,0, + tf = CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.140070311615000000)*f[8]+CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.097043558539500002)*f[22]+CONSTANT(0.282094791773999990)*f[0]; + tg = CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.140070311615000000)*g[8]+CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.097043558539500002)*g[22]+CONSTANT(0.282094791773999990)*g[0]; + y[31] += tf*g[31]+tg*f[31]; + t = f[31]*g[31]; + y[6] += CONSTANT(0.145565337808999990)*t; + y[8] += CONSTANT(0.140070311615000000)*t; + y[20] += 
CONSTANT(0.086798397468799998)*t; + y[22] += CONSTANT(0.097043558539500002)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + + // [32,32]: 0,24,20,6, + tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.128376561118000000)*f[24]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(0.097043558538800007)*f[6]; + tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.128376561118000000)*g[24]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(0.097043558538800007)*g[6]; + y[32] += tf*g[32]+tg*f[32]; + t = f[32]*g[32]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.128376561118000000)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[6] += CONSTANT(0.097043558538800007)*t; + + // [33,33]: 6,20,0, + tf = CONSTANT(0.016173926423100001)*f[6]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.282094791770000020)*f[0]; + tg = CONSTANT(0.016173926423100001)*g[6]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.282094791770000020)*g[0]; + y[33] += tf*g[33]+tg*f[33]; + t = f[33]*g[33]; + y[6] += CONSTANT(0.016173926423100001)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[0] += CONSTANT(0.282094791770000020)*t; + + // [34,34]: 20,6,0, + tf = CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(0.282094791766000000)*g[0]; + y[34] += tf*g[34]+tg*f[34]; + t = f[34]*g[34]; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [35,35]: 6,0,20, + tf = CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.282094791761999970)*f[0]+CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.282094791761999970)*g[0]+CONSTANT(0.130197596198000000)*g[20]; + y[35] += tf*g[35]+tg*f[35]; + t = f[35]*g[35]; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[0] += CONSTANT(0.282094791761999970)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // multiply count=2527 + + return y; +} + + +//------------------------------------------------------------------------------------- +// Evaluates a directional light and returns spectral SH data. The output +// vector is computed so that if the intensity of R/G/B is unit the resulting +// exit radiance of a point directly under the light on a diffuse object with +// an albedo of 1 would be 1.0. This will compute 3 spectral samples, resultR +// has to be specified, while resultG and resultB are optional. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204988.aspx +//------------------------------------------------------------------------------------- +bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, + _In_ FXMVECTOR dir, + _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, + _Out_writes_opt_(order*order) float *resultG, + _Out_writes_opt_(order*order) float *resultB ) +{ + if ( !resultR ) + return false; + + if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) + return false; + + XMFLOAT3A clr; + XMStoreFloat3A( &clr, color ); + + float fTmp[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; + + XMSHEvalDirection(fTmp,order,dir); // evaluate the BF in this direction... 
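+
+    // fTmp now holds the SH basis functions evaluated along dir. What follows
+    // is a per-band rescale: fNorm = XM_PI / CosWtInt(order) normalizes by the
+    // integral of the cosine-weighted transfer function for the retained bands,
+    // which is what makes a unit-intensity light produce an exit radiance of
+    // 1.0 on a unit-albedo diffuse surface, as documented above.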
+
+    // now compute "normalization" and scale vector for each valid spectral band
+    const float fNorm = XM_PI / CosWtInt(order);
+
+    const size_t numcoeff = order*order;
+
+    const float fRScale = fNorm * clr.x;
+
+    for( size_t i=0; i < numcoeff; ++i)
+    {
+        resultR[i] = fTmp[i] * fRScale;
+    }
+
+    if (resultG)
+    {
+        const float fGScale = fNorm * clr.y;
+
+        for( size_t i=0; i < numcoeff; ++i)
+        {
+            resultG[i] = fTmp[i] * fGScale;
+        }
+    }
+
+    if (resultB)
+    {
+        const float fBScale = fNorm * clr.z;
+
+        for( size_t i=0; i < numcoeff; ++i)
+        {
+            resultB[i] = fTmp[i]*fBScale;
+        }
+    }
+
+    return true;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Evaluates a spherical light and returns spectral SH data. There is no
+// normalization of the intensity of the light like there is for directional
+// lights, so care has to be taken when specifying the intensities. This will
+// compute 3 spectral samples; resultR has to be specified, while resultG and
+// resultB are optional.
+//
+// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205451.aspx
+//-------------------------------------------------------------------------------------
+bool XM_CALLCONV XMSHEvalSphericalLight( _In_ size_t order,
+                                         _In_ FXMVECTOR pos,
+                                         _In_ float radius,
+                                         _In_ FXMVECTOR color,
+                                         _Out_writes_(order*order) float *resultR,
+                                         _Out_writes_opt_(order*order) float *resultG,
+                                         _Out_writes_opt_(order*order) float *resultB )
+{
+    if ( !resultR )
+        return false;
+
+    if ( radius < 0.f )
+        return false;
+
+    const float fDist = XMVectorGetX( XMVector3Length( pos ) );
+
+    // WARNING: fDist should not be < radius - otherwise the light contains the origin
+
+    //const float fSinConeAngle = (fDist <= radius) ? 0.99999f : radius/fDist;
+    const float fConeAngle = (fDist <= radius) ? (XM_PIDIV2) : asinf(radius/fDist);
+
+    XMVECTOR dir = XMVector3Normalize( pos );
+
+    float fTmpDir[ XM_SH_MAXORDER* XM_SH_MAXORDER ]; // rotation "vector"
+    float fTmpL0[ XM_SH_MAXORDER ];
+
+    //
+    // Sphere at distance fDist: the cone angle is determined by looking at the
+    // right triangle with one side (the hypotenuse) being the vector from the
+    // origin to the center of the sphere, another side from the origin to
+    // a point on the sphere whose normal is perpendicular to the given side (this
+    // is one of the points on the cone that is defined by the projection of the sphere
+    // through the origin - we want to find the angle of this cone) and the final
+    // side being from the center of the sphere to the point of tangency (the two
+    // sides connected to this are at a right angle by construction.)
+    // From trig we know that sin(theta) = ||opposite||/||hypotenuse||, where
+    // ||opposite|| = Radius, ||hypotenuse|| = fDist
+    // theta is the angle of the cone that subtends the sphere from the origin
+    //
+
+    // no default normalization is done for this case; you have to be careful how
+    // you represent the coefficients...
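+
+    // Worked example of the geometry above: a sphere of radius 1 whose center
+    // sits at fDist = 2 gives sin(theta) = radius/fDist = 0.5, so fConeAngle =
+    // asinf(0.5f), about 0.5236 radians (30 degrees) - i.e. the sphere subtends
+    // a cone with a 60-degree full apex angle as seen from the origin.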
+
+    const float fNewNorm = 1.0f;///(fSinConeAngle*fSinConeAngle);
+
+    ComputeCapInt(order,fConeAngle,fTmpL0);
+
+    XMFLOAT3A vd;
+    XMStoreFloat3( &vd, dir );
+
+    const float fX = vd.x;
+    const float fY = vd.y;
+    const float fZ = vd.z;
+
+    switch (order)
+    {
+    case 2:
+        sh_eval_basis_1(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 3:
+        sh_eval_basis_2(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 4:
+        sh_eval_basis_3(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 5:
+        sh_eval_basis_4(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 6:
+        sh_eval_basis_5(fX,fY,fZ,fTmpDir);
+        break;
+
+    default:
+        assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER );
+        return false;
+    }
+
+    XMFLOAT3A clr;
+    XMStoreFloat3A( &clr, color );
+
+    for( size_t i=0; i < order; ++i)
+    {
+        const size_t cNumCoefs = 2*i + 1;
+        const size_t cStart = i*i;
+        const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+        for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x;
+    }
+
+    if ( resultG )
+    {
+        for( size_t i=0; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y;
+        }
+    }
+
+    if ( resultB )
+    {
+        for( size_t i=0; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z;
+        }
+    }
+
+    return true;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Evaluates a cone of constant intensity (the radius parameter is the cone's
+// half-angle in radians) and returns spectral SH data. resultR has to be
+// specified, while resultG and resultB are optional.
+//-------------------------------------------------------------------------------------
+bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order,
+                                    _In_ FXMVECTOR dir,
+                                    _In_ float radius,
+                                    _In_ FXMVECTOR color,
+                                    _Out_writes_(order*order) float *resultR,
+                                    _Out_writes_opt_(order*order) float *resultG,
+                                    _Out_writes_opt_(order*order) float *resultB )
+{
+    if ( !resultR )
+        return false;
+
+    if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER )
+        return false;
+
+    if ( radius < 0.f || radius > (XM_PI*1.00001f) )
+        return false;
+
+    if (radius < 0.0001f)
+    {
+        // turn it into a pure directional light...
+        return XMSHEvalDirectionalLight(order, dir,color,resultR,resultG,resultB);
+    }
+    else
+    {
+        float fTmpL0[ XM_SH_MAXORDER ];
+        float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER ];
+
+        const float fConeAngle = radius;
+        const float fAngCheck = (fConeAngle > XM_PIDIV2) ? (XM_PIDIV2) : fConeAngle;
+
+        const float fNewNorm = 1.0f/(sinf(fAngCheck)*sinf(fAngCheck));
+
+        ComputeCapInt(order,fConeAngle,fTmpL0);
+
+        XMFLOAT3A vd;
+        XMStoreFloat3( &vd, dir );
+
+        const float fX = vd.x;
+        const float fY = vd.y;
+        const float fZ = vd.z;
+
+        switch (order)
+        {
+        case 2:
+            sh_eval_basis_1(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 3:
+            sh_eval_basis_2(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 4:
+            sh_eval_basis_3(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 5:
+            sh_eval_basis_4(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 6:
+            sh_eval_basis_5(fX,fY,fZ,fTmpDir);
+            break;
+
+        default:
+            assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER );
+            return false;
+        }
+
+        XMFLOAT3A clr;
+        XMStoreFloat3A( &clr, color );
+
+        for( size_t i=0; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x;
+        }
+
+        if ( resultG )
+        {
+            for( size_t i=0; i < order; ++i)
+            {
+                const size_t cNumCoefs = 2*i + 1;
+                const size_t cStart = i*i;
+                const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+                for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y;
+            }
+        }
+
+        if ( resultB )
+        {
+            for( size_t i=0; i < order; ++i)
+            {
+                const size_t cNumCoefs = 2*i + 1;
+                const size_t cStart = i*i;
+                const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+                for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z;
+            }
+        }
+    }
+
+    return true;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Evaluates a hemisphere light and returns spectral SH data: the sphere of
+// incoming directions is a linear blend from topColor along +dir to
+// bottomColor along -dir. resultR has to be specified, while resultG and
+// resultB are optional.
+//-------------------------------------------------------------------------------------
+bool XM_CALLCONV XMSHEvalHemisphereLight( _In_ size_t order,
+                                          _In_ FXMVECTOR dir,
+                                          _In_ FXMVECTOR topColor,
+                                          _In_ FXMVECTOR bottomColor,
+                                          _Out_writes_(order*order) float *resultR,
+                                          _Out_writes_opt_(order*order) float *resultG,
+                                          _Out_writes_opt_(order*order) float *resultB )
+{
+    if ( !resultR )
+        return false;
+
+    if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER )
+        return false;
+
+    // separate R/G/B colors...
+
+    float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; // rotation "vector"
+    float fTmpL0[ XM_SH_MAXORDER ];
+
+    const float fNewNorm = 3.0f/2.0f; // normalizes things for 1 sky color, 0 ground color...
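+
+    // The hemisphere light only populates the first two SH bands: fTmpL0[0]
+    // (band 0) gets the average of the sky and ground colors, fTmpL0[1]
+    // (band 1, the linear band along dir) gets the sky-minus-average delta,
+    // and every higher-order coefficient is written as zero below. The 3/2
+    // factor above restores the documented unit-response convention for a
+    // sky color of 1 and a ground color of 0.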
+
+    XMFLOAT3A vd;
+    XMStoreFloat3( &vd, dir );
+
+    const float fX = vd.x;
+    const float fY = vd.y;
+    const float fZ = vd.z;
+
+    sh_eval_basis_1(fX,fY,fZ,fTmpDir);
+
+    XMFLOAT3A clrTop;
+    XMStoreFloat3A( &clrTop, topColor );
+
+    XMFLOAT3A clrBottom;
+    XMStoreFloat3A( &clrBottom, bottomColor );
+
+    float fA = clrTop.x;
+    float fAvrg = (clrTop.x + clrBottom.x)*0.5f;
+
+    fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi;
+    fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3;
+
+    size_t i = 0;
+    for( ; i<2; ++i)
+    {
+        _Analysis_assume_(i < order);
+        const size_t cNumCoefs = 2*i + 1;
+        const size_t cStart = i*i;
+        const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+        for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = fTmpDir[cStart+j]*fValUse;
+    }
+
+    for( ; i < order; ++i)
+    {
+        const size_t cNumCoefs = 2*i + 1;
+        const size_t cStart = i*i;
+        for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = 0.0f;
+    }
+
+    if ( resultG )
+    {
+        fA = clrTop.y;
+        fAvrg = (clrTop.y + clrBottom.y)*0.5f;
+
+        fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi;
+        fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3;
+
+        for( i=0; i<2; ++i)
+        {
+            _Analysis_assume_(i < order);
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = fTmpDir[cStart+j]*fValUse;
+        }
+
+        for( ; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = 0.0f;
+        }
+    }
+
+    if ( resultB )
+    {
+        fA = clrTop.z;
+        fAvrg = (clrTop.z + clrBottom.z)*0.5f;
+
+        fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi;
+        fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3;
+
+        for( i=0; i<2; ++i)
+        {
+            _Analysis_assume_(i < order);
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = fTmpDir[cStart+j]*fValUse;
+        }
+
+        for( ; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = 0.0f;
+        }
+    }
+
+    return true;
+}
+
+}; // namespace DirectX
diff --git a/SHMath/DirectXSH.h b/SHMath/DirectXSH.h
--- a/SHMath/DirectXSH.h
+++ b/SHMath/DirectXSH.h
-//-------------------------------------------------------------------------------------
-// DirectXSH.h -- C++ Spherical Harmonics Math Library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/p/?LinkId=262885
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#include <DirectXMath.h>
-
-#include
-
-struct ID3D11DeviceContext;
-struct ID3D11Texture2D;
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-const size_t XM_SH_MINORDER = 2;
-const size_t XM_SH_MAXORDER = 6;
-
-float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMVECTOR dir );
-
-float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMMATRIX rotMatrix, _In_reads_(order*order) const float *input );
-
-float* XMSHRotateZ( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ float angle, _In_reads_(order*order) const float *input );
-
-float* XMSHAdd( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB );
-
-float* XMSHScale( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *input, _In_ float scale );
-
-float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB );
-
-float* XMSHMultiply( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputF, _In_reads_(order*order) const float *inputG );
-
-float* XMSHMultiply2( _Out_writes_(4) float *result, _In_reads_(4) const float *inputF, _In_reads_(4) const float *inputG );
-
-float* XMSHMultiply3( _Out_writes_(9) float *result, _In_reads_(9) const float *inputF, _In_reads_(9) const float *inputG );
-
-float* XMSHMultiply4( _Out_writes_(16) float *result, _In_reads_(16) const float *inputF, _In_reads_(16) const float *inputG );
-
-float* XMSHMultiply5( _Out_writes_(25) float *result, _In_reads_(25) const float *inputF, _In_reads_(25) const float *inputG );
-
-float* XMSHMultiply6( _Out_writes_(36) float *result, _In_reads_(36) const float *inputF, _In_reads_(36) const float *inputG );
-
-bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR color,
-    _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB );
-
-bool XM_CALLCONV XMSHEvalSphericalLight( _In_ size_t order, _In_ FXMVECTOR pos, _In_ float radius, _In_ FXMVECTOR color,
-    _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB );
-
-bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ float radius, _In_ FXMVECTOR color,
-    _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB );
-
-bool XM_CALLCONV 
XMSHEvalHemisphereLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR topColor, _In_ FXMVECTOR bottomColor, - _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); - -HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context, _In_ size_t order, _In_ ID3D11Texture2D *cubeMap, - _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); - -}; // namespace DirectX +//------------------------------------------------------------------------------------- +// DirectXSH.h -- C++ Spherical Harmonics Math Library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#define DIRECTX_SHMATH_VERSION 102 + +#include + +#include + +struct ID3D11DeviceContext; +struct ID3D11Texture2D; + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +const size_t XM_SH_MINORDER = 2; +const size_t XM_SH_MAXORDER = 6; + +float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMVECTOR dir ); + +float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMMATRIX rotMatrix, _In_reads_(order*order) const float *input ); + +float* XMSHRotateZ( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ float angle, _In_reads_(order*order) const float *input ); + +float* XMSHAdd( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ); + +float* XMSHScale( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *input, _In_ float scale ); + +float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ); + +float* XMSHMultiply( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputF, _In_reads_(order*order) const float *inputG ); + +float* XMSHMultiply2( _Out_writes_(4) float *result, _In_reads_(4) const float *inputF, _In_reads_(4) const float *inputG ); + +float* XMSHMultiply3( _Out_writes_(9) float *result, _In_reads_(9) const float *inputF, _In_reads_(9) const float *inputG ); + +float* XMSHMultiply4( _Out_writes_(16) float *result, _In_reads_(16) const float *inputF, _In_reads_(16) const float *inputG ); + +float* XMSHMultiply5( _Out_writes_(25) float *result, _In_reads_(25) const float *inputF, _In_reads_(25) const float *inputG ); + +float* XMSHMultiply6( _Out_writes_(36) float *result, _In_reads_(36) const float *inputF, _In_reads_(36) const float *inputG ); + +bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +bool XM_CALLCONV 
XMSHEvalSphericalLight( _In_ size_t order, _In_ FXMVECTOR pos, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +bool XM_CALLCONV XMSHEvalHemisphereLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR topColor, _In_ FXMVECTOR bottomColor, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context, _In_ size_t order, _In_ ID3D11Texture2D *cubeMap, + _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +}; // namespace DirectX diff --git a/SHMath/DirectXSHD3D11.cpp b/SHMath/DirectXSHD3D11.cpp index 9556526..73e24ad 100644 --- a/SHMath/DirectXSHD3D11.cpp +++ b/SHMath/DirectXSHD3D11.cpp @@ -1,390 +1,390 @@ -//------------------------------------------------------------------------------------- -// DirectXSHD3D11.cpp -- C++ Spherical Harmonics Math Library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/p/?LinkId=262885 -//------------------------------------------------------------------------------------- - -#include "DirectXSH.h" - -#include - -#include - -#include -#include -#include - -namespace -{ -struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; - -typedef std::unique_ptr ScopedAlignedArrayXMVECTOR; - -template class ScopedObject -{ -public: - explicit ScopedObject( T *p = 0 ) : _pointer(p) {} - ~ScopedObject() - { - if ( _pointer ) - { - _pointer->Release(); - _pointer = nullptr; - } - } - - bool IsNull() const { return (!_pointer); } - - T& operator*() { return *_pointer; } - T* operator->() { return _pointer; } - T** operator&() { return &_pointer; } - - void Reset(T *p = 0) { if ( _pointer ) { _pointer->Release(); } _pointer = p; } - - T* Get() const { return _pointer; } - -private: - ScopedObject(const ScopedObject&); - ScopedObject& operator=(const ScopedObject&); - - T* _pointer; -}; - -//------------------------------------------------------------------------------------- -// This code is lifted from DirectXTex http://directxtex.codeplex.com/ -// If you need additional DXGI format support, see DirectXTexConvert.cpp -//------------------------------------------------------------------------------------- -#define LOAD_SCANLINE( type, func )\ - if ( size >= sizeof(type) )\ - {\ - const type * __restrict sPtr = reinterpret_cast(pSource);\ - for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ - {\ - if ( dPtr >= ePtr ) break;\ - *(dPtr++) = func( sPtr++ );\ - }\ - return true;\ - }\ - return false; - -#define LOAD_SCANLINE3( type, func, defvec )\ - if ( size >= sizeof(type) )\ - {\ - const type * __restrict sPtr = reinterpret_cast(pSource);\ - for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += 
sizeof(type) )\ - {\ - XMVECTOR v = func( sPtr++ );\ - if ( dPtr >= ePtr ) break;\ - *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ - }\ - return true;\ - }\ - return false; - -#define LOAD_SCANLINE2( type, func, defvec )\ - if ( size >= sizeof(type) )\ - {\ - const type * __restrict sPtr = reinterpret_cast(pSource);\ - for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ - {\ - XMVECTOR v = func( sPtr++ );\ - if ( dPtr >= ePtr ) break;\ - *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ - }\ - return true;\ - }\ - return false; - -#pragma warning(push) -#pragma warning(disable : 6101) -_Success_(return) -static bool _LoadScanline( _Out_writes_(count) DirectX::XMVECTOR* pDestination, _In_ size_t count, - _In_reads_bytes_(size) LPCVOID pSource, _In_ size_t size, _In_ DXGI_FORMAT format ) -{ - assert( pDestination && count > 0 && (((uintptr_t)pDestination & 0xF) == 0) ); - assert( pSource && size > 0 ); - - using namespace DirectX; - using namespace DirectX::PackedVector; - - XMVECTOR* __restrict dPtr = pDestination; - if ( !dPtr ) - return false; - - const XMVECTOR* ePtr = pDestination + count; - - switch( format ) - { - case DXGI_FORMAT_R32G32B32A32_FLOAT: - { - size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size; - memcpy_s( dPtr, sizeof(XMVECTOR)*count, pSource, msize ); - } - return true; - - case DXGI_FORMAT_R32G32B32_FLOAT: - LOAD_SCANLINE3( XMFLOAT3, XMLoadFloat3, g_XMIdentityR3 ) - - case DXGI_FORMAT_R16G16B16A16_FLOAT: - LOAD_SCANLINE( XMHALF4, XMLoadHalf4 ) - - case DXGI_FORMAT_R32G32_FLOAT: - LOAD_SCANLINE2( XMFLOAT2, XMLoadFloat2, g_XMIdentityR3 ) - - case DXGI_FORMAT_R11G11B10_FLOAT: - LOAD_SCANLINE3( XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3 ); - - case DXGI_FORMAT_R16G16_FLOAT: - LOAD_SCANLINE2( XMHALF2, XMLoadHalf2, g_XMIdentityR3 ) - - case DXGI_FORMAT_R32_FLOAT: - if ( size >= sizeof(float) ) - { - const float* __restrict sPtr = reinterpret_cast(pSource); - for( size_t icount = 0; icount < size; icount += sizeof(float) ) - { - XMVECTOR v = XMLoadFloat( sPtr++ ); - if ( dPtr >= ePtr ) break; - *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 ); - } - return true; - } - return false; - - case DXGI_FORMAT_R16_FLOAT: - if ( size >= sizeof(HALF) ) - { - const HALF * __restrict sPtr = reinterpret_cast(pSource); - for( size_t icount = 0; icount < size; icount += sizeof(HALF) ) - { - if ( dPtr >= ePtr ) break; - *(dPtr++) = XMVectorSet( XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f ); - } - return true; - } - return false; - - default: - return false; - } -} -#pragma warning(pop) - -}; // namespace anonymous - -namespace DirectX -{ - -//------------------------------------------------------------------------------------- -// Projects a function represented in a cube map into spherical harmonics. 
-// -// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx -//------------------------------------------------------------------------------------- -HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context, - _In_ size_t order, - _In_ ID3D11Texture2D *cubeMap, - _Out_writes_opt_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float* resultB ) -{ - if ( !context || !cubeMap ) - return E_INVALIDARG; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return E_INVALIDARG; - - D3D11_TEXTURE2D_DESC desc; - cubeMap->GetDesc( &desc ); - - if ( (desc.ArraySize != 6) - || (desc.Width != desc.Height) - || (desc.SampleDesc.Count > 1) ) - return E_FAIL; - - switch( desc.Format ) - { - case DXGI_FORMAT_R32G32B32A32_FLOAT: - case DXGI_FORMAT_R32G32B32_FLOAT: - case DXGI_FORMAT_R16G16B16A16_FLOAT: - case DXGI_FORMAT_R32G32_FLOAT: - case DXGI_FORMAT_R11G11B10_FLOAT: - case DXGI_FORMAT_R16G16_FLOAT: - case DXGI_FORMAT_R32_FLOAT: - case DXGI_FORMAT_R16_FLOAT: - // See _LoadScanline to support more pixel formats - break; - - default: - return E_FAIL; - } - - //--- Create a staging resource copy (if needed) to be able to read data - ID3D11Texture2D* texture = nullptr; - - ScopedObject staging; - if ( !(desc.CPUAccessFlags & D3D11_CPU_ACCESS_READ) ) - { - D3D11_TEXTURE2D_DESC sdesc = desc; - sdesc.BindFlags = 0; - sdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; - sdesc.Usage = D3D11_USAGE_STAGING; - - ScopedObject device; - context->GetDevice( &device ); - assert( !device.IsNull() ); - - HRESULT hr = device->CreateTexture2D( &sdesc, nullptr, &staging ); - if ( FAILED(hr) ) - return hr; - - context->CopyResource( staging.Get(), cubeMap ); - - texture = staging.Get(); - } - else - texture = cubeMap; - - assert( texture != 0 ); - - //--- Setup for SH projection - ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( sizeof(XMVECTOR)*desc.Width, 16 ) ) ); - if ( !scanline ) - return E_OUTOFMEMORY; - - assert( desc.Width > 0 ); - float fSize = static_cast( desc.Width ); - float fPicSize = 1.0f / fSize; - - // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/w - // linear function x*S +B, 1st constraint means B is (-1+1/W), plug into - // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did - // this was incorrect - but only for computing the differential solid - // angle, where the final value was 1.0 instead of 1-1/w... - - float fB = -1.0f + 1.0f/fSize; - float fS = ( desc.Width > 1 ) ? 
(2.0f*(1.0f-1.0f/fSize)/(fSize-1.0f)) : 0.f; - - // clear out accumulation variables - float fWt = 0.0f; - - if ( resultR ) - memset( resultR, 0, sizeof(float)*order*order ); - if ( resultG ) - memset( resultG, 0, sizeof(float)*order*order ); - if ( resultB ) - memset( resultB, 0, sizeof(float)*order*order ); - - float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER]; - float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER]; - - //--- Process each face of the cubemap - for (UINT face=0; face < 6; ++face ) - { - UINT dindex = D3D11CalcSubresource( 0, face, desc.MipLevels ); - - D3D11_MAPPED_SUBRESOURCE mapped; - HRESULT hr = context->Map( texture, dindex, D3D11_MAP_READ, 0, &mapped ); - if ( FAILED(hr) ) - return hr; - - const uint8_t *pSrc = reinterpret_cast(mapped.pData); - for( UINT y=0; y < desc.Height; ++y ) - { - XMVECTOR* ptr = scanline.get(); - if ( !_LoadScanline( ptr, desc.Width, pSrc, mapped.RowPitch, desc.Format ) ) - { - context->Unmap( texture, dindex ); - return E_FAIL; - } - - const float fV = y*fS + fB; - - XMVECTOR* pixel = ptr; - for( UINT x=0; x < desc.Width; ++x, ++pixel ) - { - const float fU = x*fS + fB; - - float ix, iy, iz; - switch( face ) - { - case 0: // Positive X - iz = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = 1.0f; - break; - - case 1: // Negative X - iz = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = -1; - break; - - case 2: // Positive Y - iz = -1.0f + (2.0f * (float)y + 1.0f) * fPicSize; - iy = 1.0f; - ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - break; - - case 3: // Negative Y - iz = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - iy = -1.0f; - ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - break; - - case 4: // Positive Z - iz = 1.0f; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - break; - - case 5: // Negative Z - iz = -1.0f; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize; - break; - - default: - ix = iy = iz = 0.f; - assert(false); - break; - } - - XMVECTOR dir = XMVectorSet( ix, iy, iz, 0 ); - dir = XMVector3Normalize( dir ); - - const float fDiffSolid = 4.0f/((1.0f + fU*fU + fV*fV)*sqrtf(1.0f + fU*fU+fV*fV)); - fWt += fDiffSolid; - - XMSHEvalDirection(shBuff,order,dir); - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, *pixel ); - - if ( resultR ) XMSHAdd(resultR,order,resultR, XMSHScale(shBuffB,order,shBuff,clr.x*fDiffSolid) ); - if ( resultG ) XMSHAdd(resultG,order,resultG, XMSHScale(shBuffB,order,shBuff,clr.y*fDiffSolid) ); - if ( resultB ) XMSHAdd(resultB,order,resultB, XMSHScale(shBuffB,order,shBuff,clr.z*fDiffSolid) ); - } - - pSrc += mapped.RowPitch; - } - - context->Unmap( texture, dindex ); - } - - const float fNormProj = (4.0f*XM_PI)/fWt; - - if ( resultR ) XMSHScale(resultR,order,resultR,fNormProj); - if ( resultG ) XMSHScale(resultG,order,resultG,fNormProj); - if ( resultB ) XMSHScale(resultB,order,resultB,fNormProj); - - return S_OK; -} - -}; // namespace DirectX +//------------------------------------------------------------------------------------- +// DirectXSHD3D11.cpp -- C++ Spherical Harmonics Math Library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/p/?LinkId=262885
+//-------------------------------------------------------------------------------------
+
+#include "DirectXSH.h"
+
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace
+{
+struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } };
+
+typedef std::unique_ptr<DirectX::XMVECTOR[], aligned_deleter> ScopedAlignedArrayXMVECTOR;
+
+template<typename T> class ScopedObject
+{
+public:
+    explicit ScopedObject( T *p = 0 ) : _pointer(p) {}
+    ~ScopedObject()
+    {
+        if ( _pointer )
+        {
+            _pointer->Release();
+            _pointer = nullptr;
+        }
+    }
+
+    bool IsNull() const { return (!_pointer); }
+
+    T& operator*() { return *_pointer; }
+    T* operator->() { return _pointer; }
+    T** operator&() { return &_pointer; }
+
+    void Reset(T *p = 0) { if ( _pointer ) { _pointer->Release(); } _pointer = p; }
+
+    T* Get() const { return _pointer; }
+
+private:
+    ScopedObject(const ScopedObject&);
+    ScopedObject& operator=(const ScopedObject&);
+
+    T* _pointer;
+};
+
+//-------------------------------------------------------------------------------------
+// This code is lifted from DirectXTex http://directxtex.codeplex.com/
+// If you need additional DXGI format support, see DirectXTexConvert.cpp
+//-------------------------------------------------------------------------------------
+#define LOAD_SCANLINE( type, func )\
+    if ( size >= sizeof(type) )\
+    {\
+        const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+        for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\
+        {\
+            if ( dPtr >= ePtr ) break;\
+            *(dPtr++) = func( sPtr++ );\
+        }\
+        return true;\
+    }\
+    return false;
+
+#define LOAD_SCANLINE3( type, func, defvec )\
+    if ( size >= sizeof(type) )\
+    {\
+        const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+        for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\
+        {\
+            XMVECTOR v = func( sPtr++ );\
+            if ( dPtr >= ePtr ) break;\
+            *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\
+        }\
+        return true;\
+    }\
+    return false;
+
+#define LOAD_SCANLINE2( type, func, defvec )\
+    if ( size >= sizeof(type) )\
+    {\
+        const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+        for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\
+        {\
+            XMVECTOR v = func( sPtr++ );\
+            if ( dPtr >= ePtr ) break;\
+            *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\
+        }\
+        return true;\
+    }\
+    return false;
+
+#pragma warning(push)
+#pragma warning(disable : 6101)
+_Success_(return)
+static bool _LoadScanline( _Out_writes_(count) DirectX::XMVECTOR* pDestination, _In_ size_t count,
+                           _In_reads_bytes_(size) LPCVOID pSource, _In_ size_t size, _In_ DXGI_FORMAT format )
+{
+    assert( pDestination && count > 0 && (((uintptr_t)pDestination & 0xF) == 0) );
+    assert( pSource && size > 0 );
+
+    using namespace DirectX;
+    using namespace DirectX::PackedVector;
+
+    XMVECTOR* __restrict dPtr = pDestination;
+    if ( !dPtr )
+        return false;
+
+    const XMVECTOR* ePtr = pDestination + count;
+
+    switch( format )
+    {
+    case DXGI_FORMAT_R32G32B32A32_FLOAT:
+        {
+            size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size;
+            memcpy_s( dPtr, sizeof(XMVECTOR)*count, pSource, msize );
+        }
+        return true;
+
+    case DXGI_FORMAT_R32G32B32_FLOAT:
+        LOAD_SCANLINE3( XMFLOAT3, XMLoadFloat3, g_XMIdentityR3 )
+
+    case DXGI_FORMAT_R16G16B16A16_FLOAT:
+        LOAD_SCANLINE( XMHALF4, XMLoadHalf4 )
+
+    case DXGI_FORMAT_R32G32_FLOAT:
+        LOAD_SCANLINE2( XMFLOAT2, XMLoadFloat2, g_XMIdentityR3 )
+
+    case DXGI_FORMAT_R11G11B10_FLOAT:
+        LOAD_SCANLINE3( XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3 );
+
+    case DXGI_FORMAT_R16G16_FLOAT:
+        LOAD_SCANLINE2( XMHALF2, XMLoadHalf2, g_XMIdentityR3 )
+
+    case DXGI_FORMAT_R32_FLOAT:
+        if ( size >= sizeof(float) )
+        {
+            const float* __restrict sPtr = reinterpret_cast<const float*>(pSource);
+            for( size_t icount = 0; icount < size; icount += sizeof(float) )
+            {
+                XMVECTOR v = XMLoadFloat( sPtr++ );
+                if ( dPtr >= ePtr ) break;
+                *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 );
+            }
+            return true;
+        }
+        return false;
+
+    case DXGI_FORMAT_R16_FLOAT:
+        if ( size >= sizeof(HALF) )
+        {
+            const HALF * __restrict sPtr = reinterpret_cast<const HALF*>(pSource);
+            for( size_t icount = 0; icount < size; icount += sizeof(HALF) )
+            {
+                if ( dPtr >= ePtr ) break;
+                *(dPtr++) = XMVectorSet( XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f );
+            }
+            return true;
+        }
+        return false;
+
+    default:
+        return false;
+    }
+}
+#pragma warning(pop)
+
+}; // namespace anonymous
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Projects a function represented in a cube map into spherical harmonics.
+//
+// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx
+//-------------------------------------------------------------------------------------
+HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context,
+                          _In_ size_t order,
+                          _In_ ID3D11Texture2D *cubeMap,
+                          _Out_writes_opt_(order*order) float *resultR,
+                          _Out_writes_opt_(order*order) float *resultG,
+                          _Out_writes_opt_(order*order) float* resultB )
+{
+    if ( !context || !cubeMap )
+        return E_INVALIDARG;
+
+    if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER )
+        return E_INVALIDARG;
+
+    D3D11_TEXTURE2D_DESC desc;
+    cubeMap->GetDesc( &desc );
+
+    if ( (desc.ArraySize != 6)
+         || (desc.Width != desc.Height)
+         || (desc.SampleDesc.Count > 1) )
+        return E_FAIL;
+
+    switch( desc.Format )
+    {
+    case DXGI_FORMAT_R32G32B32A32_FLOAT:
+    case DXGI_FORMAT_R32G32B32_FLOAT:
+    case DXGI_FORMAT_R16G16B16A16_FLOAT:
+    case DXGI_FORMAT_R32G32_FLOAT:
+    case DXGI_FORMAT_R11G11B10_FLOAT:
+    case DXGI_FORMAT_R16G16_FLOAT:
+    case DXGI_FORMAT_R32_FLOAT:
+    case DXGI_FORMAT_R16_FLOAT:
+        // See _LoadScanline to support more pixel formats
+        break;
+
+    default:
+        return E_FAIL;
+    }
+
+    //--- Create a staging resource copy (if needed) to be able to read data
+    ID3D11Texture2D* texture = nullptr;
+
+    ScopedObject<ID3D11Texture2D> staging;
+    if ( !(desc.CPUAccessFlags & D3D11_CPU_ACCESS_READ) )
+    {
+        D3D11_TEXTURE2D_DESC sdesc = desc;
+        sdesc.BindFlags = 0;
+        sdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+        sdesc.Usage = D3D11_USAGE_STAGING;
+
+        ScopedObject<ID3D11Device> device;
+        context->GetDevice( &device );
+        assert( !device.IsNull() );
+
+        HRESULT hr = device->CreateTexture2D( &sdesc, nullptr, &staging );
+        if ( FAILED(hr) )
+            return hr;
+
+        context->CopyResource( staging.Get(), cubeMap );
+
+        texture = staging.Get();
+    }
+    else
+        texture = cubeMap;
+
+    assert( texture != 0 );
+
+    //--- Setup for SH projection
+    ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( sizeof(XMVECTOR)*desc.Width, 16 ) ) );
+    if ( !scanline )
+        return E_OUTOFMEMORY;
+
+    assert( desc.Width > 0 );
+    float fSize = static_cast<float>( desc.Width );
+    float fPicSize = 1.0f / fSize;
+
+    // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/W
+    // linear function x*S + B, 1st constraint means B is (-1+1/W); plug into the
+    // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did
+    // this was incorrect - but only for computing the differential solid
+    // angle, where the final value was 1.0 instead of 1-1/W...
+
+    float fB = -1.0f + 1.0f/fSize;
+    float fS = ( desc.Width > 1 ) ? (2.0f*(1.0f-1.0f/fSize)/(fSize-1.0f)) : 0.f;
+
+    // clear out accumulation variables
+    float fWt = 0.0f;
+
+    if ( resultR )
+        memset( resultR, 0, sizeof(float)*order*order );
+    if ( resultG )
+        memset( resultG, 0, sizeof(float)*order*order );
+    if ( resultB )
+        memset( resultB, 0, sizeof(float)*order*order );
+
+    float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER];
+    float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER];
+
+    //--- Process each face of the cubemap
+    for (UINT face=0; face < 6; ++face )
+    {
+        UINT dindex = D3D11CalcSubresource( 0, face, desc.MipLevels );
+
+        D3D11_MAPPED_SUBRESOURCE mapped;
+        HRESULT hr = context->Map( texture, dindex, D3D11_MAP_READ, 0, &mapped );
+        if ( FAILED(hr) )
+            return hr;
+
+        const uint8_t *pSrc = reinterpret_cast<const uint8_t*>(mapped.pData);
+        for( UINT y=0; y < desc.Height; ++y )
+        {
+            XMVECTOR* ptr = scanline.get();
+            if ( !_LoadScanline( ptr, desc.Width, pSrc, mapped.RowPitch, desc.Format ) )
+            {
+                context->Unmap( texture, dindex );
+                return E_FAIL;
+            }
+
+            const float fV = y*fS + fB;
+
+            XMVECTOR* pixel = ptr;
+            for( UINT x=0; x < desc.Width; ++x, ++pixel )
+            {
+                const float fU = x*fS + fB;
+
+                float ix, iy, iz;
+                switch( face )
+                {
+                case 0: // Positive X
+                    iz = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = 1.0f;
+                    break;
+
+                case 1: // Negative X
+                    iz = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = -1.0f;
+                    break;
+
+                case 2: // Positive Y
+                    iz = -1.0f + (2.0f * (float)y + 1.0f) * fPicSize;
+                    iy = 1.0f;
+                    ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                case 3: // Negative Y
+                    iz = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    iy = -1.0f;
+                    ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                case 4: // Positive Z
+                    iz = 1.0f;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                case 5: // Negative Z
+                    iz = -1.0f;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                default:
+                    ix = iy = iz = 0.f;
+                    assert(false);
+                    break;
+                }
+
+                XMVECTOR dir = XMVectorSet( ix, iy, iz, 0 );
+                dir = XMVector3Normalize( dir );
+
+                const float fDiffSolid = 4.0f/((1.0f + fU*fU + fV*fV)*sqrtf(1.0f + fU*fU + fV*fV));
+                fWt += fDiffSolid;
+
+                XMSHEvalDirection(shBuff,order,dir);
+
+                XMFLOAT3A clr;
+                XMStoreFloat3A( &clr, *pixel );
+
+                if ( resultR ) XMSHAdd(resultR,order,resultR, XMSHScale(shBuffB,order,shBuff,clr.x*fDiffSolid) );
+                if ( resultG ) XMSHAdd(resultG,order,resultG, XMSHScale(shBuffB,order,shBuff,clr.y*fDiffSolid) );
+                if ( resultB ) XMSHAdd(resultB,order,resultB, XMSHScale(shBuffB,order,shBuff,clr.z*fDiffSolid) );
+            }
+
+            pSrc += mapped.RowPitch;
+        }
+
+        context->Unmap( texture, dindex );
+    }
+
+    const float fNormProj = (4.0f*XM_PI)/fWt;
+
+    if ( resultR ) XMSHScale(resultR,order,resultR,fNormProj);
+    if ( resultG ) XMSHScale(resultG,order,resultG,fNormProj);
+    if ( resultB ) 
XMSHScale(resultB,order,resultB,fNormProj); + + return S_OK; +} + +}; // namespace DirectX diff --git a/XDSP/XDSP.h b/XDSP/XDSP.h index 28dfe8b..802386f 100644 --- a/XDSP/XDSP.h +++ b/XDSP/XDSP.h @@ -1,811 +1,811 @@ -//-------------------------------------------------------------------------------------- -// File: XDSP.h -// -// DirectXMath based Digital Signal Processing (DSP) functions for audio, -// primarily Fast Fourier Transform (FFT) -// -// All buffer parameters must be 16-byte aligned -// -// All FFT functions support only single-precision floating-point audio -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615557 -//-------------------------------------------------------------------------------------- - -#pragma once - -#include -#include - -#pragma warning(push) -#pragma warning(disable : 4005 4668) -#include -#pragma warning(pop) - -#pragma warning(push) -#pragma warning(disable: 4328 4640 6001 6262) - -namespace XDSP -{ - #if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) - #define XM_CALLCONV __fastcall - typedef const DirectX::XMVECTOR& HXMVECTOR; - typedef const DirectX::XMMATRIX& FXMMATRIX; - #endif - - typedef DirectX::XMVECTOR XMVECTOR; - typedef DirectX::FXMVECTOR FXMVECTOR; - typedef DirectX::GXMVECTOR GXMVECTOR; - typedef DirectX::CXMVECTOR CXMVECTOR; - - inline bool ISPOWEROF2(size_t n) { return ( ((n)&((n)-1)) == 0 && (n) != 0 ); } - - // Parallel multiplication of four complex numbers, assuming real and imaginary values are stored in separate vectors. - __forceinline void XM_CALLCONV vmulComplex (_Out_ XMVECTOR& rResult, _Out_ XMVECTOR& iResult, - _In_ FXMVECTOR r1, _In_ FXMVECTOR i1, _In_ FXMVECTOR r2, _In_ GXMVECTOR i2) - { - using namespace DirectX; - // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) - XMVECTOR vi1i2 = XMVectorMultiply(i1, i2); - XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); - XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); - XMVECTOR vr2i1 = XMVectorMultiply(r2, i1); - rResult = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) - iResult = XMVectorAdd(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) - } - - __forceinline void XM_CALLCONV vmulComplex (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1, _In_ FXMVECTOR r2, _In_ FXMVECTOR i2) - { - using namespace DirectX; - // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) - XMVECTOR vi1i2 = XMVectorMultiply(i1, i2); - XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); - XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); - XMVECTOR vr2i1 = XMVectorMultiply(r2, i1); - r1 = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) - i1 = XMVectorAdd(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) - } - - //---------------------------------------------------------------------------------- - // Radix-4 decimation-in-time FFT butterfly. - // This version assumes that all four elements of the butterfly are - // adjacent in a single vector. - // - // Compute the product of the complex input vector and the - // 4-element DFT matrix: - // | 1 1 1 1 | | (r1X,i1X) | - // | 1 -j -1 j | | (r1Y,i1Y) | - // | 1 -1 1 -1 | | (r1Z,i1Z) | - // | 1 j -1 -j | | (r1W,i1W) | - // - // This matrix can be decomposed into two simpler ones to reduce the - // number of additions needed. 
The decomposed matrices look like this: - // | 1 0 1 0 | | 1 0 1 0 | - // | 0 1 0 -j | | 1 0 -1 0 | - // | 1 0 -1 0 | | 0 1 0 1 | - // | 0 1 0 j | | 0 1 0 -1 | - // - // Combine as follows: - // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) | - // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) | - // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) | - // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) | - // - // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) | - // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) | - // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) | - // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) | - //---------------------------------------------------------------------------------- - __forceinline void ButterflyDIT4_1 (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1) - { - using namespace DirectX; - - // sign constants for radix-4 butterflies - const static XMVECTORF32 vDFT4SignBits1 = { 1.0f, -1.0f, 1.0f, -1.0f }; - const static XMVECTORF32 vDFT4SignBits2 = { 1.0f, 1.0f, -1.0f, -1.0f }; - const static XMVECTORF32 vDFT4SignBits3 = { 1.0f, -1.0f, -1.0f, 1.0f }; - - // calculating Temp - // [r1X| r1X|r1Y| r1Y] + [r1Z|-r1Z|r1W|-r1W] - // [i1X| i1X|i1Y| i1Y] + [i1Z|-i1Z|i1W|-i1W] - XMVECTOR r1L = XMVectorSwizzle<0,0,1,1>( r1 ); - XMVECTOR r1H = XMVectorSwizzle<2,2,3,3>( r1 ); - - XMVECTOR i1L = XMVectorSwizzle<0,0,1,1>( i1 ); - XMVECTOR i1H = XMVectorSwizzle<2,2,3,3>( i1 ); - - XMVECTOR rTemp = XMVectorMultiplyAdd( r1H, vDFT4SignBits1, r1L ); - XMVECTOR iTemp = XMVectorMultiplyAdd( i1H, vDFT4SignBits1, i1L ); - - // calculating Result - XMVECTOR rZrWiZiW = XMVectorPermute<2,3,6,7>(rTemp,iTemp); // [rTempZ|rTempW|iTempZ|iTempW] - XMVECTOR rZiWrZiW = XMVectorSwizzle<0,3,0,3>(rZrWiZiW); // [rTempZ|iTempW|rTempZ|iTempW] - XMVECTOR iZrWiZrW = XMVectorSwizzle<2,1,2,1>(rZrWiZiW); // [rTempZ|iTempW|rTempZ|iTempW] - - // [rTempX| rTempY| rTempX| rTempY] + [rTempZ| iTempW|-rTempZ|-iTempW] - // [iTempX| iTempY| iTempX| iTempY] + // [iTempZ|-rTempW|-iTempZ| rTempW] - XMVECTOR rTempL = XMVectorSwizzle<0,1,0,1>(rTemp); - XMVECTOR iTempL = XMVectorSwizzle<0,1,0,1>(iTemp); - - r1 = XMVectorMultiplyAdd( rZiWrZiW, vDFT4SignBits2, rTempL ); - i1 = XMVectorMultiplyAdd( iZrWiZrW, vDFT4SignBits3, iTempL ); - } - - //---------------------------------------------------------------------------------- - // Radix-4 decimation-in-time FFT butterfly. - // This version assumes that elements of the butterfly are - // in different vectors, so that each vector in the input - // contains elements from four different butterflies. - // The four separate butterflies are processed in parallel. - // - // The calculations here are the same as the ones in the single-vector - // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W) - // they are done in parallel on sixteen independent complex values. 
- // There is no interdependence between the vector elements: - // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) | - // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) | - // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) | - // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) | - // - // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) | - // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) | - // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) | - // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) | - //---------------------------------------------------------------------------------- - __forceinline void ButterflyDIT4_4 (_Inout_ XMVECTOR& r0, - _Inout_ XMVECTOR& r1, - _Inout_ XMVECTOR& r2, - _Inout_ XMVECTOR& r3, - _Inout_ XMVECTOR& i0, - _Inout_ XMVECTOR& i1, - _Inout_ XMVECTOR& i2, - _Inout_ XMVECTOR& i3, - _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableReal, - _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableImaginary, - _In_ size_t uStride, - _In_ const bool fLast) - { - using namespace DirectX; - - assert(pUnityTableReal); - assert(pUnityTableImaginary); - assert((uintptr_t)pUnityTableReal % 16 == 0); - assert((uintptr_t)pUnityTableImaginary % 16 == 0); - assert(ISPOWEROF2(uStride)); - - // calculating Temp - XMVECTOR rTemp0 = XMVectorAdd(r0, r2); - XMVECTOR iTemp0 = XMVectorAdd(i0, i2); - - XMVECTOR rTemp2 = XMVectorAdd(r1, r3); - XMVECTOR iTemp2 = XMVectorAdd(i1, i3); - - XMVECTOR rTemp1 = XMVectorSubtract(r0, r2); - XMVECTOR iTemp1 = XMVectorSubtract(i0, i2); - - XMVECTOR rTemp3 = XMVectorSubtract(r1, r3); - XMVECTOR iTemp3 = XMVectorSubtract(i1, i3); - - XMVECTOR rTemp4 = XMVectorAdd(rTemp0, rTemp2); - XMVECTOR iTemp4 = XMVectorAdd(iTemp0, iTemp2); - - XMVECTOR rTemp5 = XMVectorAdd(rTemp1, iTemp3); - XMVECTOR iTemp5 = XMVectorSubtract(iTemp1, rTemp3); - - XMVECTOR rTemp6 = XMVectorSubtract(rTemp0, rTemp2); - XMVECTOR iTemp6 = XMVectorSubtract(iTemp0, iTemp2); - - XMVECTOR rTemp7 = XMVectorSubtract(rTemp1, iTemp3); - XMVECTOR iTemp7 = XMVectorAdd(iTemp1, rTemp3); - - // calculating Result - // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial - vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]); - vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]); - vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]); - - if (fLast) - { - ButterflyDIT4_1(rTemp4, iTemp4); - ButterflyDIT4_1(rTemp5, iTemp5); - ButterflyDIT4_1(rTemp6, iTemp6); - ButterflyDIT4_1(rTemp7, iTemp7); - } - - r0 = rTemp4; i0 = iTemp4; - r1 = rTemp5; i1 = iTemp5; - r2 = rTemp6; i2 = iTemp6; - r3 = rTemp7; i3 = iTemp7; - } - - //================================================================================== - // F-U-N-C-T-I-O-N-S - //================================================================================== - - //---------------------------------------------------------------------------------- - // DESCRIPTION: - // 4-sample FFT. 
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least uCount elements
-    //  pImaginary - [inout] imaginary components, must have at least uCount elements
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    __forceinline void FFT4 (_Inout_updates_(uCount) XMVECTOR* __restrict pReal,
-                             _Inout_updates_(uCount) XMVECTOR* __restrict pImaginary,
-                             _In_ const size_t uCount=1)
-    {
-        assert(pReal);
-        assert(pImaginary);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert(ISPOWEROF2(uCount));
-
-        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
-        {
-            ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  8-sample FFT.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least uCount*2 elements
-    //  pImaginary - [inout] imaginary components, must have at least uCount*2 elements
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    __forceinline void FFT8 (_Inout_updates_(uCount*2) XMVECTOR* __restrict pReal,
-                             _Inout_updates_(uCount*2) XMVECTOR* __restrict pImaginary,
-                             _In_ const size_t uCount=1)
-    {
-        using namespace DirectX;
-
-        assert(pReal);
-        assert(pImaginary);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert(ISPOWEROF2(uCount));
-
-        static const XMVECTORF32 wr1 = {  1.0f,  0.70710677f,  0.0f, -0.70710677f };
-        static const XMVECTORF32 wi1 = {  0.0f, -0.70710677f, -1.0f, -0.70710677f };
-        static const XMVECTORF32 wr2 = { -1.0f, -0.70710677f,  0.0f,  0.70710677f };
-        static const XMVECTORF32 wi2 = {  0.0f,  0.70710677f,  1.0f,  0.70710677f };
-
-        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
-        {
-            XMVECTOR* __restrict pR = pReal + uIndex*2;
-            XMVECTOR* __restrict pI = pImaginary + uIndex*2;
-
-            XMVECTOR oddsR  = XMVectorPermute<1,3,5,7>(pR[0], pR[1]);
-            XMVECTOR evensR = XMVectorPermute<0,2,4,6>(pR[0], pR[1]);
-            XMVECTOR oddsI  = XMVectorPermute<1,3,5,7>(pI[0], pI[1]);
-            XMVECTOR evensI = XMVectorPermute<0,2,4,6>(pI[0], pI[1]);
-            ButterflyDIT4_1(oddsR, oddsI);
-            ButterflyDIT4_1(evensR, evensI);
-
-            XMVECTOR r, i;
-            vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
-            pR[0] = XMVectorAdd(evensR, r);
-            pI[0] = XMVectorAdd(evensI, i);
-
-            vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
-            pR[1] = XMVectorAdd(evensR, r);
-            pI[1] = XMVectorAdd(evensI, i);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  16-sample FFT.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least uCount*4 elements
-    //  pImaginary - [inout] imaginary components, must have at least uCount*4 elements
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    __forceinline void FFT16 (_Inout_updates_(uCount*4) XMVECTOR* __restrict pReal,
-                              _Inout_updates_(uCount*4) XMVECTOR* __restrict pImaginary,
-                              _In_ const size_t uCount=1)
-    {
-        using namespace DirectX;
-
-        assert(pReal);
-        assert(pImaginary);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert(ISPOWEROF2(uCount));
-
-        static const XMVECTORF32 aUnityTableReal[4]      = { {  1.0f,  1.0f,         1.0f,             1.0f         },
-                                                             {  1.0f,  0.92387950f,  0.70710677f,      0.38268343f  },
-                                                             {  1.0f,  0.70710677f, -4.3711388e-008f, -0.70710677f  },
-                                                             {  1.0f,  0.38268343f, -0.70710677f,     -0.92387950f  } };
-        static const XMVECTORF32 aUnityTableImaginary[4] = { { -0.0f, -0.0f,         -0.0f,            -0.0f        },
-                                                             { -0.0f, -0.38268343f,  -0.70710677f,     -0.92387950f },
-                                                             { -0.0f, -0.70710677f,  -1.0f,            -0.70710677f },
-                                                             { -0.0f, -0.92387950f,  -0.70710677f,      0.38268343f } };
-
-        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
-        {
-            ButterflyDIT4_4(pReal[uIndex*4],
-                            pReal[uIndex*4 + 1],
-                            pReal[uIndex*4 + 2],
-                            pReal[uIndex*4 + 3],
-                            pImaginary[uIndex*4],
-                            pImaginary[uIndex*4 + 1],
-                            pImaginary[uIndex*4 + 2],
-                            pImaginary[uIndex*4 + 3],
-                            reinterpret_cast<const XMVECTOR*>(aUnityTableReal),
-                            reinterpret_cast<const XMVECTOR*>(aUnityTableImaginary),
-                            1, true);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  2^N-sample FFT.
-    //
-    // REMARKS:
-    //  For FFT lengths of 16 and below, call FFT16(), FFT8(), or FFT4() directly.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
-    //  pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
-    //  pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
-    //  uLength - [in] FFT length in samples, must be a power of 2 > 16
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    inline void FFT (_Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pReal,
-                     _Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pImaginary,
-                     _In_reads_(uLength*uCount) const XMVECTOR* __restrict pUnityTable,
-                     _In_ const size_t uLength,
-                     _In_ const size_t uCount=1)
-    {
-        assert(pReal);
-        assert(pImaginary);
-        assert(pUnityTable);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert((uintptr_t)pUnityTable % 16 == 0);
-        assert(uLength > 16);
-        _Analysis_assume_(uLength > 16);
-        assert(ISPOWEROF2(uLength));
-        assert(ISPOWEROF2(uCount));
-
-        const XMVECTOR* __restrict pUnityTableReal      = pUnityTable;
-        const XMVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
-        const size_t uTotal              = uCount * uLength;
-        const size_t uTotal_vectors      = uTotal >> 2;
-        const size_t uStage_vectors      = uLength >> 2;
-        const size_t uStage_vectors_mask = uStage_vectors - 1;
-        const size_t uStride             = uLength >> 4; // stride between butterfly elements
-        const size_t uStrideMask         = uStride - 1;
-        const size_t uStride2            = uStride * 2;
-        const size_t uStride3            = uStride * 3;
-        const size_t uStrideInvMask      = ~uStrideMask;
-
-        for (size_t uIndex=0; uIndex < (uTotal_vectors>>2); ++uIndex)
-        {
-            const size_t n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
-            ButterflyDIT4_4(pReal[n],
-                            pReal[n + uStride],
-                            pReal[n + uStride2],
-                            pReal[n + uStride3],
-                            pImaginary[n],
-                            pImaginary[n + uStride],
-                            pImaginary[n + uStride2],
-                            pImaginary[n + uStride3],
-                            pUnityTableReal + (n & uStage_vectors_mask),
-                            pUnityTableImaginary + (n & uStage_vectors_mask),
-                            uStride, false);
-        }
-
-        if (uLength > 16*4)
-        {
-            FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
-        }
-        else if (uLength == 16*4)
-        {
-            FFT16(pReal, pImaginary, uCount*4);
-        }
-        else if (uLength == 8*4)
-        {
-            FFT8(pReal, pImaginary, uCount*4);
-        }
-        else if (uLength == 4*4)
-        {
-            FFT4(pReal, pImaginary, uCount*4);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Initializes the unity roots lookup table used by the FFT functions.
-    //  Once initialized, the table need not be initialized again unless a
-    //  different FFT length is desired.
-    //
-    // REMARKS:
-    //  The unity tables of FFT length 16 and below are hard coded into the
-    //  respective FFT functions and so need not be initialized.
-    //
-    // PARAMETERS:
-    //  pUnityTable - [out] unity table, receives the unity roots lookup table, must have at least uLength elements
-    //  uLength - [in] FFT length in frames, must be a power of 2 > 16
-    //----------------------------------------------------------------------------------
-    inline void FFTInitializeUnityTable (_Out_writes_(uLength) XMVECTOR* __restrict pUnityTable, _In_ size_t uLength)
-    {
-        assert(pUnityTable);
-        assert(uLength > 16);
-        _Analysis_assume_(uLength > 16);
-        assert(ISPOWEROF2(uLength));
-
-        float* __restrict pfUnityTable = reinterpret_cast<float* __restrict>(pUnityTable);
-
-        // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16, ... > 16
-        do
-        {
-            float flStep = 6.283185307f / uLength; // 2PI / FFT length
-            uLength >>= 2;
-
-            // pUnityTable[0 to uLength*4-1] contains real components for the current FFT length
-            // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for the current FFT length
-            for (size_t i=0; i<4; ++i)
-            {
-                for (size_t j=0; j<uLength; ++j)
-                {
-                    size_t uIndex = (i*uLength) + j;
-                    pfUnityTable[uIndex]             =  cosf(float(i)*float(j)*flStep); // real component
-                    pfUnityTable[uIndex + uLength*4] = -sinf(float(i)*float(j)*flStep); // imaginary component
-                }
-            }
-            pfUnityTable += uLength*8;
-        }
-        while (uLength > 16);
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  The FFT functions generate output in bit-reversed order.
-    //  Use this function to re-arrange them into order of increasing frequency.
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<<uLog2Length)/4 elements
-    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (1<<uLog2Length)/4 elements
-    //  uLog2Length - [in] LOG (base 2) of FFT length in samples, must be >= 2
-    //----------------------------------------------------------------------------------
-    inline void FFTUnswizzle (_Out_writes_((1<<uLog2Length)/4) XMVECTOR* __restrict pOutput,
-                              _In_reads_((1<<uLog2Length)/4) const XMVECTOR* __restrict pInput,
-                              _In_ const size_t uLog2Length)
-    {
-        assert(pOutput);
-        assert(pInput);
-        assert(uLog2Length >= 2);
-        _Analysis_assume_(uLog2Length >= 2);
-
-        float* __restrict pfOutput      = (float* __restrict)pOutput;
-        const float* __restrict pfInput = (const float* __restrict)pInput;
-        const size_t uLength = size_t(1) << uLog2Length;
-
-        if ((uLog2Length & 0x1) == 0)
-        {
-            // even powers of two
-            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
-            {
-                size_t n = uIndex;
-                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
-                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
-                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
-                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
-                n >>= (32 - uLog2Length);
-                pfOutput[n] = pfInput[uIndex];
-            }
-        }
-        else
-        {
-            // odd powers of two
-            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
-            {
-                size_t n = (uIndex>>3);
-                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
-                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
-                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
-                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
-                n >>= (32 - (uLog2Length-3));
-                n |= ((uIndex & 0x7) << (uLog2Length - 3));
-                pfOutput[n] = pfInput[uIndex];
-            }
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Convert complex components to polar form.
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
-    //  pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
-    //  pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
-    //  uLength - [in] FFT length in samples, must be a power of 2 >= 4
-    //----------------------------------------------------------------------------------
-#pragma warning(suppress: 6101)
-    inline void FFTPolar (_Out_writes_(uLength/4) XMVECTOR* __restrict pOutput,
-                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputReal,
-                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputImaginary,
-                          _In_ const size_t uLength)
-    {
-        using namespace DirectX;
-
-        assert(pOutput);
-        assert(pInputReal);
-        assert(pInputImaginary);
-        assert(uLength >= 4);
-        _Analysis_assume_(uLength >= 4);
-        assert(ISPOWEROF2(uLength));
-
-        float flOneOverLength = 1.0f / uLength;
-
-        // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
-        XMVECTOR vOneOverLength = XMVectorReplicate( flOneOverLength );
-
-        for (size_t uIndex=0; uIndex < (uLength>>2); ++uIndex)
-        {
-            XMVECTOR vReal      = XMVectorMultiply(pInputReal[uIndex], vOneOverLength);
-            XMVECTOR vImaginary = XMVectorMultiply(pInputImaginary[uIndex], vOneOverLength);
-            XMVECTOR vRR        = XMVectorMultiply(vReal, vReal);
-            XMVECTOR vII        = XMVectorMultiply(vImaginary, vImaginary);
-            XMVECTOR vRRplusII  = XMVectorAdd(vRR, vII);
-            XMVECTOR vTotal     = XMVectorSqrt(vRRplusII);
-            pOutput[uIndex]     = XMVectorAdd(vTotal, vTotal);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Deinterleaves audio samples.
-    //
-    // REMARKS:
-    //  For example, audio of the form [LRLRLR] becomes [LLLRRR].
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  uChannelCount - [in] number of channels, must be > 1
-    //  uFrameCount - [in] number of frames of valid data, must be > 0
-    //----------------------------------------------------------------------------------
-    inline void Deinterleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
-                              _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
-                              _In_ const size_t uChannelCount,
-                              _In_ const size_t uFrameCount)
-    {
-        assert(pOutput);
-        assert(pInput);
-        assert(uChannelCount > 1);
-        assert(uFrameCount > 0);
-
-        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
-        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
-            {
-                pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
-            }
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Interleaves audio samples.
-    //
-    // REMARKS:
-    //  For example, audio of the form [LLLRRR] becomes [LRLRLR].
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  uChannelCount - [in] number of channels, must be > 1
-    //  uFrameCount - [in] number of frames of valid data, must be > 0
-    //----------------------------------------------------------------------------------
-    inline void Interleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
-                            _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
-                            _In_ const size_t uChannelCount,
-                            _In_ const size_t uFrameCount)
-    {
-        assert(pOutput);
-        assert(pInput);
-        assert(uChannelCount > 1);
-        assert(uFrameCount > 0);
-
-        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
-        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
-            {
-                pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
-            }
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  This function applies a 2^N-sample FFT and unswizzles the result such
-    //  that the samples are in order of increasing frequency.
-    //  Audio is first deinterleaved if multichannel.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pImaginary - [out] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
-    //  uChannelCount - [in] number of channels, must be within [1, 6]
-    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
-    //----------------------------------------------------------------------------------
-    inline void FFTInterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
-                                _Out_writes_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pImaginary,
-                                _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
-                                _In_ const size_t uChannelCount,
-                                _In_ const size_t uLog2Length)
-    {
-        assert(pReal);
-        assert(pImaginary);
-        assert(pUnityTable);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert((uintptr_t)pUnityTable % 16 == 0);
-        assert(uChannelCount > 0 && uChannelCount <= 6);
-        assert(uLog2Length >= 2 && uLog2Length <= 9);
-
-        XMVECTOR vRealTemp[768];
-        XMVECTOR vImaginaryTemp[768];
-        const size_t uLength = size_t(1) << uLog2Length;
-
-        if (uChannelCount > 1)
-        {
-            Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
-        }
-        else
-        {
-            memcpy_s(vRealTemp, sizeof(vRealTemp), pReal, (uLength>>2)*sizeof(XMVECTOR));
-        }
-
-        memset( vImaginaryTemp, 0, (uChannelCount*(uLength>>2)) * sizeof(XMVECTOR) );
-
-        if (uLength > 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
-            }
-        }
-        else if (uLength == 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 8)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 4)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
-            FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  This function applies a 2^N-sample inverse FFT.
-    //  Audio is interleaved if multichannel.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pImaginary - [in] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
-    //  uChannelCount - [in] number of channels, must be > 0
-    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
-    //----------------------------------------------------------------------------------
-    inline void IFFTDeinterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
-                                   _In_reads_(((1<<uLog2Length)*uChannelCount)/4) const XMVECTOR* __restrict pImaginary,
-                                   _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
-                                   _In_ const size_t uChannelCount,
-                                   _In_ const size_t uLog2Length)
-    {
-        using namespace DirectX;
-
-        assert(pReal);
-        assert(pImaginary);
-        assert(pUnityTable);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert((uintptr_t)pUnityTable % 16 == 0);
-        assert(uChannelCount > 0 && uChannelCount <= 6);
-        _Analysis_assume_(uChannelCount > 0 && uChannelCount <= 6);
-        assert(uLog2Length >= 2 && uLog2Length <= 9);
-        _Analysis_assume_(uLog2Length >= 2 && uLog2Length <= 9);
-
-        XMVECTOR vRealTemp[768]      = { 0 };
-        XMVECTOR vImaginaryTemp[768] = { 0 };
-
-        const size_t uLength = size_t(1) << uLog2Length;
-
-        const XMVECTOR vRnp = XMVectorReplicate(1.0f/uLength);
-        const XMVECTOR vRnm = XMVectorReplicate(-1.0f/uLength);
-        for (size_t u=0; u < uChannelCount*(uLength>>2); u++)
-        {
-            vRealTemp[u]      = XMVectorMultiply(pReal[u], vRnp);
-            vImaginaryTemp[u] = XMVectorMultiply(pImaginary[u], vRnm);
-        }
-
-        if (uLength > 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
-            }
-        }
-        else if (uLength == 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 8)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 4)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
-        }
-
-        if (uChannelCount > 1)
-        {
-            Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
-        }
-        else
-        {
-            memcpy_s(pReal, uLength*uChannelCount*sizeof(float), vImaginaryTemp, (uLength>>2)*sizeof(XMVECTOR));
-        }
-    }
-
-}; // namespace XDSP
-
-#pragma warning(pop)
+//--------------------------------------------------------------------------------------
+// File: XDSP.h
+//
+// DirectXMath based Digital Signal Processing (DSP) functions for audio,
+// primarily Fast Fourier Transform (FFT)
+//
+// All buffer parameters must be 16-byte aligned
+//
+// All FFT functions support only single-precision floating-point audio
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615557
+//--------------------------------------------------------------------------------------
+
+#pragma once
+
+#include <DirectXMath.h>
+#include <math.h>
+
+#pragma warning(push)
+#pragma warning(disable : 4005 4668)
+#include <stdint.h>
+#pragma warning(pop)
+
+#pragma warning(push)
+#pragma warning(disable: 4328 4640 6001 6262)
+
+namespace XDSP
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+    typedef DirectX::XMVECTOR  XMVECTOR;
+    typedef DirectX::FXMVECTOR FXMVECTOR;
+    typedef DirectX::GXMVECTOR GXMVECTOR;
+    typedef DirectX::CXMVECTOR CXMVECTOR;
+
+    inline bool ISPOWEROF2(size_t n) { return ( ((n)&((n)-1)) == 0 && (n) != 0 ); }
+
+    // Parallel multiplication of four complex numbers, assuming real and imaginary values are stored in separate vectors.
+    __forceinline void XM_CALLCONV vmulComplex (_Out_ XMVECTOR& rResult, _Out_ XMVECTOR& iResult,
+                                                _In_ FXMVECTOR r1, _In_ FXMVECTOR i1, _In_ FXMVECTOR r2, _In_ GXMVECTOR i2)
+    {
+        using namespace DirectX;
+        // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
+        XMVECTOR vi1i2 = XMVectorMultiply(i1, i2);
+        XMVECTOR vr1r2 = XMVectorMultiply(r1, r2);
+        XMVECTOR vr1i2 = XMVectorMultiply(r1, i2);
+        XMVECTOR vr2i1 = XMVectorMultiply(r2, i1);
+        rResult = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
+        iResult = XMVectorAdd(vr1i2, vr2i1);      // imaginary: (r1*i2 + r2*i1)
+    }
+
+    __forceinline void XM_CALLCONV vmulComplex (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1, _In_ FXMVECTOR r2, _In_ FXMVECTOR i2)
+    {
+        using namespace DirectX;
+        // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
+        XMVECTOR vi1i2 = XMVectorMultiply(i1, i2);
+        XMVECTOR vr1r2 = XMVectorMultiply(r1, r2);
+        XMVECTOR vr1i2 = XMVectorMultiply(r1, i2);
+        XMVECTOR vr2i1 = XMVectorMultiply(r2, i1);
+        r1 = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
+        i1 = XMVectorAdd(vr1i2, vr2i1);      // imaginary: (r1*i2 + r2*i1)
+    }
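vmulComplex is a plain structure-of-arrays complex multiply. A minimal check of the identity (1+2i)(3+4i) = -5+10i across all four lanes, as a sketch assuming XDSP.h is on the include path (function and variable names are illustrative):

    #include <stdio.h>
    #include "XDSP.h"

    void TestVmulComplex()
    {
        using namespace DirectX;
        XMVECTOR r1 = XMVectorReplicate(1.0f), i1 = XMVectorReplicate(2.0f); // 1+2i in every lane
        XMVECTOR r2 = XMVectorReplicate(3.0f), i2 = XMVectorReplicate(4.0f); // 3+4i in every lane
        XMVECTOR rOut, iOut;
        XDSP::vmulComplex(rOut, iOut, r1, i1, r2, i2);
        // (1*3 - 2*4, 1*4 + 3*2) = (-5, 10)
        printf("%g %+gi\n", XMVectorGetX(rOut), XMVectorGetX(iOut)); // prints: -5 +10i
    }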
+    //----------------------------------------------------------------------------------
+    // Radix-4 decimation-in-time FFT butterfly.
+    // This version assumes that all four elements of the butterfly are
+    // adjacent in a single vector.
+    //
+    // Compute the product of the complex input vector and the
+    // 4-element DFT matrix:
+    //  | 1  1  1  1 |   | (r1X,i1X) |
+    //  | 1 -j -1  j |   | (r1Y,i1Y) |
+    //  | 1 -1  1 -1 |   | (r1Z,i1Z) |
+    //  | 1  j -1 -j |   | (r1W,i1W) |
+    //
+    // This matrix can be decomposed into two simpler ones to reduce the
+    // number of additions needed. The decomposed matrices look like this:
+    //  | 1  0  1  0 |   | 1  0  1  0 |
+    //  | 0  1  0 -j |   | 1  0 -1  0 |
+    //  | 1  0 -1  0 |   | 0  1  0  1 |
+    //  | 0  1  0  j |   | 0  1  0 -1 |
+    //
+    // Combine as follows:
+    //          | 1  0  1  0 |   | (r1X,i1X) |   | (r1X + r1Z, i1X + i1Z) |
+    //  Temp  = | 1  0 -1  0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) |
+    //          | 0  1  0  1 |   | (r1Z,i1Z) |   | (r1Y + r1W, i1Y + i1W) |
+    //          | 0  1  0 -1 |   | (r1W,i1W) |   | (r1Y - r1W, i1Y - i1W) |
+    //
+    //            | 1  0  1  0 |   | (rTempX,iTempX) |   | (rTempX + rTempZ, iTempX + iTempZ) |
+    //  Result  = | 0  1  0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
+    //            | 1  0 -1  0 |   | (rTempZ,iTempZ) |   | (rTempX - rTempZ, iTempX - iTempZ) |
+    //            | 0  1  0  j |   | (rTempW,iTempW) |   | (rTempY - iTempW, iTempY + rTempW) |
+    //----------------------------------------------------------------------------------
+    __forceinline void ButterflyDIT4_1 (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1)
+    {
+        using namespace DirectX;
+
+        // sign constants for radix-4 butterflies
+        const static XMVECTORF32 vDFT4SignBits1 = {  1.0f, -1.0f,  1.0f, -1.0f };
+        const static XMVECTORF32 vDFT4SignBits2 = {  1.0f,  1.0f, -1.0f, -1.0f };
+        const static XMVECTORF32 vDFT4SignBits3 = {  1.0f, -1.0f, -1.0f,  1.0f };
+
+        // calculating Temp
+        // [r1X| r1X|r1Y| r1Y] + [r1Z|-r1Z|r1W|-r1W]
+        // [i1X| i1X|i1Y| i1Y] + [i1Z|-i1Z|i1W|-i1W]
+        XMVECTOR r1L = XMVectorSwizzle<0,0,1,1>( r1 );
+        XMVECTOR r1H = XMVectorSwizzle<2,2,3,3>( r1 );
+
+        XMVECTOR i1L = XMVectorSwizzle<0,0,1,1>( i1 );
+        XMVECTOR i1H = XMVectorSwizzle<2,2,3,3>( i1 );
+
+        XMVECTOR rTemp = XMVectorMultiplyAdd( r1H, vDFT4SignBits1, r1L );
+        XMVECTOR iTemp = XMVectorMultiplyAdd( i1H, vDFT4SignBits1, i1L );
+
+        // calculating Result
+        XMVECTOR rZrWiZiW = XMVectorPermute<2,3,6,7>(rTemp,iTemp);   // [rTempZ|rTempW|iTempZ|iTempW]
+        XMVECTOR rZiWrZiW = XMVectorSwizzle<0,3,0,3>(rZrWiZiW);      // [rTempZ|iTempW|rTempZ|iTempW]
+        XMVECTOR iZrWiZrW = XMVectorSwizzle<2,1,2,1>(rZrWiZiW);      // [iTempZ|rTempW|iTempZ|rTempW]
+
+        // [rTempX| rTempY| rTempX| rTempY] + [rTempZ| iTempW|-rTempZ|-iTempW]
+        // [iTempX| iTempY| iTempX| iTempY] + [iTempZ|-rTempW|-iTempZ| rTempW]
+        XMVECTOR rTempL = XMVectorSwizzle<0,1,0,1>(rTemp);
+        XMVECTOR iTempL = XMVectorSwizzle<0,1,0,1>(iTemp);
+
+        r1 = XMVectorMultiplyAdd( rZiWrZiW, vDFT4SignBits2, rTempL );
+        i1 = XMVectorMultiplyAdd( iZrWiZrW, vDFT4SignBits3, iTempL );
+    }
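Because ButterflyDIT4_1 is exactly a 4-point DFT, it can be sanity-checked against hand-computed values: for the real input [1, 2, 3, 4] the DFT is [10, -2+2j, -2, -2-2j], and a single butterfly produces those values in natural order. A sketch (illustrative names, assumes XDSP.h is included):

    #include <stdio.h>
    #include "XDSP.h"

    void TestButterfly4()
    {
        using namespace DirectX;
        XMVECTOR r = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f); // real parts x0..x3
        XMVECTOR i = XMVectorZero();                      // imaginary parts
        XDSP::ButterflyDIT4_1(r, i);
        // Expect r = [10, -2, -2, -2] and i = [0, 2, 0, -2],
        // i.e. X = [10, -2+2j, -2, -2-2j]
        for (size_t k = 0; k < 4; ++k)
            printf("X%zu = %f %+fj\n", k, XMVectorGetByIndex(r, k), XMVectorGetByIndex(i, k));
    }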
+    //----------------------------------------------------------------------------------
+    // Radix-4 decimation-in-time FFT butterfly.
+    // This version assumes that elements of the butterfly are
+    // in different vectors, so that each vector in the input
+    // contains elements from four different butterflies.
+    // The four separate butterflies are processed in parallel.
+    //
+    // The calculations here are the same as the ones in the single-vector
+    // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
+    // they are done in parallel on sixteen independent complex values.
+    // There is no interdependence between the vector elements:
+    //  | 1  0  1  0 |   | (rIn0,iIn0) |          | (rIn0 + rIn2, iIn0 + iIn2) |
+    //  | 1  0 -1  0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) |
+    //  | 0  1  0  1 |   | (rIn2,iIn2) |          | (rIn1 + rIn3, iIn1 + iIn3) |
+    //  | 0  1  0 -1 |   | (rIn3,iIn3) |          | (rIn1 - rIn3, iIn1 - iIn3) |
+    //
+    //            | 1  0  1  0 |   | (rTemp0,iTemp0) |   | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
+    //  Result  = | 0  1  0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
+    //            | 1  0 -1  0 |   | (rTemp2,iTemp2) |   | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
+    //            | 0  1  0  j |   | (rTemp3,iTemp3) |   | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
+    //----------------------------------------------------------------------------------
+    __forceinline void ButterflyDIT4_4 (_Inout_ XMVECTOR& r0,
+                                        _Inout_ XMVECTOR& r1,
+                                        _Inout_ XMVECTOR& r2,
+                                        _Inout_ XMVECTOR& r3,
+                                        _Inout_ XMVECTOR& i0,
+                                        _Inout_ XMVECTOR& i1,
+                                        _Inout_ XMVECTOR& i2,
+                                        _Inout_ XMVECTOR& i3,
+                                        _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableReal,
+                                        _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableImaginary,
+                                        _In_ size_t uStride,
+                                        _In_ const bool fLast)
+    {
+        using namespace DirectX;
+
+        assert(pUnityTableReal);
+        assert(pUnityTableImaginary);
+        assert((uintptr_t)pUnityTableReal % 16 == 0);
+        assert((uintptr_t)pUnityTableImaginary % 16 == 0);
+        assert(ISPOWEROF2(uStride));
+
+        // calculating Temp
+        XMVECTOR rTemp0 = XMVectorAdd(r0, r2);
+        XMVECTOR iTemp0 = XMVectorAdd(i0, i2);
+
+        XMVECTOR rTemp2 = XMVectorAdd(r1, r3);
+        XMVECTOR iTemp2 = XMVectorAdd(i1, i3);
+
+        XMVECTOR rTemp1 = XMVectorSubtract(r0, r2);
+        XMVECTOR iTemp1 = XMVectorSubtract(i0, i2);
+
+        XMVECTOR rTemp3 = XMVectorSubtract(r1, r3);
+        XMVECTOR iTemp3 = XMVectorSubtract(i1, i3);
+
+        XMVECTOR rTemp4 = XMVectorAdd(rTemp0, rTemp2);
+        XMVECTOR iTemp4 = XMVectorAdd(iTemp0, iTemp2);
+
+        XMVECTOR rTemp5 = XMVectorAdd(rTemp1, iTemp3);
+        XMVECTOR iTemp5 = XMVectorSubtract(iTemp1, rTemp3);
+
+        XMVECTOR rTemp6 = XMVectorSubtract(rTemp0, rTemp2);
+        XMVECTOR iTemp6 = XMVectorSubtract(iTemp0, iTemp2);
+
+        XMVECTOR rTemp7 = XMVectorSubtract(rTemp1, iTemp3);
+        XMVECTOR iTemp7 = XMVectorAdd(iTemp1, rTemp3);
+
+        // calculating Result
+        // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
+        vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride],   pUnityTableImaginary[uStride]);
+        vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
+        vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
+
+        if (fLast)
+        {
+            ButterflyDIT4_1(rTemp4, iTemp4);
+            ButterflyDIT4_1(rTemp5, iTemp5);
+            ButterflyDIT4_1(rTemp6, iTemp6);
+            ButterflyDIT4_1(rTemp7, iTemp7);
+        }
+
+        r0 = rTemp4; i0 = iTemp4;
+        r1 = rTemp5; i1 = iTemp5;
+        r2 = rTemp6; i2 = iTemp6;
+        r3 = rTemp7; i3 = iTemp7;
+    }
+
+    //==================================================================================
+    // F-U-N-C-T-I-O-N-S
+    //==================================================================================
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  4-sample FFT.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least uCount elements
+    //  pImaginary - [inout] imaginary components, must have at least uCount elements
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    __forceinline void FFT4 (_Inout_updates_(uCount) XMVECTOR* __restrict pReal,
+                             _Inout_updates_(uCount) XMVECTOR* __restrict pImaginary,
+                             _In_ const size_t uCount=1)
+    {
+        assert(pReal);
+        assert(pImaginary);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert(ISPOWEROF2(uCount));
+
+        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
+        {
+            ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  8-sample FFT.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least uCount*2 elements
+    //  pImaginary - [inout] imaginary components, must have at least uCount*2 elements
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    __forceinline void FFT8 (_Inout_updates_(uCount*2) XMVECTOR* __restrict pReal,
+                             _Inout_updates_(uCount*2) XMVECTOR* __restrict pImaginary,
+                             _In_ const size_t uCount=1)
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert(ISPOWEROF2(uCount));
+
+        static const XMVECTORF32 wr1 = {  1.0f,  0.70710677f,  0.0f, -0.70710677f };
+        static const XMVECTORF32 wi1 = {  0.0f, -0.70710677f, -1.0f, -0.70710677f };
+        static const XMVECTORF32 wr2 = { -1.0f, -0.70710677f,  0.0f,  0.70710677f };
+        static const XMVECTORF32 wi2 = {  0.0f,  0.70710677f,  1.0f,  0.70710677f };
+
+        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
+        {
+            XMVECTOR* __restrict pR = pReal + uIndex*2;
+            XMVECTOR* __restrict pI = pImaginary + uIndex*2;
+
+            XMVECTOR oddsR  = XMVectorPermute<1,3,5,7>(pR[0], pR[1]);
+            XMVECTOR evensR = XMVectorPermute<0,2,4,6>(pR[0], pR[1]);
+            XMVECTOR oddsI  = XMVectorPermute<1,3,5,7>(pI[0], pI[1]);
+            XMVECTOR evensI = XMVectorPermute<0,2,4,6>(pI[0], pI[1]);
+            ButterflyDIT4_1(oddsR, oddsI);
+            ButterflyDIT4_1(evensR, evensI);
+
+            XMVECTOR r, i;
+            vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
+            pR[0] = XMVectorAdd(evensR, r);
+            pI[0] = XMVectorAdd(evensI, i);
+
+            vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
+            pR[1] = XMVectorAdd(evensR, r);
+            pI[1] = XMVectorAdd(evensI, i);
+        }
+    }
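Each 8-sample block occupies two XMVECTORs (samples 0-3 and 4-7), and the result comes out in bit-reversed order. A single-block usage sketch (buffer names illustrative) that restores natural frequency order with FFTUnswizzle, defined later in this header:

    #include "XDSP.h"

    void RunFFT8()
    {
        using namespace DirectX;
        XMVECTOR real[2] = { XMVectorSet(0.f, 1.f, 2.f, 3.f), XMVectorSet(4.f, 5.f, 6.f, 7.f) };
        XMVECTOR imag[2] = { XMVectorZero(), XMVectorZero() };
        XDSP::FFT8(real, imag);               // in-place, output bit-reversed
        XMVECTOR realOut[2], imagOut[2];
        XDSP::FFTUnswizzle(realOut, real, 3); // uLog2Length = 3 -> increasing frequency order
        XDSP::FFTUnswizzle(imagOut, imag, 3);
    }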
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  16-sample FFT.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least uCount*4 elements
+    //  pImaginary - [inout] imaginary components, must have at least uCount*4 elements
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    __forceinline void FFT16 (_Inout_updates_(uCount*4) XMVECTOR* __restrict pReal,
+                              _Inout_updates_(uCount*4) XMVECTOR* __restrict pImaginary,
+                              _In_ const size_t uCount=1)
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert(ISPOWEROF2(uCount));
+
+        static const XMVECTORF32 aUnityTableReal[4]      = { {  1.0f,  1.0f,         1.0f,             1.0f         },
+                                                             {  1.0f,  0.92387950f,  0.70710677f,      0.38268343f  },
+                                                             {  1.0f,  0.70710677f, -4.3711388e-008f, -0.70710677f  },
+                                                             {  1.0f,  0.38268343f, -0.70710677f,     -0.92387950f  } };
+        static const XMVECTORF32 aUnityTableImaginary[4] = { { -0.0f, -0.0f,         -0.0f,            -0.0f        },
+                                                             { -0.0f, -0.38268343f,  -0.70710677f,     -0.92387950f },
+                                                             { -0.0f, -0.70710677f,  -1.0f,            -0.70710677f },
+                                                             { -0.0f, -0.92387950f,  -0.70710677f,      0.38268343f } };
+
+        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
+        {
+            ButterflyDIT4_4(pReal[uIndex*4],
+                            pReal[uIndex*4 + 1],
+                            pReal[uIndex*4 + 2],
+                            pReal[uIndex*4 + 3],
+                            pImaginary[uIndex*4],
+                            pImaginary[uIndex*4 + 1],
+                            pImaginary[uIndex*4 + 2],
+                            pImaginary[uIndex*4 + 3],
+                            reinterpret_cast<const XMVECTOR*>(aUnityTableReal),
+                            reinterpret_cast<const XMVECTOR*>(aUnityTableImaginary),
+                            1, true);
+        }
+    }
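The hard-coded tables above are just the 16-point twiddle factors cos(i*j*2*pi/16) and -sin(i*j*2*pi/16) for i, j in [0, 4). A quick way to see this (illustrative sketch):

    #include <math.h>
    #include <stdio.h>

    void PrintFFT16Twiddles()
    {
        const float flStep = 6.283185307f / 16.0f; // 2PI / 16, as in FFTInitializeUnityTable below
        for (int i = 0; i < 4; ++i)
        {
            for (int j = 0; j < 4; ++j)
                printf("(% .8f, % .8f) ", cosf(float(i)*float(j)*flStep), -sinf(float(i)*float(j)*flStep));
            printf("\n"); // row i reproduces aUnityTableReal[i] / aUnityTableImaginary[i]
        }
    }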
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  2^N-sample FFT.
+    //
+    // REMARKS:
+    //  For FFT lengths of 16 and below, call FFT16(), FFT8(), or FFT4() directly.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
+    //  pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
+    //  pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
+    //  uLength - [in] FFT length in samples, must be a power of 2 > 16
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    inline void FFT (_Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pReal,
+                     _Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pImaginary,
+                     _In_reads_(uLength*uCount) const XMVECTOR* __restrict pUnityTable,
+                     _In_ const size_t uLength,
+                     _In_ const size_t uCount=1)
+    {
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert((uintptr_t)pUnityTable % 16 == 0);
+        assert(uLength > 16);
+        _Analysis_assume_(uLength > 16);
+        assert(ISPOWEROF2(uLength));
+        assert(ISPOWEROF2(uCount));
+
+        const XMVECTOR* __restrict pUnityTableReal      = pUnityTable;
+        const XMVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
+        const size_t uTotal              = uCount * uLength;
+        const size_t uTotal_vectors      = uTotal >> 2;
+        const size_t uStage_vectors      = uLength >> 2;
+        const size_t uStage_vectors_mask = uStage_vectors - 1;
+        const size_t uStride             = uLength >> 4; // stride between butterfly elements
+        const size_t uStrideMask         = uStride - 1;
+        const size_t uStride2            = uStride * 2;
+        const size_t uStride3            = uStride * 3;
+        const size_t uStrideInvMask      = ~uStrideMask;
+
+        for (size_t uIndex=0; uIndex < (uTotal_vectors>>2); ++uIndex)
+        {
+            const size_t n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
+            ButterflyDIT4_4(pReal[n],
+                            pReal[n + uStride],
+                            pReal[n + uStride2],
+                            pReal[n + uStride3],
+                            pImaginary[n],
+                            pImaginary[n + uStride],
+                            pImaginary[n + uStride2],
+                            pImaginary[n + uStride3],
+                            pUnityTableReal + (n & uStage_vectors_mask),
+                            pUnityTableImaginary + (n & uStage_vectors_mask),
+                            uStride, false);
+        }
+
+        if (uLength > 16*4)
+        {
+            FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
+        }
+        else if (uLength == 16*4)
+        {
+            FFT16(pReal, pImaginary, uCount*4);
+        }
+        else if (uLength == 8*4)
+        {
+            FFT8(pReal, pImaginary, uCount*4);
+        }
+        else if (uLength == 4*4)
+        {
+            FFT4(pReal, pImaginary, uCount*4);
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Initializes the unity roots lookup table used by the FFT functions.
+    //  Once initialized, the table need not be initialized again unless a
+    //  different FFT length is desired.
+    //
+    // REMARKS:
+    //  The unity tables of FFT length 16 and below are hard coded into the
+    //  respective FFT functions and so need not be initialized.
+    //
+    // PARAMETERS:
+    //  pUnityTable - [out] unity table, receives the unity roots lookup table, must have at least uLength elements
+    //  uLength - [in] FFT length in frames, must be a power of 2 > 16
+    //----------------------------------------------------------------------------------
+    inline void FFTInitializeUnityTable (_Out_writes_(uLength) XMVECTOR* __restrict pUnityTable, _In_ size_t uLength)
+    {
+        assert(pUnityTable);
+        assert(uLength > 16);
+        _Analysis_assume_(uLength > 16);
+        assert(ISPOWEROF2(uLength));
+
+        float* __restrict pfUnityTable = reinterpret_cast<float* __restrict>(pUnityTable);
+
+        // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16, ... > 16
+        do
+        {
+            float flStep = 6.283185307f / uLength; // 2PI / FFT length
+            uLength >>= 2;
+
+            // pUnityTable[0 to uLength*4-1] contains real components for the current FFT length
+            // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for the current FFT length
+            for (size_t i=0; i<4; ++i)
+            {
+                for (size_t j=0; j<uLength; ++j)
+                {
+                    size_t uIndex = (i*uLength) + j;
+                    pfUnityTable[uIndex]             =  cosf(float(i)*float(j)*flStep); // real component
+                    pfUnityTable[uIndex + uLength*4] = -sinf(float(i)*float(j)*flStep); // imaginary component
+                }
+            }
+            pfUnityTable += uLength*8;
+        }
+        while (uLength > 16);
+    }
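FFTInitializeUnityTable and FFT combine as follows for a single 64-point transform; the buffer sizes follow the parameter docs above (names illustrative, input loading elided):

    #include "XDSP.h"

    void RunFFT64()
    {
        using namespace DirectX;
        static XMVECTOR unityTable[64];     // uLength XMVECTOR elements
        static XMVECTOR real[16], imag[16]; // uLength/4 vectors each; XMVECTOR arrays are 16-byte aligned
        // ... load 64 samples into real[], zero imag[] ...
        XDSP::FFTInitializeUnityTable(unityTable, 64);
        XDSP::FFT(real, imag, unityTable, 64); // output is bit-reversed; see FFTUnswizzle below
    }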
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  The FFT functions generate output in bit-reversed order.
+    //  Use this function to re-arrange them into order of increasing frequency.
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<<uLog2Length)/4 elements
+    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (1<<uLog2Length)/4 elements
+    //  uLog2Length - [in] LOG (base 2) of FFT length in samples, must be >= 2
+    //----------------------------------------------------------------------------------
+    inline void FFTUnswizzle (_Out_writes_((1<<uLog2Length)/4) XMVECTOR* __restrict pOutput,
+                              _In_reads_((1<<uLog2Length)/4) const XMVECTOR* __restrict pInput,
+                              _In_ const size_t uLog2Length)
+    {
+        assert(pOutput);
+        assert(pInput);
+        assert(uLog2Length >= 2);
+        _Analysis_assume_(uLog2Length >= 2);
+
+        float* __restrict pfOutput      = (float* __restrict)pOutput;
+        const float* __restrict pfInput = (const float* __restrict)pInput;
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        if ((uLog2Length & 0x1) == 0)
+        {
+            // even powers of two
+            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
+            {
+                size_t n = uIndex;
+                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
+                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
+                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
+                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
+                n >>= (32 - uLog2Length);
+                pfOutput[n] = pfInput[uIndex];
+            }
+        }
+        else
+        {
+            // odd powers of two
+            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
+            {
+                size_t n = (uIndex>>3);
+                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
+                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
+                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
+                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
+                n >>= (32 - (uLog2Length-3));
+                n |= ((uIndex & 0x7) << (uLog2Length - 3));
+                pfOutput[n] = pfInput[uIndex];
+            }
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Convert complex components to polar form.
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
+    //  pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
+    //  pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
+    //  uLength - [in] FFT length in samples, must be a power of 2 >= 4
+    //----------------------------------------------------------------------------------
+#pragma warning(suppress: 6101)
+    inline void FFTPolar (_Out_writes_(uLength/4) XMVECTOR* __restrict pOutput,
+                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputReal,
+                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputImaginary,
+                          _In_ const size_t uLength)
+    {
+        using namespace DirectX;
+
+        assert(pOutput);
+        assert(pInputReal);
+        assert(pInputImaginary);
+        assert(uLength >= 4);
+        _Analysis_assume_(uLength >= 4);
+        assert(ISPOWEROF2(uLength));
+
+        float flOneOverLength = 1.0f / uLength;
+
+        // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
+        XMVECTOR vOneOverLength = XMVectorReplicate( flOneOverLength );
+
+        for (size_t uIndex=0; uIndex < (uLength>>2); ++uIndex)
+        {
+            XMVECTOR vReal      = XMVectorMultiply(pInputReal[uIndex], vOneOverLength);
+            XMVECTOR vImaginary = XMVectorMultiply(pInputImaginary[uIndex], vOneOverLength);
+            XMVECTOR vRR        = XMVectorMultiply(vReal, vReal);
+            XMVECTOR vII        = XMVectorMultiply(vImaginary, vImaginary);
+            XMVECTOR vRRplusII  = XMVectorAdd(vRR, vII);
+            XMVECTOR vTotal     = XMVectorSqrt(vRRplusII);
+            pOutput[uIndex]     = XMVectorAdd(vTotal, vTotal);
+        }
+    }
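The final doubling in FFTPolar means a sinusoid's bin reads out its amplitude: a tone A*cos(2*pi*k*n/N) contributes magnitude A*N/2 at bins +/-k, so sqrt(r^2+i^2)/N * 2 = A. A 16-sample check (illustrative names, assumes XDSP.h is included):

    #include <math.h>
    #include <string.h>
    #include <stdio.h>
    #include "XDSP.h"

    void CheckPolarScaling()
    {
        using namespace DirectX;
        static XMVECTOR re[4], im[4], mag[4]; // 16 samples
        float* pf = reinterpret_cast<float*>(re);
        for (int n = 0; n < 16; ++n)
            pf[n] = 0.5f * cosf(6.283185307f * 2.0f * float(n) / 16.0f); // amplitude 0.5, bin 2
        memset(im, 0, sizeof(im));
        XDSP::FFT16(re, im);
        XDSP::FFTPolar(mag, re, im, 16);
        // Exactly two of the 16 magnitudes (bins +/-2, here in bit-reversed positions) read ~0.5;
        // the rest are ~0. The doubling makes the sinusoid's bin report its amplitude.
        for (int n = 0; n < 16; ++n)
            printf("%f\n", XMVectorGetByIndex(mag[n/4], size_t(n%4)));
    }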
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Deinterleaves audio samples.
+    //
+    // REMARKS:
+    //  For example, audio of the form [LRLRLR] becomes [LLLRRR].
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  uChannelCount - [in] number of channels, must be > 1
+    //  uFrameCount - [in] number of frames of valid data, must be > 0
+    //----------------------------------------------------------------------------------
+    inline void Deinterleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
+                              _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
+                              _In_ const size_t uChannelCount,
+                              _In_ const size_t uFrameCount)
+    {
+        assert(pOutput);
+        assert(pInput);
+        assert(uChannelCount > 1);
+        assert(uFrameCount > 0);
+
+        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
+        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
+            {
+                pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
+            }
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Interleaves audio samples.
+    //
+    // REMARKS:
+    //  For example, audio of the form [LLLRRR] becomes [LRLRLR].
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  uChannelCount - [in] number of channels, must be > 1
+    //  uFrameCount - [in] number of frames of valid data, must be > 0
+    //----------------------------------------------------------------------------------
+    inline void Interleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
+                            _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
+                            _In_ const size_t uChannelCount,
+                            _In_ const size_t uFrameCount)
+    {
+        assert(pOutput);
+        assert(pInput);
+        assert(uChannelCount > 1);
+        assert(uFrameCount > 0);
+
+        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
+        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
+            {
+                pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
+            }
+        }
+    }
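Both routines address their buffers as flat float arrays, so uChannelCount*uFrameCount must be a multiple of 4 to fill whole XMVECTORs. A stereo round trip (illustrative):

    #include "XDSP.h"

    void StereoRoundTrip()
    {
        using namespace DirectX;
        static XMVECTOR interleaved[4], split[4], restored[4]; // 8 frames * 2 channels = 16 floats
        // interleaved[] holds L0 R0 L1 R1 ... L7 R7
        XDSP::Deinterleave(split, interleaved, 2, 8); // -> L0..L7 R0..R7
        XDSP::Interleave(restored, split, 2, 8);      // -> back to L0 R0 ... L7 R7
    }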
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  This function applies a 2^N-sample FFT and unswizzles the result such
+    //  that the samples are in order of increasing frequency.
+    //  Audio is first deinterleaved if multichannel.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pImaginary - [out] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
+    //  uChannelCount - [in] number of channels, must be within [1, 6]
+    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
+    //----------------------------------------------------------------------------------
+    inline void FFTInterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
+                                _Out_writes_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pImaginary,
+                                _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
+                                _In_ const size_t uChannelCount,
+                                _In_ const size_t uLog2Length)
+    {
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert((uintptr_t)pUnityTable % 16 == 0);
+        assert(uChannelCount > 0 && uChannelCount <= 6);
+        assert(uLog2Length >= 2 && uLog2Length <= 9);
+
+        XMVECTOR vRealTemp[768];
+        XMVECTOR vImaginaryTemp[768];
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        if (uChannelCount > 1)
+        {
+            Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
+        }
+        else
+        {
+            memcpy_s(vRealTemp, sizeof(vRealTemp), pReal, (uLength>>2)*sizeof(XMVECTOR));
+        }
+
+        memset( vImaginaryTemp, 0, (uChannelCount*(uLength>>2)) * sizeof(XMVECTOR) );
+
+        if (uLength > 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
+            }
+        }
+        else if (uLength == 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 8)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 4)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
+            FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
+        }
+    }
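A typical forward analysis of interleaved stereo, with a unity table sized to the FFT length (names illustrative, sample loading elided):

    #include "XDSP.h"

    void AnalyzeStereo()
    {
        using namespace DirectX;
        const size_t log2Len = 8, len = size_t(1) << log2Len; // 256 frames
        static XMVECTOR unityTable[256];
        static XMVECTOR real[(256 * 2) / 4]; // (1<<uLog2Length)*uChannelCount/4 elements
        static XMVECTOR imag[(256 * 2) / 4];
        // real[] holds interleaved stereo samples on input
        XDSP::FFTInitializeUnityTable(unityTable, len);
        XDSP::FFTInterleaved(real, imag, unityTable, 2, log2Len);
        // real/imag now hold per-channel spectra in increasing-frequency order
    }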
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  This function applies a 2^N-sample inverse FFT.
+    //  Audio is interleaved if multichannel.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pImaginary - [in] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
+    //  uChannelCount - [in] number of channels, must be > 0
+    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
+    //----------------------------------------------------------------------------------
+    inline void IFFTDeinterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
+                                   _In_reads_(((1<<uLog2Length)*uChannelCount)/4) const XMVECTOR* __restrict pImaginary,
+                                   _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
+                                   _In_ const size_t uChannelCount,
+                                   _In_ const size_t uLog2Length)
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert((uintptr_t)pUnityTable % 16 == 0);
+        assert(uChannelCount > 0 && uChannelCount <= 6);
+        _Analysis_assume_(uChannelCount > 0 && uChannelCount <= 6);
+        assert(uLog2Length >= 2 && uLog2Length <= 9);
+        _Analysis_assume_(uLog2Length >= 2 && uLog2Length <= 9);
+
+        XMVECTOR vRealTemp[768]      = { 0 };
+        XMVECTOR vImaginaryTemp[768] = { 0 };
+
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        const XMVECTOR vRnp = XMVectorReplicate(1.0f/uLength);
+        const XMVECTOR vRnm = XMVectorReplicate(-1.0f/uLength);
+        for (size_t u=0; u < uChannelCount*(uLength>>2); u++)
+        {
+            vRealTemp[u]      = XMVectorMultiply(pReal[u], vRnp);
+            vImaginaryTemp[u] = XMVectorMultiply(pImaginary[u], vRnm);
+        }
+
+        if (uLength > 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
+            }
+        }
+        else if (uLength == 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 8)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 4)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
+        }
+
+        if (uChannelCount > 1)
+        {
+            Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
+        }
+        else
+        {
+            memcpy_s(pReal, uLength*uChannelCount*sizeof(float), vImaginaryTemp, (uLength>>2)*sizeof(XMVECTOR));
+        }
+    }
+
+}; // namespace XDSP
+
+#pragma warning(pop)
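The scaling loop at the top of IFFTDeinterleaved is the conjugation identity IFFT(X) = conj(FFT(conj(X)))/N in disguise: multiplying the imaginary parts by -1/N conjugates and pre-scales the spectrum, the forward FFT machinery does the rest, and only the (real) time-domain result is kept. Consequently, for a real input signal, FFTInterleaved followed by IFFTDeinterleaved should reproduce the input to within float rounding; a sketch (illustrative):

    #include "XDSP.h"

    void RoundTrip()
    {
        using namespace DirectX;
        const size_t log2Len = 8, len = size_t(1) << log2Len;
        static XMVECTOR unityTable[256];
        static XMVECTOR real[256 / 4], imag[256 / 4]; // mono
        // real[] holds 256 time-domain samples
        XDSP::FFTInitializeUnityTable(unityTable, len);
        XDSP::FFTInterleaved(real, imag, unityTable, 1, log2Len);   // forward
        XDSP::IFFTDeinterleaved(real, imag, unityTable, 1, log2Len); // inverse
        // real[] now holds the original samples again (within float rounding)
    }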