From 9a1e0b53187e99096c473b9b8b11e9f07313e357 Mon Sep 17 00:00:00 2001
From: Chuck Walbourn <chuckw@windows.microsoft.com>
Date: Mon, 23 May 2016 12:45:32 -0700
Subject: [PATCH] XNAMath 2.02

---
 Inc/xnamath.h          |  49 ++++++-----
 Inc/xnamathconvert.inl | 185 ++++++++++++++++++++++++-----------------
 Inc/xnamathmatrix.inl  |  24 +++---
 Inc/xnamathmisc.inl    | 143 +++++--------------------------
 Inc/xnamathvector.inl  |  83 +++++++++---------
 5 files changed, 208 insertions(+), 276 deletions(-)

diff --git a/Inc/xnamath.h b/Inc/xnamath.h
index eabe19f..df0a571 100644
--- a/Inc/xnamath.h
+++ b/Inc/xnamath.h
@@ -22,7 +22,7 @@ Abstract:
 #error XNAMATH and XBOXMATH are incompatible in the same compilation module. Use one or the other.
 #endif
 
-#define XNAMATH_VERSION 201
+#define XNAMATH_VERSION 202
 
 #if !defined(_XM_X64_) && !defined(_XM_X86_)
 #if defined(_M_AMD64) || defined(_AMD64_)
@@ -32,6 +32,16 @@ Abstract:
 #endif
 #endif
 
+#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_)
+#if defined(_XM_X64_) || defined(_XM_X86_)
+#define _XM_LITTLEENDIAN_
+#elif defined(_XBOX_VER)
+#define _XM_BIGENDIAN_
+#else
+#error xnamath.h only supports x86, x64, or XBox 360 targets
+#endif
+#endif
+
 #if defined(_XM_X86_) || defined(_XM_X64_)
 #define _XM_SSE_INTRINSICS_
 #if !defined(__cplusplus) && !defined(_XM_NO_INTRINSICS_)
@@ -174,7 +184,7 @@ XMFINLINE FLOAT XMConvertToDegrees(FLOAT fRadians) { return fRadians * (180.0f /
  ****************************************************************************/
 
 #pragma warning(push)
-#pragma warning(disable:4201)
+#pragma warning(disable:4201 4365)
 
 #if !defined (_XM_X86_) && !defined(_XM_X64_)
 #pragma bitfield_order(push)
@@ -278,9 +288,9 @@ typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU32 {
 } XMVECTORU32;
 
 // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86 and Xbox 360, but not for other targets
-#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINISCS_)
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR FXMVECTOR;
-#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINISCS_)
+#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR FXMVECTOR;
 #elif defined(__cplusplus)
 typedef const XMVECTOR& FXMVECTOR;
@@ -289,7 +299,7 @@ typedef const XMVECTOR FXMVECTOR;
 #endif
 
 // Fix-up for (4th+) XMVECTOR parameters to pass in-register for Xbox 360 and by reference otherwise
-#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINISCS_)
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR CXMVECTOR;
 #elif defined(__cplusplus)
 typedef const XMVECTOR& CXMVECTOR;
@@ -354,11 +364,11 @@ typedef _DECLSPEC_ALIGN_16_ struct _XMMATRIX
     FLOAT       operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
     FLOAT&      operator() (UINT Row, UINT Column) { return m[Row][Column]; }
 
-    _XMMATRIX&  operator= (CONST _XMMATRIX&);
+    _XMMATRIX&  operator= (CONST _XMMATRIX& M);
 
 #ifndef XM_NO_OPERATOR_OVERLOADS
-    _XMMATRIX&  operator*= (CONST _XMMATRIX&);
-    _XMMATRIX   operator* (CONST _XMMATRIX&) CONST;
+    _XMMATRIX&  operator*= (CONST _XMMATRIX& M);
+    _XMMATRIX   operator* (CONST _XMMATRIX& M) CONST;
 #endif // !XM_NO_OPERATOR_OVERLOADS
 
 #endif // __cplusplus
@@ -1500,7 +1510,7 @@ typedef struct _XMCOLOR
 
     _XMCOLOR() {};
     _XMCOLOR(UINT Color) : c(Color) {};
-    _XMCOLOR(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+    _XMCOLOR(FLOAT _r, FLOAT _g, FLOAT _b, FLOAT _a);
     _XMCOLOR(CONST FLOAT *pArray);
 
     operator UINT () { return c; }
@@ -2582,9 +2592,6 @@ XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0       = {-1.0f,0.0f, 0.0f, 0.0f};
 XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1       = {0.0f,-1.0f, 0.0f, 0.0f};
 XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2       = {0.0f, 0.0f,-1.0f, 0.0f};
 XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3       = {0.0f, 0.0f, 0.0f,-1.0f};
-
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
-
 XMGLOBALCONST XMVECTORI32 g_XMNegativeZero      = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
 XMGLOBALCONST XMVECTORI32 g_XMNegate3           = {0x80000000, 0x80000000, 0x80000000, 0x00000000};
 XMGLOBALCONST XMVECTORI32 g_XMMask3             = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
@@ -2638,8 +2645,6 @@ XMGLOBALCONST XMVECTORF32 g_XMNegateW           = { 1.0f, 1.0f, 1.0f,-1.0f};
 XMGLOBALCONST XMVECTORI32 g_XMSelect0101        = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1};
 XMGLOBALCONST XMVECTORI32 g_XMSelect1010        = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0};
 XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD};
-
-#ifdef _XM_NO_INTRINSICS_
 XMGLOBALCONST XMVECTORI32 g_XMSelect1000        = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0};
 XMGLOBALCONST XMVECTORI32 g_XMSelect1100        = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0};
 XMGLOBALCONST XMVECTORI32 g_XMSelect1110        = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0};
@@ -2650,9 +2655,6 @@ XMGLOBALCONST XMVECTORI32 g_XMSwizzleYZXW       = {XM_PERMUTE_0Y, XM_PERMUTE_0Z,
 XMGLOBALCONST XMVECTORI32 g_XMSwizzleZXYW       = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
 XMGLOBALCONST XMVECTORI32 g_XMPermute0X0Y1X1Y   = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y};
 XMGLOBALCONST XMVECTORI32 g_XMPermute0Z0W1Z1W   = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_1W};
-#endif // !_XM_NO_INTRINSICS_
-
-#ifdef _XM_SSE_INTRINSICS_
 XMGLOBALCONST XMVECTORF32 g_XMFixupY16          = {1.0f,1.0f/65536.0f,0.0f,0.0f};
 XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16       = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f};
 XMGLOBALCONST XMVECTORI32 g_XMFlipY             = {0,0x80000000,0,0};
@@ -2685,9 +2687,6 @@ XMGLOBALCONST XMVECTORF32 g_XMMulDec4           = {1.0f,1.0f/1024.0f,1.0f/(1024.
 XMGLOBALCONST XMVECTORI32 g_XMMaskByte4         = {0xFF,0xFF00,0xFF0000,0xFF000000};
 XMGLOBALCONST XMVECTORI32 g_XMXorByte4          = {0x80,0x8000,0x800000,0x00000000};
 XMGLOBALCONST XMVECTORF32 g_XMAddByte4          = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0};
-#endif
-
-#endif // _XM_NO_INTRINSICS_
 
 /****************************************************************************
  *
@@ -2696,7 +2695,7 @@ XMGLOBALCONST XMVECTORF32 g_XMAddByte4          = {-128.0f,-128.0f*256.0f,-128.0
  ****************************************************************************/
 
 #pragma warning(push)
-#pragma warning(disable:4214 4204 4616 6001)
+#pragma warning(disable:4214 4204 4365 4616 6001)
 
 #if !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_)
 
@@ -2861,10 +2860,10 @@ XMFINLINE XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateE
 
 //------------------------------------------------------------------------------
 
-#include <xnamathconvert.inl>
-#include <xnamathvector.inl>
-#include <xnamathmatrix.inl>
-#include <xnamathmisc.inl>
+#include "xnamathconvert.inl"
+#include "xnamathvector.inl"
+#include "xnamathmatrix.inl"
+#include "xnamathmisc.inl"
 
 #pragma warning(pop)
 
diff --git a/Inc/xnamathconvert.inl b/Inc/xnamathconvert.inl
index 7180e44..d76d78d 100644
--- a/Inc/xnamathconvert.inl
+++ b/Inc/xnamathconvert.inl
@@ -431,8 +431,8 @@ XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
     XMASSERT(((UINT_PTR)pSource & 3) == 0);
-    __m128i V = _mm_set_epi32( 0, 0, 0, *pSource );
-    return reinterpret_cast<__m128 *>(&V)[0];
+
+    return _mm_load_ss( (const float*)pSource );
 #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -480,9 +480,10 @@ XMFINLINE XMVECTOR XMLoadInt2
 #elif defined(_XM_SSE_INTRINSICS_)
 
     XMASSERT(pSource);
-    __m128i V = _mm_set_epi32( 0, 0, *(pSource+1), *pSource );
-    return reinterpret_cast<__m128 *>(&V)[0];
 
+    __m128 x = _mm_load_ss( (const float*)pSource );
+    __m128 y = _mm_load_ss( (const float*)(pSource+1) );
+    return _mm_unpacklo_ps( x, y );
 #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -509,6 +510,8 @@ XMFINLINE XMVECTOR XMLoadInt2A
 #elif defined(_XM_SSE_INTRINSICS_)
 
     XMASSERT(pSource);
+    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
     __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
     return reinterpret_cast<__m128 *>(&V)[0];
 
@@ -526,20 +529,16 @@ XMFINLINE XMVECTOR XMLoadFloat2
 #if defined(_XM_NO_INTRINSICS_)
     XMVECTOR V;
     XMASSERT(pSource);
+
     ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
     ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
-    V.vector4_f32[2] = V.vector4_f32[3] = 0.0f;
     return V;
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
-#ifdef _XM_X86_
+
     __m128 x = _mm_load_ss( &pSource->x );
     __m128 y = _mm_load_ss( &pSource->y );
     return _mm_unpacklo_ps( x, y );
-#else // _XM_X64_
-    // This reads 2 floats past the memory that should be ignored.
-    return _mm_loadu_ps( &pSource->x );
-#endif
 #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -565,14 +564,10 @@ XMFINLINE XMVECTOR XMLoadFloat2A
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
-#ifdef _XM_X86_
-    __m128 x = _mm_load_ss( &pSource->x );
-    __m128 y = _mm_load_ss( &pSource->y );
-    return _mm_unpacklo_ps( x, y );
-#else // _XM_X64_
-    // This reads 2 floats past the memory that should be ignored.
-    return _mm_load_ps( &pSource->x );
-#endif
+    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
+    return reinterpret_cast<__m128 *>(&V)[0];
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -792,8 +787,17 @@ XMFINLINE XMVECTOR XMLoadInt3
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
     __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
     return reinterpret_cast<__m128 *>(&V)[0];
+#else
+    __m128 x = _mm_load_ss( (const float*)pSource );
+    __m128 y = _mm_load_ss( (const float*)(pSource+1) );
+    __m128 z = _mm_load_ss( (const float*)(pSource+2) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif // !_XM_ISVS2005_
 #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -839,15 +843,26 @@ XMFINLINE XMVECTOR XMLoadFloat3
 #if defined(_XM_NO_INTRINSICS_)
     XMVECTOR V;
     XMASSERT(pSource);
+
     ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
     ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
     ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
-    V.vector4_f32[3] = 0.0f;
     return V;
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
     // This reads 1 floats past the memory that should be ignored.
+    // Need to continue to do this for VS 2005 due to compiler issue but prefer new method
+    // to avoid triggering issues with memory debug tools (like AV)
     return _mm_loadu_ps( &pSource->x );
+#else
+    __m128 x = _mm_load_ss( &pSource->x );
+    __m128 y = _mm_load_ss( &pSource->y );
+    __m128 z = _mm_load_ss( &pSource->z );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif // !_XM_ISVS2005_
 #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -874,10 +889,10 @@ XMFINLINE XMVECTOR XMLoadFloat3A
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
+    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
 
-	// This reads 1 floats past the memory that should be ignored.
-
-	return _mm_load_ps( &pSource->x );
+    // This reads 1 float past the 3 source components; that extra value should be ignored.
+    return _mm_load_ps( &pSource->x );
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -1298,9 +1313,9 @@ XMFINLINE XMVECTOR XMLoadFloat3PK
     CONST XMFLOAT3PK* pSource
 )
 {
+    _DECLSPEC_ALIGN_16_ UINT Result[4];
     UINT Mantissa;
     UINT Exponent;
-    UINT Result[3];
 
     XMASSERT(pSource);
 
@@ -1406,7 +1421,7 @@ XMFINLINE XMVECTOR XMLoadFloat3PK
         Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
     }
 
-    return XMLoadFloat3( (XMFLOAT3*)&Result );
+    return XMLoadFloat3A( (XMFLOAT3A*)&Result );
 }
 
 //------------------------------------------------------------------------------
@@ -1416,9 +1431,9 @@ XMFINLINE XMVECTOR XMLoadFloat3SE
     CONST XMFLOAT3SE* pSource
 )
 {
+    _DECLSPEC_ALIGN_16_ UINT Result[4];
     UINT Mantissa;
     UINT Exponent, ExpBits;
-    UINT Result[3];
 
     XMASSERT(pSource);
 
@@ -1515,7 +1530,7 @@ XMFINLINE XMVECTOR XMLoadFloat3SE
         Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
     }
 
-    return XMLoadFloat3( (XMFLOAT3*)&Result );
+    return XMLoadFloat3A( (XMFLOAT3A*)&Result );
 }
 
 //------------------------------------------------------------------------------
@@ -1541,6 +1556,7 @@ XMFINLINE XMVECTOR XMLoadInt4
 #elif defined(_XM_SSE_INTRINSICS_)
 
     XMASSERT(pSource);
+
     __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
     return reinterpret_cast<__m128 *>(&V)[0];
 
@@ -1577,7 +1593,6 @@ XMFINLINE XMVECTOR XMLoadInt4A
     __m128i V = _mm_load_si128( (const __m128i*)pSource );
     return reinterpret_cast<__m128 *>(&V)[0];
 
-
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -1592,6 +1607,7 @@ XMFINLINE XMVECTOR XMLoadFloat4
 #if defined(_XM_NO_INTRINSICS_)
     XMVECTOR V;
     XMASSERT(pSource);
+
     ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
     ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
     ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
@@ -1599,6 +1615,7 @@ XMFINLINE XMVECTOR XMLoadFloat4
     return V;
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pSource);
+
     return _mm_loadu_ps( &pSource->x );
 #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
 #endif // _XM_VMX128_INTRINSICS_
@@ -3055,13 +3072,18 @@ XMFINLINE VOID XMStoreInt
     FXMVECTOR V
 )
 {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_NO_INTRINSICS_)
 
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 3) == 0);
 
     *pDestination = XMVectorGetIntX( V );
 
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pDestination);
+    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+    _mm_store_ss( (float*)pDestination, V );
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -3074,13 +3096,18 @@ XMFINLINE VOID XMStoreFloat
     FXMVECTOR V
 )
 {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_NO_INTRINSICS_)
 
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 3) == 0);
 
     *pDestination = XMVectorGetX( V );
 
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pDestination);
+    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+    _mm_store_ss( pDestination, V );
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -3102,12 +3129,12 @@ XMFINLINE VOID XMStoreInt2
     pDestination[1] = V.vector4_u32[1];
 
 #elif defined(_XM_SSE_INTRINSICS_)
-
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 3) == 0);
-    pDestination[0] = XMVectorGetIntX( V );
-    pDestination[1] = XMVectorGetIntY( V );
 
+    XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+    _mm_store_ss( (float*)&pDestination[0], V );
+    _mm_store_ss( (float*)&pDestination[1], T );
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -3131,7 +3158,7 @@ XMFINLINE VOID XMStoreInt2A
 #elif defined(_XM_SSE_INTRINSICS_)
 
     XMASSERT(pDestination);
-    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
 
     _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
 
@@ -3156,14 +3183,12 @@ XMFINLINE VOID XMStoreFloat2
     pDestination->y = V.vector4_f32[1];
 
 #elif defined(_XM_SSE_INTRINSICS_)
-
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 3) == 0);
 
-	XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-	_mm_store_ss( &pDestination->x, V );
-	_mm_store_ss( &pDestination->y, T );
-
+    XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+    _mm_store_ss( &pDestination->x, V );
+    _mm_store_ss( &pDestination->y, T );
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
@@ -3187,11 +3212,9 @@ XMFINLINE VOID XMStoreFloat2A
 #elif defined(_XM_SSE_INTRINSICS_)
 
     XMASSERT(pDestination);
-    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
 
-	XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-	_mm_store_ss( &pDestination->x, V );
-	_mm_store_ss( &pDestination->y, T );
+    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
 
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
@@ -3391,9 +3414,12 @@ XMFINLINE VOID XMStoreInt3
 
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 3) == 0);
-    pDestination[0] = XMVectorGetIntX( V );
-    pDestination[1] = XMVectorGetIntY( V );
-    pDestination[2] = XMVectorGetIntZ( V );
+
+    XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+    XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( (float*)pDestination, V );
+    _mm_store_ss( (float*)&pDestination[1], T1 );
+    _mm_store_ss( (float*)&pDestination[2], T2 );
 
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
@@ -3419,10 +3445,11 @@ XMFINLINE VOID XMStoreInt3A
 #elif defined(_XM_SSE_INTRINSICS_)
 
     XMASSERT(pDestination);
-    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
-    pDestination[0] = XMVectorGetIntX( V );
-    pDestination[1] = XMVectorGetIntY( V );
-    pDestination[2] = XMVectorGetIntZ( V );
+    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+    XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+    _mm_store_ss( (float*)&pDestination[2], T );
 
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
@@ -3450,11 +3477,11 @@ XMFINLINE VOID XMStoreFloat3
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 3) == 0);
 
-	XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
-	XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
-	_mm_store_ss( &pDestination->x, V );
-	_mm_store_ss( &pDestination->y, T1 );
-	_mm_store_ss( &pDestination->z, T2 );
+    XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+    XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( &pDestination->x, V );
+    _mm_store_ss( &pDestination->y, T1 );
+    _mm_store_ss( &pDestination->z, T2 );
 
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
@@ -3482,11 +3509,9 @@ XMFINLINE VOID XMStoreFloat3A
     XMASSERT(pDestination);
     XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
 
-	XMVECTOR T1 = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-	XMVECTOR T2 = _mm_unpackhi_ps( V, V );
-	_mm_store_ss( &pDestination->x, V );
-	_mm_store_ss( &pDestination->y, T1 );
-	_mm_store_ss( &pDestination->z, T2 );
+    XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+    _mm_store_ss( &pDestination->z, T );
 
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
@@ -3918,13 +3943,13 @@ XMFINLINE VOID XMStoreFloat3PK
     FXMVECTOR V
 )
 {
+    _DECLSPEC_ALIGN_16_ UINT IValue[4];
     UINT I, Sign, j;
-    UINT IValue[3];
     UINT Result[3];
 
     XMASSERT(pDestination);
 
-    XMStoreFloat3( (XMFLOAT3*)&IValue, V );
+    XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
 
     // X & Y Channels (5-bit exponent, 6-bit mantissa)
     for(j=0; j < 2; ++j)
@@ -4036,14 +4061,15 @@ XMFINLINE VOID XMStoreFloat3SE
     FXMVECTOR V
 )
 {
+    _DECLSPEC_ALIGN_16_ UINT IValue[4];
     UINT I, Sign, j, T;
-    UINT IValue[3];
     UINT Frac[3];
     UINT Exp[3];
+    
 
     XMASSERT(pDestination);
 
-    XMStoreFloat3( (XMFLOAT3*)&IValue, V );
+    XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
 
     // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
     for(j=0; j < 3; ++j)
@@ -4131,7 +4157,7 @@ XMFINLINE VOID XMStoreInt4
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pDestination);
-
+    
     _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
 
 #else // _XM_VMX128_INTRINSICS_
@@ -4158,6 +4184,7 @@ XMFINLINE VOID XMStoreInt4A
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pDestination);
+    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
 
     _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
 
@@ -4176,7 +4203,8 @@ XMFINLINE VOID XMStoreInt4NC
 #if defined(_XM_NO_INTRINSICS_)
 
     XMASSERT(pDestination);
-
+    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+    
     pDestination[0] = V.vector4_u32[0];
     pDestination[1] = V.vector4_u32[1];
     pDestination[2] = V.vector4_u32[2];
@@ -4184,7 +4212,8 @@ XMFINLINE VOID XMStoreInt4NC
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pDestination);
-
+    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+    
     _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
 
 #else // _XM_VMX128_INTRINSICS_
@@ -4202,7 +4231,7 @@ XMFINLINE VOID XMStoreFloat4
 #if defined(_XM_NO_INTRINSICS_)
 
     XMASSERT(pDestination);
-
+    
     pDestination->x = V.vector4_f32[0];
     pDestination->y = V.vector4_f32[1];
     pDestination->z = V.vector4_f32[2];
@@ -4210,7 +4239,7 @@ XMFINLINE VOID XMStoreFloat4
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pDestination);
-
+    
     _mm_storeu_ps( &pDestination->x, V );
 
 #else // _XM_VMX128_INTRINSICS_
@@ -4255,7 +4284,8 @@ XMFINLINE VOID XMStoreFloat4NC
 #if defined(_XM_NO_INTRINSICS_)
 
     XMASSERT(pDestination);
-
+    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+    
     pDestination->x = V.vector4_f32[0];
     pDestination->y = V.vector4_f32[1];
     pDestination->z = V.vector4_f32[2];
@@ -4263,7 +4293,8 @@ XMFINLINE VOID XMStoreFloat4NC
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(pDestination);
-
+    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+    
     _mm_storeu_ps( &pDestination->x, V );
 
 #else // _XM_VMX128_INTRINSICS_
@@ -5452,13 +5483,13 @@ XMFINLINE VOID XMStoreColor
     // Convert to 0-255
     vResult = _mm_mul_ps(vResult,Scale);
     // Shuffle RGBA to ARGB
-    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,1,0,3));
+    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
     // Convert to int 
     __m128i vInt = _mm_cvtps_epi32(vResult);
     // Mash to shorts
     vInt = _mm_packs_epi32(vInt,vInt);
     // Mash to bytes
-    vInt = _mm_packs_epi16(vInt,vInt);
+    vInt = _mm_packus_epi16(vInt,vInt);
     // Store the color
     _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
 #else // _XM_VMX128_INTRINSICS_
@@ -5698,12 +5729,12 @@ XMFINLINE VOID XMStoreFloat4x4A
     pDestination->m[3][3] = M.r[3].vector4_f32[3];
 
 #elif defined(_XM_SSE_INTRINSICS_)
-	XMASSERT(pDestination);
+    XMASSERT(pDestination);
 
-	_mm_store_ps( &pDestination->_11, M.r[0] );
-	_mm_store_ps( &pDestination->_21, M.r[1] );
-	_mm_store_ps( &pDestination->_31, M.r[2] );
-	_mm_store_ps( &pDestination->_41, M.r[3] );
+    _mm_store_ps( &pDestination->_11, M.r[0] );
+    _mm_store_ps( &pDestination->_21, M.r[1] );
+    _mm_store_ps( &pDestination->_31, M.r[2] );
+    _mm_store_ps( &pDestination->_41, M.r[3] );
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
diff --git a/Inc/xnamathmatrix.inl b/Inc/xnamathmatrix.inl
index f35a09f..293501a 100644
--- a/Inc/xnamathmatrix.inl
+++ b/Inc/xnamathmatrix.inl
@@ -2310,7 +2310,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveRH
 XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
 (
     FLOAT FovAngleY, 
-    FLOAT AspectHByW, 
+    FLOAT AspectRatio, 
     FLOAT NearZ, 
     FLOAT FarZ
 )
@@ -2324,13 +2324,13 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
     XMMATRIX M;
 
     XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
-    XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+    XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
     XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
 
     XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
 
     Height = CosFov / SinFov;
-    Width = Height / AspectHByW;
+    Width = Height / AspectRatio;
 
     M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f);
     M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f);
@@ -2341,7 +2341,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
-    XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+    XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
     XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
 	XMMATRIX M;
     FLOAT    SinFov;
@@ -2351,7 +2351,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
     // Note: This is recorded on the stack
     FLOAT Height = CosFov / SinFov;
     XMVECTOR rMem = {
-        Height / AspectHByW,
+        Height / AspectRatio,
         Height,
         fRange,
         -fRange * NearZ
@@ -2363,7 +2363,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
     vTemp = _mm_move_ss(vTemp,vValues);
     // CosFov / SinFov,0,0,0
     M.r[0] = vTemp;
-    // 0,Height / AspectHByW,0,0
+    // 0,Height,0,0
     vTemp = vValues;
     vTemp = _mm_and_ps(vTemp,g_XMMaskY);
     M.r[1] = vTemp;
@@ -2386,7 +2386,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
 XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
 (
     FLOAT FovAngleY, 
-    FLOAT AspectHByW, 
+    FLOAT AspectRatio, 
     FLOAT NearZ, 
     FLOAT FarZ
 )
@@ -2400,13 +2400,13 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
     XMMATRIX M;
 
     XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
-    XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+    XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
     XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
 
     XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
 
     Height = CosFov / SinFov;
-    Width = Height / AspectHByW;
+    Width = Height / AspectRatio;
 
     M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f);
     M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f);
@@ -2417,7 +2417,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
 
 #elif defined(_XM_SSE_INTRINSICS_)
     XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
-    XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+    XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
     XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
 	XMMATRIX M;
     FLOAT    SinFov;
@@ -2427,7 +2427,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
     // Note: This is recorded on the stack
     FLOAT Height = CosFov / SinFov;
     XMVECTOR rMem = {
-        Height / AspectHByW,
+        Height / AspectRatio,
         Height,
         fRange,
         fRange * NearZ
@@ -2439,7 +2439,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
     vTemp = _mm_move_ss(vTemp,vValues);
     // CosFov / SinFov,0,0,0
     M.r[0] = vTemp;
-    // 0,Height / AspectHByW,0,0
+    // 0,Height,0,0
     vTemp = vValues;
     vTemp = _mm_and_ps(vTemp,g_XMMaskY);
     M.r[1] = vTemp;
diff --git a/Inc/xnamathmisc.inl b/Inc/xnamathmisc.inl
index 593c45b..c606d1d 100644
--- a/Inc/xnamathmisc.inl
+++ b/Inc/xnamathmisc.inl
@@ -693,7 +693,7 @@ XMFINLINE XMVECTOR XMQuaternionBaryCentric
 
     s = f + g;
 
-    if (s < 0.00001f && s > -0.00001f)
+    if ((s < 0.00001f) && (s > -0.00001f))
     {
         Result = Q0;
     }
@@ -932,28 +932,26 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
     CXMMATRIX M
 )
 {
-#if defined(_XM_NO_INTRINSICS_)
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
 
     XMVECTOR Q0, Q1, Q2;
     XMVECTOR M00, M11, M22;
     XMVECTOR CQ0, CQ1, C;
     XMVECTOR CX, CY, CZ, CW;
     XMVECTOR SQ1, Scale;
-    XMVECTOR Rsq, Sqrt, VEqualsInfinity, VEqualsZero, Select;
+    XMVECTOR Rsq, Sqrt, VEqualsNaN;
     XMVECTOR A, B, P;
     XMVECTOR PermuteSplat, PermuteSplatT;
     XMVECTOR SignB, SignBT;
     XMVECTOR PermuteControl, PermuteControlT;
-    XMVECTOR Zero;
     XMVECTOR Result;
-    static CONST XMVECTOR  OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
-    static CONST XMVECTOR  SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
-    static CONST XMVECTOR  SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
-    static CONST XMVECTOR  SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
-    static CONST XMVECTOR  SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
-    static CONST XMVECTOR  SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
-    static CONST XMVECTOR  SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
-    static CONST XMVECTOR  SignNNNX = {-1.0f, -1.0f, -1.0f, 2.0e-126f};
+    static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
+    static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
+    static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
+    static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
+    static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
+    static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
+    static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
     static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
     static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
     static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
@@ -972,26 +970,23 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
     M11 = XMVectorSplatY(M.r[1]);
     M22 = XMVectorSplatZ(M.r[2]);
 
-    Q0 = XMVectorMultiply(SignPNNP, M00);
-    Q0 = XMVectorMultiplyAdd(SignNPNP, M11, Q0);
-    Q0 = XMVectorMultiplyAdd(SignNNPP, M22, Q0);
+    Q0 = XMVectorMultiply(SignPNNP.v, M00);
+    Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0);
+    Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0);
 
     Q1 = XMVectorAdd(Q0, g_XMOne.v);
 
     Rsq = XMVectorReciprocalSqrt(Q1);
-    Zero = XMVectorZero();
-    VEqualsInfinity = XMVectorEqualInt(Q1, g_XMInfinity.v);
-    VEqualsZero = XMVectorEqual(Q1, Zero);
+    VEqualsNaN = XMVectorIsNaN(Rsq);
     Sqrt = XMVectorMultiply(Q1, Rsq);
-    Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
-    Q1 = XMVectorSelect(Q1, Sqrt, Select);
+    Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN);
 
     Q1 = XMVectorMultiply(Q1, g_XMOneHalf.v);
 
     SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v);
 
     CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v);
-    CQ1 = XMVectorPermute(Q0, SignNNNX, Permute0Y0Z0Z1W.v);
+    CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v);
     C = XMVectorGreaterOrEqual(CQ0, CQ1);
 
     CX = XMVectorSplatX(C);
@@ -1000,15 +995,15 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
     CW = XMVectorSplatW(C);
 
     PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ);
-    SignB = XMVectorSelect(SignNPPP, SignPPNP, CZ);
+    SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ);
     PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ);
 
     PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX);
-    SignB = XMVectorSelect(SignB, SignNPPP, CX);
+    SignB = XMVectorSelect(SignB, SignNPPP.v, CX);
     PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX);
 
     PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY);
-    SignBT = XMVectorSelect(SignB, SignPNPP, CY);
+    SignBT = XMVectorSelect(SignB, SignPNPP.v, CY);
     PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY);
 
     PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
@@ -1016,7 +1011,7 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
     PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);
 
     PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW);
-    SignB = XMVectorSelect(SignB, SignNNNX, CW);
+    SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW);
     PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW);
 
     Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);
@@ -1032,104 +1027,6 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
 
     return Result;
 
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR Q0, Q1, Q2;
-    XMVECTOR M00, M11, M22;
-    XMVECTOR CQ0, CQ1, C;
-    XMVECTOR CX, CY, CZ, CW;
-    XMVECTOR SQ1, Scale;
-    XMVECTOR Rsq, Sqrt, VEqualsInfinity, VEqualsZero, Select;
-    XMVECTOR A, B, P;
-    XMVECTOR PermuteSplat, PermuteSplatT;
-    XMVECTOR SignB, SignBT;
-    XMVECTOR PermuteControl, PermuteControlT;
-    XMVECTOR Zero;
-    XMVECTOR Result;
-    static CONST XMVECTORF32  OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
-    static CONST XMVECTORF32  SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
-    static CONST XMVECTORF32  SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
-    static CONST XMVECTORF32  SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
-    static CONST XMVECTORF32  SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
-    static CONST XMVECTORF32  SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
-    static CONST XMVECTORF32  SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
-    static CONST XMVECTORF32  SignNNNX = {-1.0f, -1.0f, -1.0f, 2.0e-126f};
-    static CONST XMVECTORI32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
-    static CONST XMVECTORI32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
-    static CONST XMVECTORI32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
-    static CONST XMVECTORI32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
-    static CONST XMVECTORI32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z};
-    static CONST XMVECTORI32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W};
-    static CONST XMVECTORI32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y};
-    static CONST XMVECTORI32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W};
-    static CONST XMVECTORI32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W};
-    static CONST XMVECTORI32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y};
-    static CONST XMVECTORI32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z};
-    static CONST XMVECTORI32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X};
-    static CONST XMVECTORI32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W};
-
-    M00 = XMVectorSplatX(M.r[0]);
-    M11 = XMVectorSplatY(M.r[1]);
-    M22 = XMVectorSplatZ(M.r[2]);
-
-    Q0 = XMVectorMultiply(SignPNNP, M00);
-    Q0 = XMVectorMultiplyAdd(SignNPNP, M11, Q0);
-    Q0 = XMVectorMultiplyAdd(SignNNPP, M22, Q0);
-
-    Q1 = XMVectorAdd(Q0, g_XMOne);
-
-    Rsq = XMVectorReciprocalSqrt(Q1);
-    Zero = XMVectorZero();
-    VEqualsInfinity = XMVectorEqualInt(Q1, g_XMInfinity);
-    VEqualsZero = XMVectorEqual(Q1, Zero);
-    Sqrt = XMVectorMultiply(Q1, Rsq);
-    Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
-    Q1 = XMVectorSelect(Q1, Sqrt, Select);
-
-    Q1 = XMVectorMultiply(Q1, g_XMOneHalf);
-
-    SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf);
-
-    CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W);
-    CQ1 = XMVectorPermute(Q0, SignNNNX, Permute0Y0Z0Z1W);
-    C = XMVectorGreaterOrEqual(CQ0, CQ1);
-
-    CX = XMVectorSplatX(C);
-    CY = XMVectorSplatY(C);
-    CZ = XMVectorSplatZ(C);
-    CW = XMVectorSplatW(C);
-
-    PermuteSplat = XMVectorSelect(SplatZ, SplatY, CZ);
-    SignB = XMVectorSelect(SignNPPP, SignPPNP, CZ);
-    PermuteControl = XMVectorSelect(Permute2, Permute1, CZ);
-
-    PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ, CX);
-    SignB = XMVectorSelect(SignB, SignNPPP, CX);
-    PermuteControl = XMVectorSelect(PermuteControl, Permute2, CX);
-
-    PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX, CY);
-    SignBT = XMVectorSelect(SignB, SignPNPP, CY);
-    PermuteControlT = XMVectorSelect(PermuteControl,Permute0, CY);
-
-    PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
-    SignB = XMVectorSelect(SignB, SignBT, CX);
-    PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);
-
-    PermuteSplat = XMVectorSelect(PermuteSplat,SplatW, CW);
-    SignB = XMVectorSelect(SignB, SignNNNX, CW);
-    PermuteControl = XMVectorSelect(PermuteControl,Permute3, CW);
-
-    Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);
-
-    P = XMVectorPermute(M.r[1], M.r[2],PermuteC);  // {M10, M12, M20, M21}
-    A = XMVectorPermute(M.r[0], P, PermuteA);       // {M01, M12, M20, M03}
-    B = XMVectorPermute(M.r[0], P, PermuteB);       // {M10, M21, M02, M03}
-
-    Q2 = XMVectorMultiplyAdd(SignB, B, A);
-    Q2 = XMVectorMultiply(Q2, Scale);
-
-    Result = XMVectorPermute(Q1, Q2, PermuteControl);
-
-    return Result;
 #else // _XM_VMX128_INTRINSICS_
 #endif // _XM_VMX128_INTRINSICS_
 }
diff --git a/Inc/xnamathvector.inl b/Inc/xnamathvector.inl
index cb07840..6aa536c 100644
--- a/Inc/xnamathvector.inl
+++ b/Inc/xnamathvector.inl
@@ -1256,7 +1256,7 @@ XMFINLINE XMVECTOR XMVectorPermute
         ++pControl;
         VectorIndex = (uIndex>>4)&1;
         uIndex &= 0x0F;
-#if defined(_XM_X86_) || defined(_XM_X64_)
+#if defined(_XM_LITTLEENDIAN_)
         uIndex ^= 3; // Swap byte ordering on little endian machines
 #endif
         pWork[0] = aByte[VectorIndex][uIndex];
@@ -4415,22 +4415,22 @@ XMINLINE XMVECTOR XMVectorATan2
     // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
 
     //     Y == 0 and X is Negative         -> Pi with the sign of Y
-    //     Y == 0 and X is Positive         -> 0 with the sign of Y
+    //     Y == 0 and X is Positive         -> 0 with the sign of Y
     //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
-    //     X == -Infinity and Finite Y > 0  -> Pi with the sign of Y
-    //     X == +Infinity and Finite Y > 0  -> 0 with the sign of Y
+    //     Y != 0 and X is Negative         -> ATan(Y / X) + (Pi with the sign of Y)
+    //     X == -Infinity and Finite Y      -> Pi with the sign of Y
+    //     X == +Infinity and Finite Y      -> 0 with the sign of Y
     //     Y == Infinity and X is Finite    -> Pi / 2 with the sign of Y
     //     Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
     //     Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
-    //     TODO: Return Y / X if the result underflows
 
     XMVECTOR Reciprocal;
     XMVECTOR V;
     XMVECTOR YSign;
     XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour;
-    XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity, FiniteYGreaterZero;
+    XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity;
     XMVECTOR ATanResultValid;
-    XMVECTOR R0, R1, R2, R3, R4, R5, R6, R7;
+    XMVECTOR R0, R1, R2, R3, R4, R5;
     XMVECTOR Zero;
     XMVECTOR Result;
     static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
@@ -4449,8 +4449,6 @@ XMINLINE XMVECTOR XMVectorATan2
     XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
     YEqualsInfinity = XMVectorIsInfinite(Y);
     XEqualsInfinity = XMVectorIsInfinite(X);
-    FiniteYGreaterZero = XMVectorGreater(Y, Zero);
-    FiniteYGreaterZero = XMVectorSelect(FiniteYGreaterZero, Zero, YEqualsInfinity);
 
     YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
     Pi = XMVectorOrInt(Pi, YSign);
@@ -4463,25 +4461,25 @@ XMINLINE XMVECTOR XMVectorATan2
     R3 = XMVectorSelect(R2, R1, YEqualsZero);
     R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
     R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
-    R6 = XMVectorSelect(R3, R5, YEqualsInfinity);
-    R7 = XMVectorSelect(R6, R1, FiniteYGreaterZero);
-    Result = XMVectorSelect(R6, R7, XEqualsInfinity);
+    Result = XMVectorSelect(R3, R5, YEqualsInfinity);
     ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
 
     Reciprocal = XMVectorReciprocal(X);
     V = XMVectorMultiply(Y, Reciprocal);
     R0 = XMVectorATan(V);
 
-    Result = XMVectorSelect(Result, R0, ATanResultValid);
+    R1 = XMVectorSelect( Pi, Zero, XIsPositive );
+    R2 = XMVectorAdd(R0, R1);
+
+    Result = XMVectorSelect(Result, R2, ATanResultValid);
 
     return Result;
 
 #elif defined(_XM_SSE_INTRINSICS_)
     static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
+
     // Mask if Y>0 && Y!=INF
-    XMVECTOR FiniteYGreaterZero = _mm_cmpgt_ps(Y,g_XMZero);
     XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
-    FiniteYGreaterZero = _mm_andnot_ps(YEqualsInfinity,FiniteYGreaterZero);
     // Get the sign of (Y&0x80000000)
     XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero);
     // Get the sign bits of X
@@ -4489,10 +4487,10 @@ XMINLINE XMVECTOR XMVectorATan2
     // Change them to masks
     XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero);
     // Get Pi
-    XMVECTOR R1 = _mm_load_ps1(&ATan2Constants.f[0]);
+    XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]);
     // Copy the sign of Y
-    R1 = _mm_or_ps(R1,YSign);
-    R1 = XMVectorSelect(R1,YSign,XIsPositive);
+    Pi = _mm_or_ps(Pi,YSign);
+    XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive);
     // Mask for X==0
     XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero);
     // Get Pi/2 with with sign of Y
@@ -4513,7 +4511,7 @@ XMINLINE XMVECTOR XMVectorATan2
     vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity);
 
     XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity);
-    vConstants = XMVectorSelect(vResult,R1,FiniteYGreaterZero);
+    vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity);
     // At this point, any entry that's zero will get the result
     // from XMVectorATan(), otherwise, return the failsafe value
     vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity);
@@ -4523,6 +4521,10 @@ XMINLINE XMVECTOR XMVectorATan2
     vConstants = _mm_div_ps(Y,X);
     vConstants = XMVectorATan(vConstants);
     // Discard entries that have been declared void
+
+    XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive );
+    vConstants = _mm_add_ps( vConstants, R3 );
+
     vResult = XMVectorSelect(vResult,vConstants,ATanResultValid);
     return vResult;
 #else // _XM_VMX128_INTRINSICS_
@@ -5139,9 +5141,9 @@ XMFINLINE XMVECTOR XMVectorATan2Est
     XMVECTOR V;
     XMVECTOR YSign;
     XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour;
-    XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity, FiniteYGreaterZero;
+    XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity;
     XMVECTOR ATanResultValid;
-    XMVECTOR R0, R1, R2, R3, R4, R5, R6, R7;
+    XMVECTOR R0, R1, R2, R3, R4, R5;
     XMVECTOR Zero;
     XMVECTOR Result;
     static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
@@ -5160,8 +5162,6 @@ XMFINLINE XMVECTOR XMVectorATan2Est
     XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
     YEqualsInfinity = XMVectorIsInfinite(Y);
     XEqualsInfinity = XMVectorIsInfinite(X);
-    FiniteYGreaterZero = XMVectorGreater(Y, Zero);
-    FiniteYGreaterZero = XMVectorSelect(FiniteYGreaterZero, Zero, YEqualsInfinity);
 
     YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
     Pi = XMVectorOrInt(Pi, YSign);
@@ -5174,25 +5174,25 @@ XMFINLINE XMVECTOR XMVectorATan2Est
     R3 = XMVectorSelect(R2, R1, YEqualsZero);
     R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
     R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
-    R6 = XMVectorSelect(R3, R5, YEqualsInfinity);
-    R7 = XMVectorSelect(R6, R1, FiniteYGreaterZero);
-    Result = XMVectorSelect(R6, R7, XEqualsInfinity);
+    Result = XMVectorSelect(R3, R5, YEqualsInfinity);
     ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
 
     Reciprocal = XMVectorReciprocalEst(X);
     V = XMVectorMultiply(Y, Reciprocal);
     R0 = XMVectorATanEst(V);
 
-    Result = XMVectorSelect(Result, R0, ATanResultValid);
+    R1 = XMVectorSelect( Pi, Zero, XIsPositive );
+    R2 = XMVectorAdd(R0, R1);
+
+    Result = XMVectorSelect(Result, R2, ATanResultValid);
 
     return Result;
 
 #elif defined(_XM_SSE_INTRINSICS_)
     static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
+
     // Mask if Y>0 && Y!=INF
-    XMVECTOR FiniteYGreaterZero = _mm_cmpgt_ps(Y,g_XMZero);
     XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
-    FiniteYGreaterZero = _mm_andnot_ps(YEqualsInfinity,FiniteYGreaterZero);
     // Get the sign of (Y&0x80000000)
     XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero);
     // Get the sign bits of X
@@ -5200,10 +5200,10 @@ XMFINLINE XMVECTOR XMVectorATan2Est
     // Change them to masks
     XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero);
     // Get Pi
-    XMVECTOR R1 = _mm_load_ps1(&ATan2Constants.f[0]);
+    XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]);
     // Copy the sign of Y
-    R1 = _mm_or_ps(R1,YSign);
-    R1 = XMVectorSelect(R1,YSign,XIsPositive);
+    Pi = _mm_or_ps(Pi,YSign);
+    XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive);
     // Mask for X==0
     XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero);
     // Get Pi/2 with with sign of Y
@@ -5224,16 +5224,21 @@ XMFINLINE XMVECTOR XMVectorATan2Est
     vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity);
 
     XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity);
-    vConstants = XMVectorSelect(vResult,R1,FiniteYGreaterZero);
+    vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity);
     // At this point, any entry that's zero will get the result
     // from XMVectorATan(), otherwise, return the failsafe value
     vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity);
     // Any entries not 0xFFFFFFFF, are considered precalculated
     XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask);
     // Let's do the ATan2 function
-    vConstants = _mm_div_ps(Y,X);
+    XMVECTOR Reciprocal = _mm_rcp_ps(X);
+    vConstants = _mm_mul_ps(Y, Reciprocal);
     vConstants = XMVectorATanEst(vConstants);
     // Discard entries that have been declared void
+
+    XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive );
+    vConstants = _mm_add_ps( vConstants, R3 );
+
     vResult = XMVectorSelect(vResult,vConstants,ATanResultValid);
     return vResult;
 #else // _XM_VMX128_INTRINSICS_
@@ -12777,13 +12782,13 @@ XMFINLINE _XMUICO4& _XMUICO4::operator=
 
 XMFINLINE _XMCOLOR::_XMCOLOR
 (
-    FLOAT _x,
-    FLOAT _y,
-    FLOAT _z,
-    FLOAT _w
+    FLOAT _r,
+    FLOAT _g,
+    FLOAT _b,
+    FLOAT _a
 )
 {
-    XMStoreColor(this, XMVectorSet(_x, _y, _z, _w));
+    XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
 }
 
 //------------------------------------------------------------------------------