mirror of https://github.com/microsoft/DirectXMath (synced 2024-11-09 14:10:09 +00:00)

XNAMath 2.02

commit 9a1e0b5318
parent 9521debcc7

@@ -22,7 +22,7 @@ Abstract:
#error XNAMATH and XBOXMATH are incompatible in the same compilation module. Use one or the other.
#endif

#define XNAMATH_VERSION 201
#define XNAMATH_VERSION 202

#if !defined(_XM_X64_) && !defined(_XM_X86_)
#if defined(_M_AMD64) || defined(_AMD64_)
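
The version macro above moves from 201 to 202, so client code can detect the 2.02 headers at compile time. A minimal sketch (only XNAMATH_VERSION comes from the header; the check itself is illustrative):

#include "xnamath.h"

#if XNAMATH_VERSION >= 202
// Safe to rely on the 2.02 behavior shown in the hunks below.
#else
#error XNAMath 2.02 or later is required.
#endif
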
@@ -32,6 +32,16 @@ Abstract:
#endif
#endif

#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_)
#if defined(_XM_X64_) || defined(_XM_X86_)
#define _XM_LITTLEENDIAN_
#elif defined(_XBOX_VER)
#define _XM_BIGENDIAN_
#else
#error xnamath.h only supports x86, x64, or XBox 360 targets
#endif
#endif

#if defined(_XM_X86_) || defined(_XM_X64_)
#define _XM_SSE_INTRINSICS_
#if !defined(__cplusplus) && !defined(_XM_NO_INTRINSICS_)
@@ -174,7 +184,7 @@ XMFINLINE FLOAT XMConvertToDegrees(FLOAT fRadians) { return fRadians * (180.0f /
****************************************************************************/

#pragma warning(push)
#pragma warning(disable:4201)
#pragma warning(disable:4201 4365)

#if !defined (_XM_X86_) && !defined(_XM_X64_)
#pragma bitfield_order(push)
@@ -278,9 +288,9 @@ typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU32 {
} XMVECTORU32;

// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86 and Xbox 360, but not for other targets
#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINISCS_)
#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
typedef const XMVECTOR FXMVECTOR;
#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINISCS_)
#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINSICS_)
typedef const XMVECTOR FXMVECTOR;
#elif defined(__cplusplus)
typedef const XMVECTOR& FXMVECTOR;
@@ -289,7 +299,7 @@ typedef const XMVECTOR FXMVECTOR;
#endif

// Fix-up for (4th+) XMVECTOR parameters to pass in-register for Xbox 360 and by reference otherwise
#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINISCS_)
#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
typedef const XMVECTOR CXMVECTOR;
#elif defined(__cplusplus)
typedef const XMVECTOR& CXMVECTOR;
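
The two hunks above also correct the misspelled guard _XM_NO_INTRINISCS_ to _XM_NO_INTRINSICS_, so the pass-in-register typedefs actually take effect when intrinsics are enabled. As the comments describe, the convention is to declare the first three XMVECTOR parameters as FXMVECTOR and any later ones as CXMVECTOR. A hedged sketch of a user-defined function following that convention (the function and its body are hypothetical, not part of the library):

XMVECTOR BlendFour(FXMVECTOR A, FXMVECTOR B, FXMVECTOR C, CXMVECTOR D)
{
    // A, B and C may arrive in registers on x86/Xbox 360; D is passed by
    // reference on targets that cannot pass a fourth vector in registers.
    XMVECTOR AB = XMVectorAdd(A, B);
    XMVECTOR CD = XMVectorAdd(C, D);
    return XMVectorMultiply(AB, CD);
}
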
@@ -354,11 +364,11 @@ typedef _DECLSPEC_ALIGN_16_ struct _XMMATRIX
FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }

_XMMATRIX& operator= (CONST _XMMATRIX&);
_XMMATRIX& operator= (CONST _XMMATRIX& M);

#ifndef XM_NO_OPERATOR_OVERLOADS
_XMMATRIX& operator*= (CONST _XMMATRIX&);
_XMMATRIX operator* (CONST _XMMATRIX&) CONST;
_XMMATRIX& operator*= (CONST _XMMATRIX& M);
_XMMATRIX operator* (CONST _XMMATRIX& M) CONST;
#endif // !XM_NO_OPERATOR_OVERLOADS

#endif // __cplusplus
@@ -1500,7 +1510,7 @@ typedef struct _XMCOLOR

_XMCOLOR() {};
_XMCOLOR(UINT Color) : c(Color) {};
_XMCOLOR(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
_XMCOLOR(FLOAT _r, FLOAT _g, FLOAT _b, FLOAT _a);
_XMCOLOR(CONST FLOAT *pArray);

operator UINT () { return c; }
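
The float constructor of _XMCOLOR is renamed from _x/_y/_z/_w to _r/_g/_b/_a; the signature and packing are unchanged, the names now simply say what the components mean. A small usage sketch (values are arbitrary; XMCOLOR packs the four floats into one UINT in ARGB order, as with D3DCOLOR):

XMCOLOR Orange(1.0f, 0.5f, 0.0f, 1.0f);   // r, g, b, a in [0, 1]
UINT Packed = Orange;                     // via operator UINT
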
@@ -2582,9 +2592,6 @@ XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f};
XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f};
XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f};
XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f};

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

XMGLOBALCONST XMVECTORI32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
XMGLOBALCONST XMVECTORI32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000};
XMGLOBALCONST XMVECTORI32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
@@ -2638,8 +2645,6 @@ XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f};
XMGLOBALCONST XMVECTORI32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1};
XMGLOBALCONST XMVECTORI32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0};
XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD};

#ifdef _XM_NO_INTRINSICS_
XMGLOBALCONST XMVECTORI32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0};
XMGLOBALCONST XMVECTORI32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0};
XMGLOBALCONST XMVECTORI32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0};
@@ -2650,9 +2655,6 @@ XMGLOBALCONST XMVECTORI32 g_XMSwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z,
XMGLOBALCONST XMVECTORI32 g_XMSwizzleZXYW = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
XMGLOBALCONST XMVECTORI32 g_XMPermute0X0Y1X1Y = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y};
XMGLOBALCONST XMVECTORI32 g_XMPermute0Z0W1Z1W = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_1W};
#endif // !_XM_NO_INTRINSICS_

#ifdef _XM_SSE_INTRINSICS_
XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f};
XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f};
XMGLOBALCONST XMVECTORI32 g_XMFlipY = {0,0x80000000,0,0};
@@ -2685,9 +2687,6 @@ XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.
XMGLOBALCONST XMVECTORI32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000};
XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000};
XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0};
#endif

#endif // _XM_NO_INTRINSICS_

/****************************************************************************
*
@@ -2696,7 +2695,7 @@ XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0
****************************************************************************/

#pragma warning(push)
#pragma warning(disable:4214 4204 4616 6001)
#pragma warning(disable:4214 4204 4365 4616 6001)

#if !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_)

@@ -2861,10 +2860,10 @@ XMFINLINE XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateE

//------------------------------------------------------------------------------

#include <xnamathconvert.inl>
#include <xnamathvector.inl>
#include <xnamathmatrix.inl>
#include <xnamathmisc.inl>
#include "xnamathconvert.inl"
#include "xnamathvector.inl"
#include "xnamathmatrix.inl"
#include "xnamathmisc.inl"

#pragma warning(pop)
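
The .inl includes switch from angle brackets to quotes. With the quoted form the compiler searches the directory of xnamath.h itself first, so the five headers can sit together in a project tree without that directory being on the system include path. A minimal, assumed project layout:

// xnamath.h, xnamathconvert.inl, xnamathvector.inl, xnamathmatrix.inl and
// xnamathmisc.inl live side by side; user code only includes the main header.
#include "xnamath.h"
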
@@ -431,8 +431,8 @@ XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);
XMASSERT(((UINT_PTR)pSource & 3) == 0);
__m128i V = _mm_set_epi32( 0, 0, 0, *pSource );
return reinterpret_cast<__m128 *>(&V)[0];

return _mm_load_ss( (const float*)pSource );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@@ -480,9 +480,10 @@ XMFINLINE XMVECTOR XMLoadInt2
#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pSource);
__m128i V = _mm_set_epi32( 0, 0, *(pSource+1), *pSource );
return reinterpret_cast<__m128 *>(&V)[0];

__m128 x = _mm_load_ss( (const float*)pSource );
__m128 y = _mm_load_ss( (const float*)(pSource+1) );
return _mm_unpacklo_ps( x, y );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
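
XMLoadInt and XMLoadInt2 now build the vector with _mm_load_ss instead of _mm_set_epi32. A hedged sketch of what the two-element path produces (Source is a hypothetical pointer to two UINTs):

// x = { Source[0], 0, 0, 0 } and y = { Source[1], 0, 0, 0 } after _mm_load_ss;
// _mm_unpacklo_ps interleaves the low lanes, giving { Source[0], Source[1], 0, 0 }.
__m128 x = _mm_load_ss( (const float*)Source );
__m128 y = _mm_load_ss( (const float*)(Source+1) );
__m128 V = _mm_unpacklo_ps( x, y );
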
@@ -509,6 +510,8 @@ XMFINLINE XMVECTOR XMLoadInt2A
#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pSource);
XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

__m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
return reinterpret_cast<__m128 *>(&V)[0];

@@ -526,20 +529,16 @@ XMFINLINE XMVECTOR XMLoadFloat2
#if defined(_XM_NO_INTRINSICS_)
XMVECTOR V;
XMASSERT(pSource);

((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
V.vector4_f32[2] = V.vector4_f32[3] = 0.0f;
return V;
#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);
#ifdef _XM_X86_

__m128 x = _mm_load_ss( &pSource->x );
__m128 y = _mm_load_ss( &pSource->y );
return _mm_unpacklo_ps( x, y );
#else // _XM_X64_
// This reads 2 floats past the memory that should be ignored.
return _mm_loadu_ps( &pSource->x );
#endif
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@@ -565,14 +564,10 @@ XMFINLINE XMVECTOR XMLoadFloat2A

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);
#ifdef _XM_X86_
__m128 x = _mm_load_ss( &pSource->x );
__m128 y = _mm_load_ss( &pSource->y );
return _mm_unpacklo_ps( x, y );
#else // _XM_X64_
// This reads 2 floats past the memory that should be ignored.
return _mm_load_ps( &pSource->x );
#endif
XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

__m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -792,8 +787,17 @@ XMFINLINE XMVECTOR XMLoadInt3

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);

#ifdef _XM_ISVS2005_
__m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
return reinterpret_cast<__m128 *>(&V)[0];
#else
__m128 x = _mm_load_ss( (const float*)pSource );
__m128 y = _mm_load_ss( (const float*)(pSource+1) );
__m128 z = _mm_load_ss( (const float*)(pSource+2) );
__m128 xy = _mm_unpacklo_ps( x, y );
return _mm_movelh_ps( xy, z );
#endif // !_XM_ISVS2005_
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@@ -839,15 +843,26 @@ XMFINLINE XMVECTOR XMLoadFloat3
#if defined(_XM_NO_INTRINSICS_)
XMVECTOR V;
XMASSERT(pSource);

((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
V.vector4_f32[3] = 0.0f;
return V;
#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);

#ifdef _XM_ISVS2005_
// This reads 1 floats past the memory that should be ignored.
// Need to continue to do this for VS 2005 due to compiler issue but prefer new method
// to avoid triggering issues with memory debug tools (like AV)
return _mm_loadu_ps( &pSource->x );
#else
__m128 x = _mm_load_ss( &pSource->x );
__m128 y = _mm_load_ss( &pSource->y );
__m128 z = _mm_load_ss( &pSource->z );
__m128 xy = _mm_unpacklo_ps( x, y );
return _mm_movelh_ps( xy, z );
#endif // !_XM_ISVS2005_
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
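
On compilers other than VS 2005, XMLoadFloat3 now assembles the vector from three scalar loads rather than one unaligned 16-byte load, so it no longer reads four bytes past the XMFLOAT3. A hedged example of the case that used to upset memory-debugging tools (array and names are illustrative):

// Loading the last element of a tightly packed array: the old _mm_loadu_ps
// path touched memory just past the allocation, the new path does not.
XMFLOAT3 Positions[16];
XMVECTOR Last = XMLoadFloat3( &Positions[15] );
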
@@ -874,10 +889,10 @@ XMFINLINE XMVECTOR XMLoadFloat3A

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);
XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

// This reads 1 floats past the memory that should be ignored.

return _mm_load_ps( &pSource->x );
// This reads 1 floats past the memory that should be ignored.
return _mm_load_ps( &pSource->x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -1298,9 +1313,9 @@ XMFINLINE XMVECTOR XMLoadFloat3PK
CONST XMFLOAT3PK* pSource
)
{
_DECLSPEC_ALIGN_16_ UINT Result[4];
UINT Mantissa;
UINT Exponent;
UINT Result[3];

XMASSERT(pSource);

@@ -1406,7 +1421,7 @@ XMFINLINE XMVECTOR XMLoadFloat3PK
Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
}

return XMLoadFloat3( (XMFLOAT3*)&Result );
return XMLoadFloat3A( (XMFLOAT3A*)&Result );
}

//------------------------------------------------------------------------------
@@ -1416,9 +1431,9 @@ XMFINLINE XMVECTOR XMLoadFloat3SE
CONST XMFLOAT3SE* pSource
)
{
_DECLSPEC_ALIGN_16_ UINT Result[4];
UINT Mantissa;
UINT Exponent, ExpBits;
UINT Result[3];

XMASSERT(pSource);

@@ -1515,7 +1530,7 @@ XMFINLINE XMVECTOR XMLoadFloat3SE
Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
}

return XMLoadFloat3( (XMFLOAT3*)&Result );
return XMLoadFloat3A( (XMFLOAT3A*)&Result );
}

//------------------------------------------------------------------------------
@@ -1541,6 +1556,7 @@ XMFINLINE XMVECTOR XMLoadInt4
#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pSource);

__m128i V = _mm_loadu_si128( (const __m128i*)pSource );
return reinterpret_cast<__m128 *>(&V)[0];

@@ -1577,7 +1593,6 @@ XMFINLINE XMVECTOR XMLoadInt4A
__m128i V = _mm_load_si128( (const __m128i*)pSource );
return reinterpret_cast<__m128 *>(&V)[0];


#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -1592,6 +1607,7 @@ XMFINLINE XMVECTOR XMLoadFloat4
#if defined(_XM_NO_INTRINSICS_)
XMVECTOR V;
XMASSERT(pSource);

((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
@@ -1599,6 +1615,7 @@ XMFINLINE XMVECTOR XMLoadFloat4
return V;
#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pSource);

return _mm_loadu_ps( &pSource->x );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
@@ -3055,13 +3072,18 @@ XMFINLINE VOID XMStoreInt
FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
#if defined(_XM_NO_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);

*pDestination = XMVectorGetIntX( V );

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);

_mm_store_ss( (float*)pDestination, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -3074,13 +3096,18 @@ XMFINLINE VOID XMStoreFloat
FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
#if defined(_XM_NO_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);

*pDestination = XMVectorGetX( V );

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);

_mm_store_ss( pDestination, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -3102,12 +3129,12 @@ XMFINLINE VOID XMStoreInt2
pDestination[1] = V.vector4_u32[1];

#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);
pDestination[0] = XMVectorGetIntX( V );
pDestination[1] = XMVectorGetIntY( V );

XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( (float*)&pDestination[0], V );
_mm_store_ss( (float*)&pDestination[1], T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -3131,7 +3158,7 @@ XMFINLINE VOID XMStoreInt2A
#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);
XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

_mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

@@ -3156,14 +3183,12 @@ XMFINLINE VOID XMStoreFloat2
pDestination->y = V.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);

XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T );

XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
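
XMStoreInt2 switches from per-component extraction to the same shuffle-plus-_mm_store_ss pattern already used by XMStoreFloat2, so exactly eight bytes are written and nothing past the destination is touched. A hedged sketch of the lane movement (V is an assumed XMVECTOR holding { x, y, z, w }):

XMFLOAT2 Out;
XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); // { y, y, y, y }
_mm_store_ss( &Out.x, V );   // writes only lane 0: x
_mm_store_ss( &Out.y, T );   // writes only lane 0: y
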
@@ -3187,11 +3212,9 @@ XMFINLINE VOID XMStoreFloat2A
#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);
XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T );
_mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@@ -3391,9 +3414,12 @@ XMFINLINE VOID XMStoreInt3

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);
pDestination[0] = XMVectorGetIntX( V );
pDestination[1] = XMVectorGetIntY( V );
pDestination[2] = XMVectorGetIntZ( V );

XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( (float*)pDestination, V );
_mm_store_ss( (float*)&pDestination[1], T1 );
_mm_store_ss( (float*)&pDestination[2], T2 );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@@ -3419,10 +3445,11 @@ XMFINLINE VOID XMStoreInt3A
#elif defined(_XM_SSE_INTRINSICS_)

XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);
pDestination[0] = XMVectorGetIntX( V );
pDestination[1] = XMVectorGetIntY( V );
pDestination[2] = XMVectorGetIntZ( V );
XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
_mm_store_ss( (float*)&pDestination[2], T );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@@ -3450,11 +3477,11 @@ XMFINLINE VOID XMStoreFloat3
XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 3) == 0);

XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T1 );
_mm_store_ss( &pDestination->z, T2 );
XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T1 );
_mm_store_ss( &pDestination->z, T2 );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@@ -3482,11 +3509,9 @@ XMFINLINE VOID XMStoreFloat3A
XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

XMVECTOR T1 = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T2 = _mm_unpackhi_ps( V, V );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T1 );
_mm_store_ss( &pDestination->z, T2 );
XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
_mm_store_ss( &pDestination->z, T );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@@ -3918,13 +3943,13 @@ XMFINLINE VOID XMStoreFloat3PK
FXMVECTOR V
)
{
_DECLSPEC_ALIGN_16_ UINT IValue[4];
UINT I, Sign, j;
UINT IValue[3];
UINT Result[3];

XMASSERT(pDestination);

XMStoreFloat3( (XMFLOAT3*)&IValue, V );
XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );

// X & Y Channels (5-bit exponent, 6-bit mantissa)
for(j=0; j < 2; ++j)
@@ -4036,14 +4061,15 @@ XMFINLINE VOID XMStoreFloat3SE
FXMVECTOR V
)
{
_DECLSPEC_ALIGN_16_ UINT IValue[4];
UINT I, Sign, j, T;
UINT IValue[3];
UINT Frac[3];
UINT Exp[3];


XMASSERT(pDestination);

XMStoreFloat3( (XMFLOAT3*)&IValue, V );
XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );

// X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
for(j=0; j < 3; ++j)
@@ -4131,7 +4157,7 @@ XMFINLINE VOID XMStoreInt4

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);


_mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
@@ -4158,6 +4184,7 @@ XMFINLINE VOID XMStoreInt4A

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);
XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

_mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

@@ -4176,7 +4203,8 @@ XMFINLINE VOID XMStoreInt4NC
#if defined(_XM_NO_INTRINSICS_)

XMASSERT(pDestination);

XMASSERT(((UINT_PTR)pDestination & 3) == 0);

pDestination[0] = V.vector4_u32[0];
pDestination[1] = V.vector4_u32[1];
pDestination[2] = V.vector4_u32[2];
@@ -4184,7 +4212,8 @@ XMFINLINE VOID XMStoreInt4NC

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);

XMASSERT(((UINT_PTR)pDestination & 3) == 0);

_mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
@@ -4202,7 +4231,7 @@ XMFINLINE VOID XMStoreFloat4
#if defined(_XM_NO_INTRINSICS_)

XMASSERT(pDestination);


pDestination->x = V.vector4_f32[0];
pDestination->y = V.vector4_f32[1];
pDestination->z = V.vector4_f32[2];
@@ -4210,7 +4239,7 @@ XMFINLINE VOID XMStoreFloat4

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);


_mm_storeu_ps( &pDestination->x, V );

#else // _XM_VMX128_INTRINSICS_
@@ -4255,7 +4284,8 @@ XMFINLINE VOID XMStoreFloat4NC
#if defined(_XM_NO_INTRINSICS_)

XMASSERT(pDestination);

XMASSERT(((UINT_PTR)pDestination & 3) == 0);

pDestination->x = V.vector4_f32[0];
pDestination->y = V.vector4_f32[1];
pDestination->z = V.vector4_f32[2];
@@ -4263,7 +4293,8 @@ XMFINLINE VOID XMStoreFloat4NC

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);

XMASSERT(((UINT_PTR)pDestination & 3) == 0);

_mm_storeu_ps( &pDestination->x, V );

#else // _XM_VMX128_INTRINSICS_
@@ -5452,13 +5483,13 @@ XMFINLINE VOID XMStoreColor
// Convert to 0-255
vResult = _mm_mul_ps(vResult,Scale);
// Shuffle RGBA to ARGB
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,1,0,3));
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
// Convert to int
__m128i vInt = _mm_cvtps_epi32(vResult);
// Mash to shorts
vInt = _mm_packs_epi32(vInt,vInt);
// Mash to bytes
vInt = _mm_packs_epi16(vInt,vInt);
vInt = _mm_packus_epi16(vInt,vInt);
// Store the color
_mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
#else // _XM_VMX128_INTRINSICS_
@@ -5698,12 +5729,12 @@ XMFINLINE VOID XMStoreFloat4x4A
pDestination->m[3][3] = M.r[3].vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(pDestination);
XMASSERT(pDestination);

_mm_store_ps( &pDestination->_11, M.r[0] );
_mm_store_ps( &pDestination->_21, M.r[1] );
_mm_store_ps( &pDestination->_31, M.r[2] );
_mm_store_ps( &pDestination->_41, M.r[3] );
_mm_store_ps( &pDestination->_11, M.r[0] );
_mm_store_ps( &pDestination->_21, M.r[1] );
_mm_store_ps( &pDestination->_31, M.r[2] );
_mm_store_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@@ -2310,7 +2310,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveRH
XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
(
FLOAT FovAngleY,
FLOAT AspectHByW,
FLOAT AspectRatio,
FLOAT NearZ,
FLOAT FarZ
)
@@ -2324,13 +2324,13 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
XMMATRIX M;

XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);

Height = CosFov / SinFov;
Width = Height / AspectHByW;
Width = Height / AspectRatio;

M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f);
M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f);
@@ -2341,7 +2341,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
XMMATRIX M;
FLOAT SinFov;
@@ -2351,7 +2351,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
// Note: This is recorded on the stack
FLOAT Height = CosFov / SinFov;
XMVECTOR rMem = {
Height / AspectHByW,
Height / AspectRatio,
Height,
fRange,
-fRange * NearZ
@@ -2363,7 +2363,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
vTemp = _mm_move_ss(vTemp,vValues);
// CosFov / SinFov,0,0,0
M.r[0] = vTemp;
// 0,Height / AspectHByW,0,0
// 0,Height / AspectRatio,0,0
vTemp = vValues;
vTemp = _mm_and_ps(vTemp,g_XMMaskY);
M.r[1] = vTemp;
@@ -2386,7 +2386,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
(
FLOAT FovAngleY,
FLOAT AspectHByW,
FLOAT AspectRatio,
FLOAT NearZ,
FLOAT FarZ
)
@@ -2400,13 +2400,13 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
XMMATRIX M;

XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);

Height = CosFov / SinFov;
Width = Height / AspectHByW;
Width = Height / AspectRatio;

M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f);
M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f);
@@ -2417,7 +2417,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH

#elif defined(_XM_SSE_INTRINSICS_)
XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
XMMATRIX M;
FLOAT SinFov;
@@ -2427,7 +2427,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
// Note: This is recorded on the stack
FLOAT Height = CosFov / SinFov;
XMVECTOR rMem = {
Height / AspectHByW,
Height / AspectRatio,
Height,
fRange,
fRange * NearZ
@@ -2439,7 +2439,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
vTemp = _mm_move_ss(vTemp,vValues);
// CosFov / SinFov,0,0,0
M.r[0] = vTemp;
// 0,Height / AspectHByW,0,0
// 0,Height / AspectRatio,0,0
vTemp = vValues;
vTemp = _mm_and_ps(vTemp,g_XMMaskY);
M.r[1] = vTemp;
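
XMMatrixPerspectiveFovLH and XMMatrixPerspectiveFovRH rename their second parameter from AspectHByW to AspectRatio; the math (Width = Height / AspectRatio) is unchanged, and the value expected is the usual view width divided by view height. A hedged usage sketch (viewport size and angles are illustrative):

FLOAT Fov    = XMConvertToRadians(60.0f);
FLOAT Aspect = 1280.0f / 720.0f;              // width / height
XMMATRIX Proj = XMMatrixPerspectiveFovLH(Fov, Aspect, 0.1f, 1000.0f);
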
@@ -693,7 +693,7 @@ XMFINLINE XMVECTOR XMQuaternionBaryCentric

s = f + g;

if (s < 0.00001f && s > -0.00001f)
if ((s < 0.00001f) && (s > -0.00001f))
{
Result = Q0;
}
@@ -932,28 +932,26 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

XMVECTOR Q0, Q1, Q2;
XMVECTOR M00, M11, M22;
XMVECTOR CQ0, CQ1, C;
XMVECTOR CX, CY, CZ, CW;
XMVECTOR SQ1, Scale;
XMVECTOR Rsq, Sqrt, VEqualsInfinity, VEqualsZero, Select;
XMVECTOR Rsq, Sqrt, VEqualsNaN;
XMVECTOR A, B, P;
XMVECTOR PermuteSplat, PermuteSplatT;
XMVECTOR SignB, SignBT;
XMVECTOR PermuteControl, PermuteControlT;
XMVECTOR Zero;
XMVECTOR Result;
static CONST XMVECTOR OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
static CONST XMVECTOR SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
static CONST XMVECTOR SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
static CONST XMVECTOR SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
static CONST XMVECTOR SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
static CONST XMVECTOR SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
static CONST XMVECTOR SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
static CONST XMVECTOR SignNNNX = {-1.0f, -1.0f, -1.0f, 2.0e-126f};
static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
@@ -972,26 +970,23 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
M11 = XMVectorSplatY(M.r[1]);
M22 = XMVectorSplatZ(M.r[2]);

Q0 = XMVectorMultiply(SignPNNP, M00);
Q0 = XMVectorMultiplyAdd(SignNPNP, M11, Q0);
Q0 = XMVectorMultiplyAdd(SignNNPP, M22, Q0);
Q0 = XMVectorMultiply(SignPNNP.v, M00);
Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0);
Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0);

Q1 = XMVectorAdd(Q0, g_XMOne.v);

Rsq = XMVectorReciprocalSqrt(Q1);
Zero = XMVectorZero();
VEqualsInfinity = XMVectorEqualInt(Q1, g_XMInfinity.v);
VEqualsZero = XMVectorEqual(Q1, Zero);
VEqualsNaN = XMVectorIsNaN(Rsq);
Sqrt = XMVectorMultiply(Q1, Rsq);
Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
Q1 = XMVectorSelect(Q1, Sqrt, Select);
Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN);

Q1 = XMVectorMultiply(Q1, g_XMOneHalf.v);

SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v);

CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v);
CQ1 = XMVectorPermute(Q0, SignNNNX, Permute0Y0Z0Z1W.v);
CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v);
C = XMVectorGreaterOrEqual(CQ0, CQ1);

CX = XMVectorSplatX(C);
@@ -1000,15 +995,15 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
CW = XMVectorSplatW(C);

PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ);
SignB = XMVectorSelect(SignNPPP, SignPPNP, CZ);
SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ);
PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ);

PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX);
SignB = XMVectorSelect(SignB, SignNPPP, CX);
SignB = XMVectorSelect(SignB, SignNPPP.v, CX);
PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX);

PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY);
SignBT = XMVectorSelect(SignB, SignPNPP, CY);
SignBT = XMVectorSelect(SignB, SignPNPP.v, CY);
PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY);

PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
@@ -1016,7 +1011,7 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix
PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);

PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW);
SignB = XMVectorSelect(SignB, SignNNNX, CW);
SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW);
PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW);

Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);
@@ -1032,104 +1027,6 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix

return Result;

#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR Q0, Q1, Q2;
XMVECTOR M00, M11, M22;
XMVECTOR CQ0, CQ1, C;
XMVECTOR CX, CY, CZ, CW;
XMVECTOR SQ1, Scale;
XMVECTOR Rsq, Sqrt, VEqualsInfinity, VEqualsZero, Select;
XMVECTOR A, B, P;
XMVECTOR PermuteSplat, PermuteSplatT;
XMVECTOR SignB, SignBT;
XMVECTOR PermuteControl, PermuteControlT;
XMVECTOR Zero;
XMVECTOR Result;
static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
static CONST XMVECTORF32 SignNNNX = {-1.0f, -1.0f, -1.0f, 2.0e-126f};
static CONST XMVECTORI32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
static CONST XMVECTORI32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
static CONST XMVECTORI32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
static CONST XMVECTORI32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
static CONST XMVECTORI32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z};
static CONST XMVECTORI32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W};
static CONST XMVECTORI32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y};
static CONST XMVECTORI32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W};
static CONST XMVECTORI32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W};
static CONST XMVECTORI32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y};
static CONST XMVECTORI32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z};
static CONST XMVECTORI32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X};
static CONST XMVECTORI32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W};

M00 = XMVectorSplatX(M.r[0]);
M11 = XMVectorSplatY(M.r[1]);
M22 = XMVectorSplatZ(M.r[2]);

Q0 = XMVectorMultiply(SignPNNP, M00);
Q0 = XMVectorMultiplyAdd(SignNPNP, M11, Q0);
Q0 = XMVectorMultiplyAdd(SignNNPP, M22, Q0);

Q1 = XMVectorAdd(Q0, g_XMOne);

Rsq = XMVectorReciprocalSqrt(Q1);
Zero = XMVectorZero();
VEqualsInfinity = XMVectorEqualInt(Q1, g_XMInfinity);
VEqualsZero = XMVectorEqual(Q1, Zero);
Sqrt = XMVectorMultiply(Q1, Rsq);
Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
Q1 = XMVectorSelect(Q1, Sqrt, Select);

Q1 = XMVectorMultiply(Q1, g_XMOneHalf);

SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf);

CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W);
CQ1 = XMVectorPermute(Q0, SignNNNX, Permute0Y0Z0Z1W);
C = XMVectorGreaterOrEqual(CQ0, CQ1);

CX = XMVectorSplatX(C);
CY = XMVectorSplatY(C);
CZ = XMVectorSplatZ(C);
CW = XMVectorSplatW(C);

PermuteSplat = XMVectorSelect(SplatZ, SplatY, CZ);
SignB = XMVectorSelect(SignNPPP, SignPPNP, CZ);
PermuteControl = XMVectorSelect(Permute2, Permute1, CZ);

PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ, CX);
SignB = XMVectorSelect(SignB, SignNPPP, CX);
PermuteControl = XMVectorSelect(PermuteControl, Permute2, CX);

PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX, CY);
SignBT = XMVectorSelect(SignB, SignPNPP, CY);
PermuteControlT = XMVectorSelect(PermuteControl,Permute0, CY);

PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
SignB = XMVectorSelect(SignB, SignBT, CX);
PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);

PermuteSplat = XMVectorSelect(PermuteSplat,SplatW, CW);
SignB = XMVectorSelect(SignB, SignNNNX, CW);
PermuteControl = XMVectorSelect(PermuteControl,Permute3, CW);

Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);

P = XMVectorPermute(M.r[1], M.r[2],PermuteC); // {M10, M12, M20, M21}
A = XMVectorPermute(M.r[0], P, PermuteA); // {M01, M12, M20, M03}
B = XMVectorPermute(M.r[0], P, PermuteB); // {M10, M21, M02, M03}

Q2 = XMVectorMultiplyAdd(SignB, B, A);
Q2 = XMVectorMultiply(Q2, Scale);

Result = XMVectorPermute(Q1, Q2, PermuteControl);

return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
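
XMQuaternionRotationMatrix now shares one code path for the no-intrinsics and SSE builds (the separate SSE block is deleted), and the square-root fix-up selects on XMVectorIsNaN(Rsq) instead of comparing against zero and infinity. A hedged round-trip sketch (the rotation angles are arbitrary):

XMMATRIX R  = XMMatrixRotationRollPitchYaw(0.3f, 1.1f, -0.5f);
XMVECTOR Q  = XMQuaternionRotationMatrix(R);   // matrix -> quaternion
XMMATRIX R2 = XMMatrixRotationQuaternion(Q);   // back again; same rotation as R
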
@@ -1256,7 +1256,7 @@ XMFINLINE XMVECTOR XMVectorPermute
++pControl;
VectorIndex = (uIndex>>4)&1;
uIndex &= 0x0F;
#if defined(_XM_X86_) || defined(_XM_X64_)
#if defined(_XM_LITTLEENDIAN_)
uIndex ^= 3; // Swap byte ordering on little endian machines
#endif
pWork[0] = aByte[VectorIndex][uIndex];
@@ -4415,22 +4415,22 @@ XMINLINE XMVECTOR XMVectorATan2
// Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:

// Y == 0 and X is Negative -> Pi with the sign of Y
// Y == 0 and X is Positive -> 0 with the sign of Y
// y == 0 and x is positive -> 0 with the sign of y
// Y != 0 and X == 0 -> Pi / 2 with the sign of Y
// X == -Infinity and Finite Y > 0 -> Pi with the sign of Y
// X == +Infinity and Finite Y > 0 -> 0 with the sign of Y
// Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y)
// X == -Infinity and Finite Y -> Pi with the sign of Y
// X == +Infinity and Finite Y -> 0 with the sign of Y
// Y == Infinity and X is Finite -> Pi / 2 with the sign of Y
// Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
// Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
// TODO: Return Y / X if the result underflows

XMVECTOR Reciprocal;
XMVECTOR V;
XMVECTOR YSign;
XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour;
XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity, FiniteYGreaterZero;
XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity;
XMVECTOR ATanResultValid;
XMVECTOR R0, R1, R2, R3, R4, R5, R6, R7;
XMVECTOR R0, R1, R2, R3, R4, R5;
XMVECTOR Zero;
XMVECTOR Result;
static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
@@ -4449,8 +4449,6 @@ XMINLINE XMVECTOR XMVectorATan2
XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
YEqualsInfinity = XMVectorIsInfinite(Y);
XEqualsInfinity = XMVectorIsInfinite(X);
FiniteYGreaterZero = XMVectorGreater(Y, Zero);
FiniteYGreaterZero = XMVectorSelect(FiniteYGreaterZero, Zero, YEqualsInfinity);

YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
Pi = XMVectorOrInt(Pi, YSign);
@@ -4463,25 +4461,25 @@ XMINLINE XMVECTOR XMVectorATan2
R3 = XMVectorSelect(R2, R1, YEqualsZero);
R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
R6 = XMVectorSelect(R3, R5, YEqualsInfinity);
R7 = XMVectorSelect(R6, R1, FiniteYGreaterZero);
Result = XMVectorSelect(R6, R7, XEqualsInfinity);
Result = XMVectorSelect(R3, R5, YEqualsInfinity);
ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

Reciprocal = XMVectorReciprocal(X);
V = XMVectorMultiply(Y, Reciprocal);
R0 = XMVectorATan(V);

Result = XMVectorSelect(Result, R0, ATanResultValid);
R1 = XMVectorSelect( Pi, Zero, XIsPositive );
R2 = XMVectorAdd(R0, R1);

Result = XMVectorSelect(Result, R2, ATanResultValid);

return Result;

#elif defined(_XM_SSE_INTRINSICS_)
static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};

// Mask if Y>0 && Y!=INF
XMVECTOR FiniteYGreaterZero = _mm_cmpgt_ps(Y,g_XMZero);
XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
FiniteYGreaterZero = _mm_andnot_ps(YEqualsInfinity,FiniteYGreaterZero);
// Get the sign of (Y&0x80000000)
XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero);
// Get the sign bits of X
@@ -4489,10 +4487,10 @@ XMINLINE XMVECTOR XMVectorATan2
// Change them to masks
XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero);
// Get Pi
XMVECTOR R1 = _mm_load_ps1(&ATan2Constants.f[0]);
XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]);
// Copy the sign of Y
R1 = _mm_or_ps(R1,YSign);
R1 = XMVectorSelect(R1,YSign,XIsPositive);
Pi = _mm_or_ps(Pi,YSign);
XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive);
// Mask for X==0
XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero);
// Get Pi/2 with with sign of Y
@@ -4513,7 +4511,7 @@ XMINLINE XMVECTOR XMVectorATan2
vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity);

XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity);
vConstants = XMVectorSelect(vResult,R1,FiniteYGreaterZero);
vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity);
// At this point, any entry that's zero will get the result
// from XMVectorATan(), otherwise, return the failsafe value
vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity);
@@ -4523,6 +4521,10 @@ XMINLINE XMVECTOR XMVectorATan2
vConstants = _mm_div_ps(Y,X);
vConstants = XMVectorATan(vConstants);
// Discard entries that have been declared void

XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive );
vConstants = _mm_add_ps( vConstants, R3 );

vResult = XMVectorSelect(vResult,vConstants,ATanResultValid);
return vResult;
#else // _XM_VMX128_INTRINSICS_
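
The comment block at the top of XMVectorATan2 is the revised special-case table: the FiniteYGreaterZero handling is dropped, and a negative X now simply adds Pi (carrying Y's sign) to atan(y/x). A hedged spot-check of two documented cases (inputs chosen only for illustration):

XMVECTOR Y = XMVectorSet( 1.0f, 0.0f, 0.0f, 0.0f );
XMVECTOR X = XMVectorSet(-1.0f, 1.0f, 1.0f, 1.0f );
XMVECTOR R = XMVectorATan2(Y, X);
// Lane 0: y=1, x=-1 -> atan(-1) + Pi = 3*Pi/4.  Lane 1: y=0, x positive -> 0.
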
@@ -5139,9 +5141,9 @@ XMFINLINE XMVECTOR XMVectorATan2Est
XMVECTOR V;
XMVECTOR YSign;
XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour;
XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity, FiniteYGreaterZero;
XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity;
XMVECTOR ATanResultValid;
XMVECTOR R0, R1, R2, R3, R4, R5, R6, R7;
XMVECTOR R0, R1, R2, R3, R4, R5;
XMVECTOR Zero;
XMVECTOR Result;
static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
@@ -5160,8 +5162,6 @@ XMFINLINE XMVECTOR XMVectorATan2Est
XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
YEqualsInfinity = XMVectorIsInfinite(Y);
XEqualsInfinity = XMVectorIsInfinite(X);
FiniteYGreaterZero = XMVectorGreater(Y, Zero);
FiniteYGreaterZero = XMVectorSelect(FiniteYGreaterZero, Zero, YEqualsInfinity);

YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
Pi = XMVectorOrInt(Pi, YSign);
@@ -5174,25 +5174,25 @@ XMFINLINE XMVECTOR XMVectorATan2Est
R3 = XMVectorSelect(R2, R1, YEqualsZero);
R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
R6 = XMVectorSelect(R3, R5, YEqualsInfinity);
R7 = XMVectorSelect(R6, R1, FiniteYGreaterZero);
Result = XMVectorSelect(R6, R7, XEqualsInfinity);
Result = XMVectorSelect(R3, R5, YEqualsInfinity);
ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

Reciprocal = XMVectorReciprocalEst(X);
V = XMVectorMultiply(Y, Reciprocal);
R0 = XMVectorATanEst(V);

Result = XMVectorSelect(Result, R0, ATanResultValid);
R1 = XMVectorSelect( Pi, Zero, XIsPositive );
R2 = XMVectorAdd(R0, R1);

Result = XMVectorSelect(Result, R2, ATanResultValid);

return Result;

#elif defined(_XM_SSE_INTRINSICS_)
static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};

// Mask if Y>0 && Y!=INF
XMVECTOR FiniteYGreaterZero = _mm_cmpgt_ps(Y,g_XMZero);
XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
FiniteYGreaterZero = _mm_andnot_ps(YEqualsInfinity,FiniteYGreaterZero);
// Get the sign of (Y&0x80000000)
XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero);
// Get the sign bits of X
@@ -5200,10 +5200,10 @@ XMFINLINE XMVECTOR XMVectorATan2Est
// Change them to masks
XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero);
// Get Pi
XMVECTOR R1 = _mm_load_ps1(&ATan2Constants.f[0]);
XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]);
// Copy the sign of Y
R1 = _mm_or_ps(R1,YSign);
R1 = XMVectorSelect(R1,YSign,XIsPositive);
Pi = _mm_or_ps(Pi,YSign);
XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive);
// Mask for X==0
XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero);
// Get Pi/2 with with sign of Y
@@ -5224,16 +5224,21 @@ XMFINLINE XMVECTOR XMVectorATan2Est
vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity);

XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity);
vConstants = XMVectorSelect(vResult,R1,FiniteYGreaterZero);
vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity);
// At this point, any entry that's zero will get the result
// from XMVectorATan(), otherwise, return the failsafe value
vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity);
// Any entries not 0xFFFFFFFF, are considered precalculated
XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask);
// Let's do the ATan2 function
vConstants = _mm_div_ps(Y,X);
XMVECTOR Reciprocal = _mm_rcp_ps(X);
vConstants = _mm_mul_ps(Y, Reciprocal);
vConstants = XMVectorATanEst(vConstants);
// Discard entries that have been declared void

XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive );
vConstants = _mm_add_ps( vConstants, R3 );

vResult = XMVectorSelect(vResult,vConstants,ATanResultValid);
return vResult;
#else // _XM_VMX128_INTRINSICS_
@@ -12777,13 +12782,13 @@ XMFINLINE _XMUICO4& _XMUICO4::operator=

XMFINLINE _XMCOLOR::_XMCOLOR
(
FLOAT _x,
FLOAT _y,
FLOAT _z,
FLOAT _w
FLOAT _r,
FLOAT _g,
FLOAT _b,
FLOAT _a
)
{
XMStoreColor(this, XMVectorSet(_x, _y, _z, _w));
XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
}

//------------------------------------------------------------------------------