mirror of https://github.com/microsoft/DirectXMath synced 2024-11-08 21:50:09 +00:00

DirectXMath 3.03

Chuck Walbourn 2016-05-23 14:15:41 -07:00
parent 409c3a3646
commit fd7f30458d
6 changed files with 500 additions and 534 deletions

View File

@ -17,7 +17,7 @@
#error DirectX Math requires C++
#endif
#define DIRECTX_MATH_VERSION 302
#define DIRECTX_MATH_VERSION 303
#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_)
#if defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM)
@ -29,6 +29,8 @@
#endif
#endif // !_XM_BIGENDIAN_ && !_XM_LITTLEENDIAN_
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_M_IX86) || defined(_M_AMD64)
#define _XM_SSE_INTRINSICS_
@ -62,15 +64,7 @@
#endif
#endif
#ifdef _WIN32_WCE
inline float powf(float _X, float _Y) { return ((float)pow((double)_X, (double)_Y)); }
inline float logf(float _X) { return ((float)log((double)_X)); }
inline float tanf(float _X) { return ((float)tan((double)_X)); }
inline float atanf(float _X) { return ((float)atan((double)_X)); }
inline float sinhf(float _X) { return ((float)sinh((double)_X)); }
inline float coshf(float _X) { return ((float)cosh((double)_X)); }
inline float tanhf(float _X) { return ((float)tanh((double)_X)); }
#endif
#include <sal.h>
#include <assert.h>
@ -261,8 +255,8 @@ __declspec(align(16)) struct XMVECTORF32
inline operator XMVECTOR() const { return v; }
inline operator const float*() const { return f; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
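For context, _mm_castps_si128 and _mm_castps_pd reinterpret the same 128-bit register in place; unlike the old reinterpret_cast-through-memory form they generate no instructions and carry no pointer-aliasing concerns. A minimal sketch (assumed caller code, not part of this commit; SSE2 intrinsics):

__m128  v    = _mm_set1_ps(1.0f);        // 0x3F800000 in every lane
__m128i bits = _mm_castps_si128(v);      // the same 128 bits, viewed as int32 lanes
__m128  back = _mm_castsi128_ps(bits);   // round-trips to a value identical to v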
@ -276,8 +270,8 @@ __declspec(align(16)) struct XMVECTORI32
inline operator XMVECTOR() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
@ -291,8 +285,8 @@ __declspec(align(16)) struct XMVECTORU8
inline operator XMVECTOR() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
@ -306,8 +300,8 @@ __declspec(align(16)) struct XMVECTORU32
inline operator XMVECTOR() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
@ -350,6 +344,7 @@ struct XMMATRIX
__declspec(align(16)) struct XMMATRIX
#endif
{
#ifdef _XM_NO_INTRINSICS_
union
{
XMVECTOR r[4];
@ -362,6 +357,9 @@ __declspec(align(16)) struct XMMATRIX
};
float m[4][4];
};
#else
XMVECTOR r[4];
#endif
XMMATRIX() {}
XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, GXMVECTOR R3) { r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; }
@ -371,8 +369,10 @@ __declspec(align(16)) struct XMMATRIX
float m30, float m31, float m32, float m33);
explicit XMMATRIX(_In_reads_(16) const float *pArray);
#ifdef _XM_NO_INTRINSICS_
float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }
#endif
XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; }
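Because XMMATRIX now exposes the float m[4][4] members and operator() only when _XM_NO_INTRINSICS_ is defined, callers built with SSE/NEON reach individual elements through the row vectors or a stored XMFLOAT4X4. A sketch of both patterns (assumed caller code, with using namespace DirectX):

XMMATRIX M = XMMatrixIdentity();
// Option 1: spill the matrix to memory and index normally.
XMFLOAT4X4 tmp;
XMStoreFloat4x4(&tmp, M);
float m23 = tmp.m[2][3];
// Option 2: pull one component straight from a row vector.
float m23b = XMVectorGetW(M.r[2]);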
@ -403,7 +403,7 @@ struct XMFLOAT2
XMFLOAT2() {}
XMFLOAT2(float _x, float _y) : x(_x), y(_y) {}
XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {}
explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {}
XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; }
};
@ -413,7 +413,7 @@ __declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2
{
XMFLOAT2A() : XMFLOAT2() {}
XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {}
XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {}
explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {}
XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; }
};
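Marking the pointer constructors explicit removes the silent conversion from a raw float* to these types. A sketch of a call site that now needs an explicit construction (hypothetical caller code; SetOffset is not a library function):

void SetOffset(const XMFLOAT2& offset);   // hypothetical API taking an XMFLOAT2

float data[2] = { 1.0f, 2.0f };
// SetOffset(data);              // converted implicitly before this change
SetOffset(XMFLOAT2(data));       // the conversion must now be spelled out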
@ -455,7 +455,7 @@ struct XMFLOAT3
XMFLOAT3() {}
XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {}
XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; }
};
@ -465,7 +465,7 @@ __declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3
{
XMFLOAT3A() : XMFLOAT3() {}
XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {}
XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {}
explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {}
XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; }
};
@ -482,7 +482,7 @@ struct XMINT3
XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {}
explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
XMINT3& operator= (const XMINT3& Int3) { x = Int3.x; y = Int3.y; z = Int3.z; return *this; }
XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; }
};
// 3D Vector; 32 bit unsigned integer components
@ -496,7 +496,7 @@ struct XMUINT3
XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {}
explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
XMUINT3& operator= (const XMUINT3& UInt3) { x = UInt3.x; y = UInt3.y; z = UInt3.z; return *this; }
XMUINT3& operator= (const XMUINT3& u3) { x = u3.x; y = u3.y; z = u3.z; return *this; }
};
//------------------------------------------------------------------------------
@ -510,7 +510,7 @@ struct XMFLOAT4
XMFLOAT4() {}
XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {}
XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; }
};
@ -520,7 +520,7 @@ __declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4
{
XMFLOAT4A() : XMFLOAT4() {}
XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {}
XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {}
explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {}
XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; }
};
@ -1368,6 +1368,8 @@ template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; }
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
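XM_PERMUTE_PS wraps the single-source _mm_shuffle_ps pattern behind one macro, so every self-shuffle in the headers goes through a single definition. Usage sketch (assumed caller code):

__m128 v    = _mm_set_ps(3.f, 2.f, 1.f, 0.f);         // lanes: x=0, y=1, z=2, w=3
__m128 yyyy = XM_PERMUTE_PS(v, _MM_SHUFFLE(1,1,1,1)); // broadcast lane 1 -> {1,1,1,1}
__m128 wzyx = XM_PERMUTE_PS(v, _MM_SHUFFLE(0,1,2,3)); // reverse the lanes -> {3,2,1,0}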
// PermuteHelper internal template (SSE only)
namespace Internal
{
@ -1384,8 +1386,8 @@ namespace Internal
WhichW ? 0xFFFFFFFF : 0,
};
XMVECTOR shuffled1 = _mm_shuffle_ps(v1, v1, Shuffle);
XMVECTOR shuffled2 = _mm_shuffle_ps(v2, v2, Shuffle);
XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
@ -1397,13 +1399,13 @@ namespace Internal
// Fast path for permutes that only read from the first vector.
template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
{
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_shuffle_ps(v1, v1, Shuffle); }
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); }
};
// Fast path for permutes that only read from the second vector.
template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
{
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_shuffle_ps(v2, v2, Shuffle); }
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); }
};
// Fast path for permutes that read XY from the first vector, ZW from the second.
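For reference, XMVectorPermute selects each output component from eight inputs (indices 0-3 come from V1, 4-7 from V2); the specializations above are the fast paths when all four indices land in a single source. Usage sketch (assuming using namespace DirectX):

XMVECTOR a = XMVectorSet(0.f, 1.f, 2.f, 3.f);
XMVECTOR b = XMVectorSet(4.f, 5.f, 6.f, 7.f);
XMVECTOR r = XMVectorPermute<0, 5, 2, 7>(a, b);   // r = {0, 5, 2, 7}
XMVECTOR s = XMVectorPermute<3, 2, 1, 0>(a, b);   // all from a -> single XM_PERMUTE_PS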
@ -1488,7 +1490,7 @@ template<> inline XMVECTOR XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2)
template<> inline XMVECTOR XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); }
template<> inline XMVECTOR XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); }
#endif _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
//------------------------------------------------------------------------------
@ -1502,7 +1504,7 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
return _mm_shuffle_ps( V, V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
return __vpermwi(V, ((SwizzleX & 3) << 6) | ((SwizzleY & 3) << 4) | ((SwizzleZ & 3) << 2) | (SwizzleW & 3) );
#else
@ -1515,6 +1517,7 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
// Specialized swizzles
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
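XMVectorSwizzle is the single-vector counterpart; on SSE it lowers to the XM_PERMUTE_PS call above, and the <0,1,2,3> identity specialization simply returns V. Usage sketch (assuming using namespace DirectX):

XMVECTOR v = XMVectorSet(1.f, 2.f, 3.f, 4.f);
XMVECTOR r = XMVectorSwizzle<3, 2, 1, 0>(v);   // r = {4, 3, 2, 1}
XMVECTOR s = XMVectorSwizzle<0, 0, 0, 0>(v);   // splat x -> {1, 1, 1, 1}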
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
@ -1548,7 +1551,7 @@ template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); }
#endif _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
//------------------------------------------------------------------------------
@ -1760,7 +1763,7 @@ inline XMVECTOR XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2,
vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
// 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
vTemp = _mm_and_si128(vTemp,g_XMOne);
return reinterpret_cast<const __m128 *>(&vTemp)[0];
return _mm_castsi128_ps(vTemp);
#endif
}
@ -1799,7 +1802,7 @@ inline XMVECTOR XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent)
// Splat the scalar value (It's really a float)
vScale = _mm_set1_epi32(uScale);
// Multiply by the reciprocal (Perform a right shift by DivExponent)
vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
return vResult;
#endif
}
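The uScale expression builds 1.0f/(1 << DivExponent) directly in the float's exponent field: 0x3F800000 is the bit pattern of 1.0f, and subtracting DivExponent << 23 lowers the biased exponent by DivExponent. A worked example with DivExponent = 4:

uScale = 0x3F800000 - (4 << 23)
       = 0x3F800000 - 0x02000000
       = 0x3D800000          // the bit pattern of 0.0625f, i.e. 1.0f/16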
@ -1824,13 +1827,14 @@ inline XMVECTOR XMVectorSplatConstantInt(int32_t IntConstant)
}
// Implemented for VMX128 intrinsics as #defines above
#endif _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
#include "DirectXMathConvert.inl"
#include "DirectXMathVector.inl"
#include "DirectXMathMatrix.inl"
#include "DirectXMathMisc.inl"
#pragma prefast(pop)
#pragma warning(pop)

View File

@ -50,12 +50,12 @@ inline XMVECTOR XMConvertVectorIntToFloat
return vmulq_f32( vResult, vScale );
#else // _XM_SSE_INTRINSICS_
// Convert to floats
XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
// Convert DivExponent into 1.0f/(1<<DivExponent)
uint32_t uScale = 0x3F800000U - (DivExponent << 23);
// Splat the scalar value
__m128i vScale = _mm_set1_epi32(uScale);
vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
return vResult;
#endif
}
@ -108,7 +108,7 @@ inline XMVECTOR XMConvertVectorFloatToInt
__m128i vResulti = _mm_cvttps_epi32(vResult);
// If there was positive overflow, set to 0x7FFFFFFF
vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
return vOverflow;
#endif
@ -143,17 +143,17 @@ inline XMVECTOR XMConvertVectorUIntToFloat
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
// Convert DivExponent into 1.0f/(1<<DivExponent)
uint32_t uScale = 0x3F800000U - (DivExponent << 23);
// Splat
iMask = _mm_set1_epi32(uScale);
vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
return vResult;
#endif
}
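_mm_cvtepi32_ps only converts signed inputs, so this path clears the sign bit before converting and then adds 2^31 (g_XMFixUnsigned) back only in the lanes that had it set. A worked lane, assuming the input 0xFFFFFFFF:

0xFFFFFFFF ^ 0x80000000 = 0x7FFFFFFF   // force positive: 2147483647
converted as signed      ≈ 2147483648.0f   // rounded by the float conversion
+ 2147483648.0f fixup    ≈ 4294967296.0f   // back to the original unsigned magnitude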
@ -213,7 +213,7 @@ inline XMVECTOR XMConvertVectorFloatToUInt
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
return vResult;
@ -404,7 +404,7 @@ inline XMVECTOR XMLoadSInt2
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
__m128 V = _mm_unpacklo_ps( x, y );
return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
return _mm_cvtepi32_ps(_mm_castps_si128(V));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -439,11 +439,11 @@ inline XMVECTOR XMLoadUInt2
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(V,vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
@ -596,7 +596,7 @@ inline XMVECTOR XMLoadSInt3
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
__m128 xy = _mm_unpacklo_ps( x, y );
__m128 V = _mm_movelh_ps( xy, z );
return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
return _mm_cvtepi32_ps(_mm_castps_si128(V));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -634,11 +634,11 @@ inline XMVECTOR XMLoadUInt3
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(V,vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
return vResult;
@ -792,15 +792,15 @@ inline XMVECTOR XMLoadUInt4
__m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
// For the values that are higher than 0x7FFFFFFF, a fixup is needed
// Determine which ones need the fix.
XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero);
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
@ -934,7 +934,7 @@ inline XMMATRIX XMLoadFloat4x3
// vTemp2 = y2,z2,x2,x2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
// vTemp2 = x2,y2,z2,z2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
// vTemp1 = x1,y1,z1,0
vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
// vTemp2 = x2,y2,z2,0
@ -942,13 +942,13 @@ inline XMMATRIX XMLoadFloat4x3
// vTemp3 = x3,y3,z3,0
vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
// vTemp4i = x4,y4,z4,0
__m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
__m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
// vTemp4i = x4,y4,z4,1.0f
vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
XMMATRIX M(vTemp1,
vTemp2,
vTemp3,
reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
_mm_castsi128_ps(vTemp4i));
return M;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
@ -1016,7 +1016,7 @@ inline XMMATRIX XMLoadFloat4x3A
// vTemp2 = y2,z2,x2,x2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
// vTemp2 = x2,y2,z2,z2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
// vTemp1 = x1,y1,z1,0
vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
// vTemp2 = x2,y2,z2,0
@ -1024,13 +1024,13 @@ inline XMMATRIX XMLoadFloat4x3A
// vTemp3 = x3,y3,z3,0
vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
// vTemp4i = x4,y4,z4,0
__m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
__m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
// vTemp4i = x4,y4,z4,1.0f
vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
XMMATRIX M(vTemp1,
vTemp2,
vTemp3,
reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
_mm_castsi128_ps(vTemp4i));
return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@ -1195,7 +1195,7 @@ inline void XMStoreInt2
__n64 VL = vget_low_u32(V);
vst1_u32( pDestination, VL );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
#else // _XM_VMX128_INTRINSICS_
@ -1219,7 +1219,7 @@ inline void XMStoreInt2A
__n64 VL = vget_low_u32(V);
vst1_u32_ex( pDestination, VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1240,7 +1240,7 @@ inline void XMStoreFloat2
__n64 VL = vget_low_f32(V);
vst1_f32( reinterpret_cast<float*>(pDestination), VL );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
@ -1264,7 +1264,7 @@ inline void XMStoreFloat2A
__n64 VL = vget_low_f32(V);
vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1292,10 +1292,10 @@ inline void XMStoreSInt2
__m128i vResulti = _mm_cvttps_epi32(V);
// If there was positive overflow, set to 0x7FFFFFFF
XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
// Write two ints
XMVECTOR T = _mm_shuffle_ps( vOverflow, vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
@ -1333,11 +1333,11 @@ inline void XMStoreUInt2
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
// Write two uints
XMVECTOR T = _mm_shuffle_ps( vResult, vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
@ -1362,8 +1362,8 @@ inline void XMStoreInt3
vst1_u32( pDestination, VL );
vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(pDestination), V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
@ -1390,8 +1390,8 @@ inline void XMStoreInt3A
vst1_u32_ex( pDestination, VL, 64 );
vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@ -1415,8 +1415,8 @@ inline void XMStoreFloat3
vst1_f32( reinterpret_cast<float*>(pDestination), VL );
vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T1 );
_mm_store_ss( &pDestination->z, T2 );
@ -1443,8 +1443,8 @@ inline void XMStoreFloat3A
vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_ss( &pDestination->z, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@ -1475,11 +1475,11 @@ inline void XMStoreSInt3
__m128i vResulti = _mm_cvttps_epi32(V);
// If there was positive overflow, set to 0x7FFFFFFF
XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
// Write 3 uints
XMVECTOR T1 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
@ -1520,12 +1520,12 @@ inline void XMStoreUInt3
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
// Write 3 uints
XMVECTOR T1 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
@ -1550,7 +1550,7 @@ inline void XMStoreInt4
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_u32( pDestination, V );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1573,7 +1573,7 @@ inline void XMStoreInt4A
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_u32_ex( pDestination, V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_store_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1649,9 +1649,9 @@ inline void XMStoreSInt4
__m128i vResulti = _mm_cvttps_epi32(V);
// If there was positive overflow, set to 0x7FFFFFFF
XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&vOverflow)[0] );
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1688,10 +1688,10 @@ inline void XMStoreUInt4
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&vResult)[0] );
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1738,7 +1738,7 @@ inline void XMStoreFloat3x3
_mm_storeu_ps(&pDestination->m[0][0],vTemp1);
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
_mm_storeu_ps(&pDestination->m[1][1],vTemp2);
vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_

View File

@ -319,10 +319,10 @@ inline XMMATRIX XMMatrixMultiply
// Use vW to hold the original row
XMVECTOR vW = M1.r[0];
// Splat the component X,Y,Z then W
XMVECTOR vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
// Perform the operation on the first row
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
@ -335,10 +335,10 @@ inline XMMATRIX XMMatrixMultiply
mResult.r[0] = vX;
// Repeat for the other 3 rows
vW = M1.r[1];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -348,10 +348,10 @@ inline XMMATRIX XMMatrixMultiply
vX = _mm_add_ps(vX,vY);
mResult.r[1] = vX;
vW = M1.r[2];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -361,10 +361,10 @@ inline XMMATRIX XMMatrixMultiply
vX = _mm_add_ps(vX,vY);
mResult.r[2] = vX;
vW = M1.r[3];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
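Each row of XMMatrixMultiply follows the same splat-and-accumulate pattern; in scalar terms (shown for reference, not part of the commit):

// result.r[i] = M1[i].x*M2.r[0] + M1[i].y*M2.r[1] + M1[i].z*M2.r[2] + M1[i].w*M2.r[3]
// XM_PERMUTE_PS(vW, _MM_SHUFFLE(k,k,k,k)) broadcasts component k of M1's row so the
// four multiply/add steps operate on whole 4-wide registers instead of scalars.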
@ -490,10 +490,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
// Use vW to hold the original row
XMVECTOR vW = M1.r[0];
// Splat the component X,Y,Z then W
XMVECTOR vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
// Perform the operation on the first row
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
@ -506,10 +506,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
__m128 r0 = vX;
// Repeat for the other 3 rows
vW = M1.r[1];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -519,10 +519,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
vX = _mm_add_ps(vX,vY);
__m128 r1 = vX;
vW = M1.r[2];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -532,10 +532,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
vX = _mm_add_ps(vX,vY);
__m128 r2 = vX;
vW = M1.r[3];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -738,10 +738,10 @@ inline XMMATRIX XMMatrixInverse
#elif defined(_XM_SSE_INTRINSICS_)
XMMATRIX MT = XMMatrixTranspose(M);
XMVECTOR V00 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V10 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V11 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0));
XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1));
@ -749,10 +749,10 @@ inline XMMATRIX XMMatrixInverse
XMVECTOR D1 = _mm_mul_ps(V01,V11);
XMVECTOR D2 = _mm_mul_ps(V02,V12);
V00 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(3,2,3,2));
V10 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(1,1,0,0));
V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(3,2,3,2));
V11 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(1,1,0,0));
V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2));
V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0));
V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2));
V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0));
V02 = _mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1));
V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0));
@ -764,15 +764,15 @@ inline XMMATRIX XMMatrixInverse
D2 = _mm_sub_ps(D2,V02);
// V11 = D0Y,D0W,D2Y,D2Y
V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1));
V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(1,0,2,1));
V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1));
V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2));
V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(0,1,0,2));
V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2));
V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1));
// V13 = D1Y,D1W,D2W,D2W
XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1));
V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(1,0,2,1));
V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1));
V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2));
XMVECTOR V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(0,1,0,2));
XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2));
V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1));
XMVECTOR C0 = _mm_mul_ps(V00,V10);
@ -782,15 +782,15 @@ inline XMMATRIX XMMatrixInverse
// V11 = D0X,D0Y,D2X,D2X
V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0));
V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(2,1,3,2));
V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2));
V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3));
V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,3,2,3));
V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3));
V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2));
// V13 = D1X,D1Y,D2Z,D2Z
V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0));
V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(2,1,3,2));
V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2));
V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3));
V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,3,2,3));
V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3));
V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2));
V00 = _mm_mul_ps(V00,V10);
@ -802,22 +802,22 @@ inline XMMATRIX XMMatrixInverse
C4 = _mm_sub_ps(C4,V02);
C6 = _mm_sub_ps(C6,V03);
V00 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(0,3,0,3));
V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3));
// V10 = D0Z,D0Z,D2X,D2Y
V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2));
V10 = _mm_shuffle_ps(V10,V10,_MM_SHUFFLE(0,2,3,0));
V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(2,0,3,1));
V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0));
V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1));
// V11 = D0X,D0W,D2X,D2Y
V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0));
V11 = _mm_shuffle_ps(V11,V11,_MM_SHUFFLE(2,1,0,3));
V02 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(0,3,0,3));
V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3));
V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3));
// V12 = D1Z,D1Z,D2Z,D2W
V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2));
V12 = _mm_shuffle_ps(V12,V12,_MM_SHUFFLE(0,2,3,0));
V03 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(2,0,3,1));
V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0));
V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1));
// V13 = D1X,D1W,D2Z,D2W
V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0));
V13 = _mm_shuffle_ps(V13,V13,_MM_SHUFFLE(2,1,0,3));
V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3));
V00 = _mm_mul_ps(V00,V10);
V01 = _mm_mul_ps(V01,V11);
@ -836,10 +836,10 @@ inline XMMATRIX XMMatrixInverse
C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0));
C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0));
C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0));
C0 = _mm_shuffle_ps(C0,C0,_MM_SHUFFLE(3,1,2,0));
C2 = _mm_shuffle_ps(C2,C2,_MM_SHUFFLE(3,1,2,0));
C4 = _mm_shuffle_ps(C4,C4,_MM_SHUFFLE(3,1,2,0));
C6 = _mm_shuffle_ps(C6,C6,_MM_SHUFFLE(3,1,2,0));
C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0));
C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0));
C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0));
C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0));
// Get the determinant
XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]);
if (pDeterminant != nullptr)
@ -1357,7 +1357,7 @@ inline XMMATRIX XMMatrixRotationX
M.r[0] = g_XMIdentityR0;
M.r[1] = vCos;
// x = 0,y = sin,z = cos, w = 0
vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,1,2,0));
vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0));
// x = 0,y = -sin,z = cos, w = 0
vCos = _mm_mul_ps(vCos,g_XMNegateY);
M.r[2] = vCos;
@ -1434,7 +1434,7 @@ inline XMMATRIX XMMatrixRotationY
M.r[2] = vSin;
M.r[1] = g_XMIdentityR1;
// x = cos,y = 0,z = sin, w = 0
vSin = _mm_shuffle_ps(vSin,vSin,_MM_SHUFFLE(3,0,1,2));
vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2));
// x = cos,y = 0,z = -sin, w = 0
vSin = _mm_mul_ps(vSin,g_XMNegateZ);
M.r[0] = vSin;
@ -1510,7 +1510,7 @@ inline XMMATRIX XMMatrixRotationZ
XMMATRIX M;
M.r[0] = vCos;
// x = sin,y = cos,z = 0, w = 0
vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,2,0,1));
vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1));
// x = cos,y = -sin,z = 0, w = 0
vCos = _mm_mul_ps(vCos,g_XMNegateX);
M.r[1] = vCos;
@ -1597,8 +1597,8 @@ inline XMMATRIX XMMatrixRotationNormal
XMVECTOR C1 = _mm_set_ps1(fCosAngle);
XMVECTOR C0 = _mm_set_ps1(fSinAngle);
XMVECTOR N0 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,0,2,1));
XMVECTOR N1 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,1,0,2));
XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1));
XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2));
XMVECTOR V0 = _mm_mul_ps(C2, N0);
V0 = _mm_mul_ps(V0, N1);
@ -1614,18 +1614,18 @@ inline XMMATRIX XMMatrixRotationNormal
V0 = _mm_and_ps(R0,g_XMMask3);
XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0));
V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(0,3,2,1));
V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1));
XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1));
V2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,0,2,0));
V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0));
R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0));
R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,2,0));
R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0));
XMMATRIX M;
M.r[0] = R2;
R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1));
R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,0,2));
R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2));
M.r[1] = R2;
V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0));
@ -1702,37 +1702,37 @@ inline XMMATRIX XMMatrixRotationQuaternion
XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion);
XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0);
XMVECTOR V0 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,0,0,1));
XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1));
V0 = _mm_and_ps(V0,g_XMMask3);
XMVECTOR V1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,1,2,2));
XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2));
V1 = _mm_and_ps(V1,g_XMMask3);
XMVECTOR R0 = _mm_sub_ps(Constant1110,V0);
R0 = _mm_sub_ps(R0, V1);
V0 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,1,0,0));
V1 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,2,1,2));
V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0));
V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2));
V0 = _mm_mul_ps(V0, V1);
V1 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,3,3,3));
XMVECTOR V2 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,0,2,1));
V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3));
XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1));
V1 = _mm_mul_ps(V1, V2);
XMVECTOR R1 = _mm_add_ps(V0, V1);
XMVECTOR R2 = _mm_sub_ps(V0, V1);
V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1));
V0 = _mm_shuffle_ps(V0,V0,_MM_SHUFFLE(1,3,2,0));
V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0));
V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0));
V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(2,0,2,0));
V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0));
Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0));
Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,2,0));
Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0));
XMMATRIX M;
M.r[0] = Q1;
Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1));
Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,0,2));
Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2));
M.r[1] = Q1;
Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0));
@ -2487,10 +2487,10 @@ inline XMMATRIX XMMatrixPerspectiveOffCenterLH
vTemp = _mm_and_ps(vTemp,g_XMMaskY);
M.r[1] = vTemp;
// 0,0,fRange,1.0f
M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
M.m[2][2] = fRange;
M.m[2][3] = 1.0f;
M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth,
-(ViewTop + ViewBottom) * ReciprocalHeight,
fRange,
1.0f );
// 0,0,-fRange * NearZ,0.0f
vValues = _mm_and_ps(vValues,g_XMMaskZ);
M.r[3] = vValues;
@ -2585,10 +2585,10 @@ inline XMMATRIX XMMatrixPerspectiveOffCenterRH
vTemp = _mm_and_ps(vTemp,g_XMMaskY);
M.r[1] = vTemp;
// 0,0,fRange,1.0f
M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth;
M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight;
M.m[2][2] = fRange;
M.m[2][3] = -1.0f;
M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth,
(ViewTop + ViewBottom) * ReciprocalHeight,
fRange,
-1.0f );
// 0,0,-fRange * NearZ,0.0f
vValues = _mm_and_ps(vValues,g_XMMaskZ);
M.r[3] = vValues;

View File

@ -156,23 +156,23 @@ inline XMVECTOR XMQuaternionMultiply
XMVECTOR Q2Z = Q2;
XMVECTOR vResult = Q2;
// Splat with one instruction
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
Q2X = _mm_shuffle_ps(Q2X,Q2X,_MM_SHUFFLE(0,0,0,0));
Q2Y = _mm_shuffle_ps(Q2Y,Q2Y,_MM_SHUFFLE(1,1,1,1));
Q2Z = _mm_shuffle_ps(Q2Z,Q2Z,_MM_SHUFFLE(2,2,2,2));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3));
Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0));
Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1));
Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2));
// Retire Q1 and perform Q1*Q2W
vResult = _mm_mul_ps(vResult,Q1);
XMVECTOR Q1Shuffle = Q1;
// Shuffle the copies of Q1
Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
// Mul by Q1WZYX
Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
// Flip the signs on y and z
Q2X = _mm_mul_ps(Q2X,ControlWZYX);
// Mul by Q1ZWXY
Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
// Flip the signs on z and w
Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
// Mul by Q1YXWZ
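For reference, the product being assembled is the standard Hamilton quaternion product (components in x,y,z,w order; which argument maps to a and which to b follows the library's own concatenation convention):

(a*b).x = aw*bx + ax*bw + ay*bz - az*by
(a*b).y = aw*by - ax*bz + ay*bw + az*bx
(a*b).z = aw*bz + ax*by - ay*bx + az*bw
(a*b).w = aw*bw - ax*bx - ay*by - az*bz
// ControlWZYX/ControlZWXY/ControlYXWZ are the {+1,-1} sign masks that give each
// shuffled multiply the signs this expansion requires.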
@ -438,7 +438,7 @@ inline XMVECTOR XMQuaternionSlerpV
XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
XMVECTOR V01 = _mm_shuffle_ps(T,T,_MM_SHUFFLE(2,3,0,1));
XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1));
V01 = _mm_and_ps(V01,MaskXY);
V01 = _mm_xor_ps(V01,SignMask2);
V01 = _mm_add_ps(g_XMIdentityR0, V01);
@ -897,11 +897,11 @@ inline XMVECTOR XMQuaternionRotationMatrix
XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0)
// (r00, r00, r00, r00)
XMVECTOR r00 = _mm_shuffle_ps(r0, r0, _MM_SHUFFLE(0,0,0,0));
XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0));
// (r11, r11, r11, r11)
XMVECTOR r11 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1,1,1,1));
XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1));
// (r22, r22, r22, r22)
XMVECTOR r22 = _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(2,2,2,2));
XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2));
// x^2 >= y^2 equivalent to r11 - r00 <= 0
// (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
@ -935,7 +935,7 @@ inline XMVECTOR XMQuaternionRotationMatrix
// (r10, r10, r20, r21)
t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0));
// (r10, r20, r21, r10)
t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1,3,2,0));
t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
// (4*x*y, 4*x*z, 4*y*z, unused)
XMVECTOR xyxzyz = _mm_add_ps(t0, t1);
@ -944,7 +944,7 @@ inline XMVECTOR XMQuaternionRotationMatrix
// (r12, r12, r02, r01)
t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2));
// (r12, r02, r01, r12)
t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1,3,2,0));
t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
// (4*x*w, 4*y*w, 4*z*w, unused)
XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
xwywzw = _mm_mul_ps(XMMPMP, xwywzw);
@ -1137,15 +1137,15 @@ inline XMVECTOR XMPlaneNormalizeEst
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(P,P);
// x=Dot.y, y=Dot.z
XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
// Result.x = x+y
vDot = _mm_add_ss(vDot,vTemp);
// x=Dot.z
vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
// Result.x = (x+y)+z
vDot = _mm_add_ss(vDot,vTemp);
// Splat x
vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
// Get the reciprocal
vDot = _mm_rsqrt_ps(vDot);
// Get the reciprocal
@ -1183,11 +1183,11 @@ inline XMVECTOR XMPlaneNormalize
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z only
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1));
XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
vLengthSq = _mm_add_ss(vLengthSq,vTemp);
vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
vLengthSq = _mm_add_ss(vLengthSq,vTemp);
vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Failsafe on zero (Or epsilon) length planes
@ -1531,24 +1531,11 @@ inline XMVECTOR XMColorAdjustSaturation
return vbslq_f32( g_XMSelect1110, vResult, vColor );
#elif defined(_XM_SSE_INTRINSICS_)
static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
// Mul RGB by intensity constants
XMVECTOR vLuminance = _mm_mul_ps(vColor,gvLuminance);
// vResult.x = vLuminance.y, vResult.y = vLuminance.y,
// vResult.z = vLuminance.z, vResult.w = vLuminance.z
XMVECTOR vResult = vLuminance;
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,1,1));
// vLuminance.x += vLuminance.y
vLuminance = _mm_add_ss(vLuminance,vResult);
// Splat vLuminance.z
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
// vLuminance.x += vLuminance.z (Dot product)
vLuminance = _mm_add_ss(vLuminance,vResult);
// Splat vLuminance
vLuminance = _mm_shuffle_ps(vLuminance,vLuminance,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance );
// Splat fSaturation
XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
vResult = _mm_sub_ps(vColor,vLuminance);
XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance);
vResult = _mm_mul_ps(vResult,vSaturation);
vResult = _mm_add_ps(vResult,vLuminance);
// Retain w from the source color
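The computation itself is unchanged; the luminance dot product now goes through XMVector3Dot, which already returns the dot product replicated into every lane (exactly what the removed shuffle/add sequence built by hand). For reference:

luminance  = 0.2125*R + 0.7154*G + 0.0721*B      // splatted across all four lanes
result.rgb = (color.rgb - luminance) * fSaturation + luminance
result.w   = color.w                             // restored by the select that follows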

File diff suppressed because it is too large

View File

@ -250,7 +250,7 @@ inline XMVECTOR PackedVector::XMLoadShortN2
// x needs to be sign extended
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x - 0x8000 to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
// Convert -1.0f - 1.0f
@ -286,7 +286,7 @@ inline XMVECTOR PackedVector::XMLoadShort2
// x needs to be sign extended
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x - 0x8000 to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
// Y is 65536 too large
@ -322,7 +322,7 @@ inline XMVECTOR PackedVector::XMLoadUShortN2
// y needs to be sign flipped
vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// y + 0x8000 to undo the signed order.
vTemp = _mm_add_ps(vTemp,FixaddY16);
// Y is 65536 times too large
@ -358,7 +358,7 @@ inline XMVECTOR PackedVector::XMLoadUShort2
// y needs to be sign flipped
vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// Y is 65536 times too large
vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
// y + 0x8000 to undo the signed order.
@ -452,7 +452,7 @@ inline XMVECTOR PackedVector::XMLoadU565
// Mask off x, y and z
vResult = _mm_and_ps(vResult,U565And);
// Convert to float
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Normalize x, y, and z
vResult = _mm_mul_ps(vResult,U565Mul);
return vResult;
@ -741,17 +741,17 @@ inline XMVECTOR PackedVector::XMLoadShortN4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// x and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x and z - 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
// Convert to -1.0f - 1.0f
vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
// Clamp result (for case of -32768)
return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
@ -782,17 +782,17 @@ inline XMVECTOR PackedVector::XMLoadShort4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// x and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x and z - 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
// Fix y and w because they are 65536 too large
vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -825,17 +825,17 @@ inline XMVECTOR PackedVector::XMLoadUShortN4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// y and w are signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// y and w + 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,FixaddY16W16);
// Fix y and w because they are 65536 too large
vTemp = _mm_mul_ps(vTemp,FixupY16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -865,17 +865,17 @@ inline XMVECTOR PackedVector::XMLoadUShort4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// y and w are signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// Fix y and w because they are 65536 too large
vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
// y and w + 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,FixaddY16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -910,7 +910,7 @@ inline XMVECTOR PackedVector::XMLoadXDecN4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
// Convert 0-255 to 0.0f-1.0f
@ -953,7 +953,7 @@ inline XMVECTOR PackedVector::XMLoadXDec4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,XDec4Xor);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,XDec4Add);
// Convert 0-255 to 0.0f-1.0f
@ -993,7 +993,7 @@ inline XMVECTOR PackedVector::XMLoadUDecN4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Convert 0-255 to 0.0f-1.0f
@ -1031,7 +1031,7 @@ inline XMVECTOR PackedVector::XMLoadUDec4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Convert 0-255 to 0.0f-1.0f
@ -1074,7 +1074,7 @@ inline XMVECTOR PackedVector::XMLoadDecN4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
// Convert 0-255 to 0.0f-1.0f
@ -1117,7 +1117,7 @@ inline XMVECTOR PackedVector::XMLoadDec4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
// Convert 0-255 to 0.0f-1.0f
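
For the Dec/XDec family the SSE path does the sign handling with XOR and add constants; a scalar equivalent that sign-extends one 10-bit field from a packed 10:10:10:2 value may make the intent clearer. This is illustrative only: the field order (x in the low bits) and an arithmetic right shift of signed values are assumptions, though both hold for MSVC, Clang, and GCC.

#include <cstdint>

// Extract a signed 10-bit field from a 10:10:10:2 packed value.
inline int32_t ExtractDec10(uint32_t packed, unsigned shift /* 0, 10 or 20 */)
{
    uint32_t field = (packed >> shift) & 0x3FFu;      // 10 raw bits
    // Shift the field up to bit 31, then arithmetic-shift back down so the
    // sign bit (bit 9) is replicated across the upper bits.
    return static_cast<int32_t>(field << 22) >> 22;
}
// ExtractDec10(0x3FF, 0) == -1, ExtractDec10(0x1FF, 0) == 511
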
@ -1152,7 +1152,7 @@ inline XMVECTOR PackedVector::XMLoadUByteN4
// w is signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// w + 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Fix y, z and w because they are too large
@ -1187,7 +1187,7 @@ inline XMVECTOR PackedVector::XMLoadUByte4
// w is signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// w + 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Fix y, z and w because they are too large
@ -1222,7 +1222,7 @@ inline XMVECTOR PackedVector::XMLoadByteN4
// x,y and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x, y and z - 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
// Fix y, z and w because they are too large
@ -1258,7 +1258,7 @@ inline XMVECTOR PackedVector::XMLoadByte4
// x,y and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x, y and z - 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
// Fix y, z and w because they are too large
@ -1284,7 +1284,7 @@ inline XMVECTOR PackedVector::XMLoadUNibble4
// Mask off x, y, z and w
vResult = _mm_and_ps(vResult,UNibble4And);
// Convert to float
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Normalize x, y, and z
vResult = _mm_mul_ps(vResult,UNibble4Mul);
return vResult;
@ -1315,7 +1315,7 @@ inline XMVECTOR PackedVector::XMLoadU555
// Mask off x, y, z and w
vResult = _mm_and_ps(vResult,U555And);
// Convert to float
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Normalize x, y, and z
vResult = _mm_mul_ps(vResult,U555Mul);
return vResult;
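
XMLoadU555 masks each field in place and then scales by a reciprocal power of two instead of shifting; a scalar sketch of the same unpack, assuming the usual bit-field layout (x in bits 0-4, w in bit 15):

#include <cstdint>

struct Unpacked555 { float x, y, z, w; };

// Unpack a 16-bit 5:5:5:1 value into integer-valued floats (0..31, w is 0 or 1).
inline Unpacked555 UnpackU555(uint16_t v)
{
    return {
        static_cast<float>( v        & 0x1F),   // x: bits 0-4
        static_cast<float>((v >> 5)  & 0x1F),   // y: bits 5-9
        static_cast<float>((v >> 10) & 0x1F),   // z: bits 10-14
        static_cast<float>((v >> 15) & 0x01)    // w: bit 15
    };
}
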
@ -1369,7 +1369,7 @@ inline void PackedVector::XMStoreColor
// Convert to 0-255
vResult = _mm_mul_ps(vResult,Scale);
// Shuffle RGBA to ARGB
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
// Convert to int
__m128i vInt = _mm_cvtps_epi32(vResult);
// Mash to shorts
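
XMStoreColor scales the 0.0-1.0 inputs to 0-255, reorders RGBA to ARGB, and narrows with saturating packs. A scalar sketch of the same packing (PackColorARGB is illustrative; rounding is simplified to add-0.5-and-truncate):

#include <cstdint>

// Pack four 0.0-1.0 floats into a D3DCOLOR-style 0xAARRGGBB dword.
inline uint32_t PackColorARGB(float r, float g, float b, float a)
{
    auto to8 = [](float f) -> uint32_t {
        float scaled = f * 255.0f + 0.5f;            // scale and round
        if (scaled < 0.0f)   scaled = 0.0f;          // saturate, like the packs step
        if (scaled > 255.0f) scaled = 255.0f;
        return static_cast<uint32_t>(scaled);
    };
    return (to8(a) << 24) | (to8(r) << 16) | (to8(g) << 8) | to8(b);
}
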
@ -1431,7 +1431,7 @@ inline void PackedVector::XMStoreShortN2
vResult = _mm_mul_ps(vResult,Scale);
__m128i vResulti = _mm_cvtps_epi32(vResult);
vResulti = _mm_packs_epi32(vResulti,vResulti);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1469,7 +1469,7 @@ inline void PackedVector::XMStoreShort2
__m128i vInt = _mm_cvtps_epi32(vResult);
// Pack the ints into shorts
vInt = _mm_packs_epi32(vInt,vInt);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vInt));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
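
The 16-bit stores share one tail-end pattern: convert to 32-bit ints, saturate-pack to shorts with _mm_packs_epi32, then write only the low 32 (or 64) bits through a bit cast and _mm_store_ss (or _mm_store_sd). A standalone sketch, with Short2 as a hypothetical stand-in for the packed destination type:

#include <emmintrin.h>
#include <cstdint>

struct Short2 { int16_t x, y; };   // stand-in for the packed destination

// Store the two low lanes of an already-rounded integer vector as shorts.
inline void StoreShort2(Short2* dst, __m128i rounded)
{
    __m128i packed = _mm_packs_epi32(rounded, rounded);   // saturate to int16
    _mm_store_ss(reinterpret_cast<float*>(&dst->x),       // write the low 32 bits
                 _mm_castsi128_ps(packed));
}
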
@ -1942,7 +1942,7 @@ inline void PackedVector::XMStoreShortN4
vResult = _mm_mul_ps(vResult,Scale);
__m128i vResulti = _mm_cvtps_epi32(vResult);
vResulti = _mm_packs_epi32(vResulti,vResulti);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1991,7 +1991,7 @@ inline void PackedVector::XMStoreShort4
__m128i vInt = _mm_cvtps_epi32(vResult);
// Pack the ints into shorts
vInt = _mm_packs_epi32(vInt,vInt);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vInt));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2138,13 +2138,13 @@ inline void PackedVector::XMStoreXDecN4
__m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
vResulti = _mm_add_epi32(vResulti,vResultw);
// Do a horizontal or of all 4 entries
vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
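
The rewritten XMStoreXDecN4 tail is a horizontal OR: each lane already holds its field shifted into position, and three rotate-and-OR steps fold all four lanes into lane 0. As a standalone sketch (HorizontalOr is illustrative; it returns the dword instead of storing it):

#include <emmintrin.h>
#include <cstdint>

// OR all four 32-bit lanes of v together and return the result from lane 0.
inline uint32_t HorizontalOr(__m128i v)
{
    __m128 r = _mm_castsi128_ps(v);
    // Rotate the lanes by one and OR, three times: lane 0 accumulates x|y|z|w.
    r = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 3, 2, 1));
    v = _mm_or_si128(v, _mm_castps_si128(r));
    r = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 3, 2, 1));
    v = _mm_or_si128(v, _mm_castps_si128(r));
    r = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 3, 2, 1));
    v = _mm_or_si128(v, _mm_castps_si128(r));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(v));
}
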
@ -2197,7 +2197,7 @@ inline void PackedVector::XMStoreXDec4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2248,7 +2248,7 @@ inline void PackedVector::XMStoreUDecN4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2299,7 +2299,7 @@ inline void PackedVector::XMStoreUDec4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2348,7 +2348,7 @@ inline void PackedVector::XMStoreDecN4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2399,7 +2399,7 @@ inline void PackedVector::XMStoreDec4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2451,7 +2451,7 @@ inline void PackedVector::XMStoreUByteN4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2503,7 +2503,7 @@ inline void PackedVector::XMStoreUByte4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2553,7 +2553,7 @@ inline void PackedVector::XMStoreByteN4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2605,7 +2605,7 @@ inline void PackedVector::XMStoreByte4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
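
Typical use of these load/store pairs is a round trip through the packed type; a small usage sketch (XMUBYTEN4 chosen arbitrarily, and the restored values are only approximate because of the 8-bit quantization):

#include <DirectXMath.h>
#include <DirectXPackedVector.h>

using namespace DirectX;
using namespace DirectX::PackedVector;

void RoundTripExample()
{
    XMUBYTEN4 packed;
    XMVECTOR color = XMVectorSet(0.25f, 0.5f, 0.75f, 1.0f);
    XMStoreUByteN4(&packed, color);             // quantize to 4 x 8-bit, 0-255
    XMVECTOR restored = XMLoadUByteN4(&packed); // back to floats in [0,1]
    (void)restored;
}
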