mirror of https://github.com/microsoft/DirectXMath
synced 2024-11-09 22:20:08 +00:00

Optimize SSE use of mm_load/store_ss pairs by using mm_load/store_pd (#94)

parent da730052e5
commit b0dd6fc1a5
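The pattern applied throughout this commit: where the SSE paths previously moved two adjacent 32-bit lanes with a pair of _mm_load_ss/_mm_store_ss calls plus a shuffle or unpack, they now move both lanes at once through a single 64-bit _mm_load_sd/_mm_store_sd, with _mm_castpd_ps/_mm_castps_pd as cost-free bitwise casts. A minimal standalone sketch of the before/after (function names are illustrative, not part of the library):

#include <cstdio>
#include <emmintrin.h> // SSE2

// Before: two scalar loads plus an unpack (three instructions).
__m128 LoadTwoFloats_Before(const float* p)
{
    __m128 x = _mm_load_ss(p);
    __m128 y = _mm_load_ss(p + 1);
    return _mm_unpacklo_ps(x, y);
}

// After: one 64-bit load; the cast only reinterprets bits. The double value
// itself is never operated on, so no floating-point semantics apply to it.
__m128 LoadTwoFloats_After(const float* p)
{
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
}

int main()
{
    float in[2] = { 1.0f, 2.0f };
    float a[4], b[4];
    _mm_storeu_ps(a, LoadTwoFloats_Before(in));
    _mm_storeu_ps(b, LoadTwoFloats_After(in));
    printf("before: %g %g  after: %g %g\n", a[0], a[1], b[0], b[1]);
    return 0;
}

Both variants zero the upper two lanes, so the results are bit-identical.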
@@ -2068,6 +2068,11 @@ XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f,
 #pragma prefast(disable : 26495, "Union initialization confuses /analyze")
 #endif
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#endif
+
 //------------------------------------------------------------------------------
 
 inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3)
@@ -2164,6 +2169,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant)
 #include "DirectXMathMatrix.inl"
 #include "DirectXMathMisc.inl"
 
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
 #ifdef _PREFAST_
 #pragma prefast(pop)
 #endif
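The two hunks above bracket the implementation headers with a Clang-only guard so that the reinterpret_cast-based type punning inside them compiles cleanly under -Wundefined-reinterpret-cast. A sketch of the same push/ignore/pop pattern around a hypothetical punning helper:

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif

// Hypothetical helper of the kind this warning fires on.
inline float BitsAsFloat(const unsigned int* p)
{
    return *reinterpret_cast<const float*>(p);
}

#ifdef __clang__
#pragma clang diagnostic pop
#endif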
@@ -278,9 +278,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2
     uint32x2_t zero = vdup_n_u32(0);
     return vcombine_u32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
-    return _mm_unpacklo_ps( x, y );
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -305,8 +303,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2A
     uint32x2_t zero = vdup_n_u32(0);
     return vcombine_u32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
-    return _mm_castsi128_ps(V);
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -330,9 +327,7 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat2
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( &pSource->x );
-    __m128 y = _mm_load_ss( &pSource->y );
-    return _mm_unpacklo_ps( x, y );
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -357,8 +352,7 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat2A
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
-    return _mm_castsi128_ps(V);
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -383,9 +377,7 @@ inline XMVECTOR XM_CALLCONV XMLoadSInt2
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( v, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
-    __m128 V = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     return _mm_cvtepi32_ps(_mm_castps_si128(V));
 #endif
 }
@@ -411,9 +403,7 @@ inline XMVECTOR XM_CALLCONV XMLoadUInt2
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( v, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
-    __m128 V = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     // For the values that are higher than 0x7FFFFFFF, a fixup is needed
     // Determine which ones need the fix.
     XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
@@ -450,11 +440,13 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3
     uint32x2_t zero = vdup_n_u32(0);
     uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
     return vcombine_u32( x, y );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    return _mm_insert_ps( xy, z, 0x20 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     return _mm_movelh_ps( xy, z );
 #endif
 }
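In the three-element loads, the new _XM_SSE4_INTRINSICS_ branch combines the 64-bit xy load with _mm_insert_ps. The immediate 0x20 selects source lane 0 (bits 7:6), destination lane 2 (bits 5:4), and zeroes nothing (bits 3:0), so z lands in lane 2 while lane 3 keeps the zero produced by _mm_load_sd. A self-contained sketch of both branches (names are illustrative):

#include <smmintrin.h> // SSE4.1 (also pulls in the SSE2 intrinsics used below)

// SSE4.1 path: insertps places z's lane 0 into lane 2 of xy (imm8 = 0x20).
__m128 LoadThreeFloats_Sse4(const float* p)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    __m128 z  = _mm_load_ss(p + 2);
    return _mm_insert_ps(xy, z, 0x20);
}

// SSE2 fallback: movelh builds { xy[0], xy[1], z[0], z[1] } = { x, y, z, 0 }.
__m128 LoadThreeFloats_Sse2(const float* p)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    __m128 z  = _mm_load_ss(p + 2);
    return _mm_movelh_ps(xy, z);
}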
@@ -479,11 +471,14 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3A
     // Reads an extra integer which is zero'd
     uint32x4_t V = vld1q_u32_ex( pSource, 128 );
     return vsetq_lane_u32( 0, V, 3 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    return _mm_insert_ps( xy, z, 0x20 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    // Reads an extra integer which is zero'd
-    __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
-    V = _mm_and_si128( V, g_XMMask3 );
-    return _mm_castsi128_ps(V);
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    return _mm_movelh_ps( xy, z );
 #endif
 }
 
@@ -507,11 +502,13 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3
     float32x2_t zero = vdup_n_f32(0);
     float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
     return vcombine_f32( x, y );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( &pSource->x );
-    __m128 y = _mm_load_ss( &pSource->y );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( &pSource->z );
+    return _mm_insert_ps( xy, z, 0x20 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( &pSource->z );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     return _mm_movelh_ps( xy, z );
 #endif
 }
@@ -567,10 +564,8 @@ inline XMVECTOR XM_CALLCONV XMLoadSInt3
     int32x4_t v = vcombine_s32( x, y );
     return vcvtq_f32_s32( v );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     __m128 V = _mm_movelh_ps( xy, z );
     return _mm_cvtepi32_ps(_mm_castps_si128(V));
 #endif
@@ -598,10 +593,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUInt3
     uint32x4_t v = vcombine_u32( x, y );
     return vcvtq_f32_u32( v );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     __m128 V = _mm_movelh_ps( xy, z );
     // For the values that are higher than 0x7FFFFFFF, a fixup is needed
     // Determine which ones need the fix.
@@ -1318,9 +1311,7 @@ inline void XM_CALLCONV XMStoreInt2
     uint32x2_t VL = vget_low_u32(V);
     vst1_u32( pDestination, VL );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
 #endif
 }
 
@@ -1341,7 +1332,7 @@ inline void XM_CALLCONV XMStoreInt2A
     uint32x2_t VL = vget_low_u32(V);
     vst1_u32_ex( pDestination, VL, 64 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
 #endif
 }
 
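The store side mirrors the load side: one _mm_store_sd writes lanes 0 and 1 in a single instruction, replacing either a shuffle plus two _mm_store_ss calls (unaligned case) or an _mm_storel_epi64 (aligned case, which needlessly routed float data through the integer domain). A sketch, with _mm_shuffle_ps standing in for the non-AVX expansion of XM_PERMUTE_PS:

#include <emmintrin.h> // SSE2

// Before: splat lane 1, then two scalar stores.
void StoreTwoFloats_Before(float* p, __m128 v)
{
    __m128 t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(p, v);
    _mm_store_ss(p + 1, t);
}

// After: one 64-bit store of the low two lanes; the cast is bitwise only.
void StoreTwoFloats_After(float* p, __m128 v)
{
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));
}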
@@ -1361,9 +1352,7 @@ inline void XM_CALLCONV XMStoreFloat2
     float32x2_t VL = vget_low_f32(V);
     vst1_f32( reinterpret_cast<float*>(pDestination), VL );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
 #endif
 }
 
@@ -1384,7 +1373,7 @@ inline void XM_CALLCONV XMStoreFloat2A
     float32x2_t VL = vget_low_f32(V);
     vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
 #endif
 }
 
@@ -1414,9 +1403,7 @@ inline void XM_CALLCONV XMStoreSInt2
     vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
     vOverflow = _mm_or_ps(vOverflow,vResult);
     // Write two ints
-    XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
 #endif
 }
 
@@ -1455,9 +1442,7 @@ inline void XM_CALLCONV XMStoreUInt2
     // On those that are too large, set to 0xFFFFFFFF
     vResult = _mm_or_ps(vResult,vOverflow);
     // Write two uints
-    XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
 #endif
 }
 
@@ -1479,11 +1464,9 @@ inline void XM_CALLCONV XMStoreInt3
     vst1_u32( pDestination, VL );
     vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), z );
 #endif
 }
 
@@ -1506,9 +1489,9 @@ inline void XM_CALLCONV XMStoreInt3A
     vst1_u32_ex( pDestination, VL, 64 );
     vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
+    __m128 z = _mm_movehl_ps( V, V );
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), z );
 #endif
 }
 
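For three lanes, the rewrite pairs the 64-bit store with a single extraction of lane 2. The aligned variants use _mm_movehl_ps(V, V), which copies the high pair into the low pair without a shuffle immediate; the unaligned variants keep XM_PERMUTE_PS. A sketch of the aligned shape (name illustrative):

#include <emmintrin.h> // SSE2

// Lanes 0-1 go out in one 64-bit store; movehl brings lane 2 down to lane 0
// so one scalar store finishes the job (two stores total, down from three).
void StoreThreeFloats(float* p, __m128 v)
{
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));
    __m128 z = _mm_movehl_ps(v, v);
    _mm_store_ss(p + 2, z);
}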
@@ -1529,12 +1512,14 @@ inline void XM_CALLCONV XMStoreFloat3
     float32x2_t VL = vget_low_f32(V);
     vst1_f32( reinterpret_cast<float*>(pDestination), VL );
     vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *reinterpret_cast<int*>( &pDestination->x ) = _mm_extract_ps( V, 0 );
+    *reinterpret_cast<int*>( &pDestination->y ) = _mm_extract_ps( V, 1 );
+    *reinterpret_cast<int*>( &pDestination->z ) = _mm_extract_ps( V, 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T1 );
-    _mm_store_ss( &pDestination->z, T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(&pDestination->z, z);
 #endif
 }
 
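The new _XM_SSE4_INTRINSICS_ branch in XMStoreFloat3 leans on _mm_extract_ps, which returns the raw 32-bit pattern of a lane as an int; storing that result through an int* lets the compiler emit extractps straight to memory. A sketch (function name illustrative):

#include <smmintrin.h> // SSE4.1

// Each extractps moves one lane's bit pattern directly to memory;
// no float interpretation happens on the scalar side.
void StoreThreeFloats_Sse4(float* p, __m128 v)
{
    *reinterpret_cast<int*>(p)     = _mm_extract_ps(v, 0);
    *reinterpret_cast<int*>(p + 1) = _mm_extract_ps(v, 1);
    *reinterpret_cast<int*>(p + 2) = _mm_extract_ps(v, 2);
}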
@@ -1556,10 +1541,13 @@ inline void XM_CALLCONV XMStoreFloat3A
     float32x2_t VL = vget_low_f32(V);
     vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
     vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
+    *reinterpret_cast<int*>( &pDestination->z ) = _mm_extract_ps( V, 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( &pDestination->z, T );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
+    __m128 z = _mm_movehl_ps( V, V );
+    _mm_store_ss( &pDestination->z, z );
 #endif
 }
 
@@ -1591,11 +1579,9 @@ inline void XM_CALLCONV XMStoreSInt3
     vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
     vOverflow = _mm_or_ps(vOverflow,vResult);
     // Write 3 uints
-    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
+    __m128 z = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), z );
 #endif
 }
 
@@ -1636,11 +1622,9 @@ inline void XM_CALLCONV XMStoreUInt3
     // On those that are too large, set to 0xFFFFFFFF
     vResult = _mm_or_ps(vResult,vOverflow);
     // Write 3 uints
-    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
+    __m128 z = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), z );
 #endif
 }
 
@@ -7934,12 +7934,11 @@ inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
         // Unaligned input
         for (; i < VectorCount; i++)
         {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
             pInputVector += InputStride;
 
-            XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0));
-            XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0));
+            XMVECTOR Y = XM_PERMUTE_PS(xy,_MM_SHUFFLE(1,1,1,1));
+            XMVECTOR X = XM_PERMUTE_PS(xy,_MM_SHUFFLE(0,0,0,0));
 
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
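In the stream transforms, the unaligned loop previously loaded x and y separately and splatted each from lane 0. With both packed into one register by the 64-bit load, Y simply splats from lane 1 instead, saving a load per iteration. A sketch of the new load-and-splat step (names illustrative; _mm_shuffle_ps stands in for the non-AVX XM_PERMUTE_PS):

#include <emmintrin.h> // SSE2

// One 64-bit load puts x in lane 0 and y in lane 1; each splat is then a
// single shuffle of the same register.
void LoadAndSplatXY(const float* p, __m128* X, __m128* Y)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    *X = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(0, 0, 0, 0)); // x in all four lanes
    *Y = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(1, 1, 1, 1)); // y in all four lanes
}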
@@ -8254,10 +8253,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             // Result 2
@@ -8272,10 +8269,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             i += 2;
@@ -8303,10 +8298,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
@@ -8315,12 +8308,11 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
         // Unaligned input
         for (; i < VectorCount; i++)
         {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
             pInputVector += InputStride;
 
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
 
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
@@ -8330,10 +8322,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
@@ -8584,10 +8574,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
             vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             // Result 2
@@ -8597,10 +8585,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             vTemp = _mm_mul_ps( Y, row1 );
             vTemp2 = _mm_mul_ps( X, row0 );
             vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             i += 2;
@@ -8623,10 +8609,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
            vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
@@ -8635,20 +8619,17 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
         // Unaligned input
         for (; i < VectorCount; i++)
        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
             pInputVector += InputStride;
 
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
 
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
             vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
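Taken together, the stream hunks shrink each unaligned 2D iteration to one 64-bit load, two shuffles, the arithmetic, and one 64-bit store. A distilled, self-contained version of the normal-transform loop, under the simplifying assumption of a fixed two-float stride (the real code honors arbitrary InputStride/OutputStride):

#include <cstddef>
#include <emmintrin.h> // SSE2

// Transforms 2D normals by a 2x2 matrix held as two row vectors
// (row0 = { m00, m01, *, * }, row1 = { m10, m11, *, * }).
void TransformNormal2D(float* out, const float* in, std::size_t count,
                       __m128 row0, __m128 row1)
{
    for (std::size_t i = 0; i < count; ++i)
    {
        __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(in)));
        __m128 X  = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(0, 0, 0, 0));
        __m128 Y  = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(1, 1, 1, 1));
        __m128 v  = _mm_add_ps(_mm_mul_ps(Y, row1), _mm_mul_ps(X, row0));
        // Lanes 0 and 1 hold the transformed x and y; write both at once.
        _mm_store_sd(reinterpret_cast<double*>(out), _mm_castps_pd(v));
        in  += 2;
        out += 2;
    }
}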