mirror of https://github.com/microsoft/DirectXMath synced 2024-09-19 14:49:54 +00:00

Optimize SSE use of mm_load/store_ss pairs by using mm_load/store_sd (#94)

Chuck Walbourn 2020-02-29 15:56:27 -08:00 committed by GitHub
parent da730052e5
commit b0dd6fc1a5
3 changed files with 84 additions and 110 deletions
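
For context, a minimal standalone sketch of the pattern this commit applies across the load/store paths (illustrative helper names, not the library's own functions): two scalar _mm_load_ss/_mm_store_ss operations on adjacent 32-bit values are replaced by a single 64-bit _mm_load_sd/_mm_store_sd, reinterpreted through _mm_castpd_ps/_mm_castps_pd.

#include <emmintrin.h>  // SSE2: _mm_load_sd, _mm_store_sd, _mm_castpd_ps, _mm_castps_pd

// Before: two scalar loads plus a shuffle to build {x, y, 0, 0}.
inline __m128 LoadFloat2_Before(const float* p)
{
    __m128 x = _mm_load_ss(p);
    __m128 y = _mm_load_ss(p + 1);
    return _mm_unpacklo_ps(x, y);
}

// After: one 64-bit load; _mm_load_sd zeros the upper half, so the result is
// the same {x, y, 0, 0} with a single memory access and no shuffle.
inline __m128 LoadFloat2_After(const float* p)
{
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
}

// The store side is symmetric: one _mm_store_sd writes the low two floats.
inline void StoreFloat2_After(float* p, __m128 v)
{
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));
}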

Inc/DirectXMath.h

@ -2068,6 +2068,11 @@ XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f,
#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
#endif
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3)
@ -2164,6 +2169,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant)
#include "DirectXMathMatrix.inl"
#include "DirectXMathMisc.inl"
#ifdef __clang__
#pragma clang diagnostic pop
#endif
#ifdef _PREFAST_
#pragma prefast(pop)
#endif
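
A guess at why the -Wundefined-reinterpret-cast suppression is added here (the commit message does not say): the new load/store paths reinterpret XMFLOAT2/XMINT2-style pointers as const double*, which clang can flag under that warning. A minimal illustration with a stand-in struct:

#include <emmintrin.h>

struct Float2 { float x, y; };  // stand-in for XMFLOAT2, illustrative only

inline __m128 LoadPair(const Float2* pSource)
{
    // reinterpret_cast from an unrelated struct type to const double* is the
    // kind of cast this warning targets; the pragma push/ignored/pop around
    // the .inl includes keeps these paths warning-clean under clang.
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
}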

Inc/DirectXMathConvert.inl

@ -278,9 +278,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2
uint32x2_t zero = vdup_n_u32(0);
return vcombine_u32( x, zero );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
return _mm_unpacklo_ps( x, y );
return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
#endif
}
@ -305,8 +303,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2A
uint32x2_t zero = vdup_n_u32(0);
return vcombine_u32( x, zero );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
return _mm_castsi128_ps(V);
return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
#endif
}
@ -330,9 +327,7 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat2
float32x2_t zero = vdup_n_f32(0);
return vcombine_f32( x, zero );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( &pSource->x );
__m128 y = _mm_load_ss( &pSource->y );
return _mm_unpacklo_ps( x, y );
return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
#endif
}
@ -357,8 +352,7 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat2A
float32x2_t zero = vdup_n_f32(0);
return vcombine_f32( x, zero );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
return _mm_castsi128_ps(V);
return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
#endif
}
@ -383,9 +377,7 @@ inline XMVECTOR XM_CALLCONV XMLoadSInt2
float32x2_t zero = vdup_n_f32(0);
return vcombine_f32( v, zero );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
__m128 V = _mm_unpacklo_ps( x, y );
__m128 V = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
return _mm_cvtepi32_ps(_mm_castps_si128(V));
#endif
}
@ -411,9 +403,7 @@ inline XMVECTOR XM_CALLCONV XMLoadUInt2
float32x2_t zero = vdup_n_f32(0);
return vcombine_f32( v, zero );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
__m128 V = _mm_unpacklo_ps( x, y );
__m128 V = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
// For the values that are higher than 0x7FFFFFFF, a fixup is needed
// Determine which ones need the fix.
XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
@ -450,11 +440,13 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3
uint32x2_t zero = vdup_n_u32(0);
uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
return vcombine_u32( x, y );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
#elif defined(_XM_SSE4_INTRINSICS_)
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
return _mm_insert_ps( xy, z, 0x20 );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
__m128 xy = _mm_unpacklo_ps( x, y );
return _mm_movelh_ps( xy, z );
#endif
}
@ -479,11 +471,14 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3A
// Reads an extra integer which is zero'd
uint32x4_t V = vld1q_u32_ex( pSource, 128 );
return vsetq_lane_u32( 0, V, 3 );
#elif defined(_XM_SSE4_INTRINSICS_)
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
return _mm_insert_ps( xy, z, 0x20 );
#elif defined(_XM_SSE_INTRINSICS_)
// Reads an extra integer which is zero'd
__m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
V = _mm_and_si128( V, g_XMMask3 );
return _mm_castsi128_ps(V);
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
return _mm_movelh_ps( xy, z );
#endif
}
@ -507,11 +502,13 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3
float32x2_t zero = vdup_n_f32(0);
float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
return vcombine_f32( x, y );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( &pSource->x );
__m128 y = _mm_load_ss( &pSource->y );
#elif defined(_XM_SSE4_INTRINSICS_)
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( &pSource->z );
return _mm_insert_ps( xy, z, 0x20 );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( &pSource->z );
__m128 xy = _mm_unpacklo_ps( x, y );
return _mm_movelh_ps( xy, z );
#endif
}
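
As a sketch of the new three-element load shape shared by XMLoadInt3/XMLoadFloat3 above (simplified, using an ordinary compile-time feature check instead of the library's _XM_SSE4_INTRINSICS_ switch):

#include <emmintrin.h>
#if defined(__SSE4_1__)
#include <smmintrin.h>  // _mm_insert_ps
#endif

inline __m128 LoadFloat3_Sketch(const float* p)
{
    // x and y with one 64-bit load, z with a scalar load.
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    __m128 z  = _mm_load_ss(p + 2);
#if defined(__SSE4_1__)
    // 0x20: take lane 0 of z and insert it into lane 2 of xy.
    return _mm_insert_ps(xy, z, 0x20);
#else
    // SSE2: {x, y} stays in the low half, {z, 0} fills the high half.
    return _mm_movelh_ps(xy, z);
#endif
}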
@ -567,10 +564,8 @@ inline XMVECTOR XM_CALLCONV XMLoadSInt3
int32x4_t v = vcombine_s32( x, y );
return vcvtq_f32_s32( v );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
__m128 xy = _mm_unpacklo_ps( x, y );
__m128 V = _mm_movelh_ps( xy, z );
return _mm_cvtepi32_ps(_mm_castps_si128(V));
#endif
@ -598,10 +593,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUInt3
uint32x4_t v = vcombine_u32( x, y );
return vcvtq_f32_u32( v );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
__m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
__m128 xy = _mm_unpacklo_ps( x, y );
__m128 V = _mm_movelh_ps( xy, z );
// For the values that are higher than 0x7FFFFFFF, a fixup is needed
// Determine which ones need the fix.
@ -1318,9 +1311,7 @@ inline void XM_CALLCONV XMStoreInt2
uint32x2_t VL = vget_low_u32(V);
vst1_u32( pDestination, VL );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
#endif
}
@ -1341,7 +1332,7 @@ inline void XM_CALLCONV XMStoreInt2A
uint32x2_t VL = vget_low_u32(V);
vst1_u32_ex( pDestination, VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
#endif
}
@ -1361,9 +1352,7 @@ inline void XM_CALLCONV XMStoreFloat2
float32x2_t VL = vget_low_f32(V);
vst1_f32( reinterpret_cast<float*>(pDestination), VL );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
#endif
}
@ -1384,7 +1373,7 @@ inline void XM_CALLCONV XMStoreFloat2A
float32x2_t VL = vget_low_f32(V);
vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
#endif
}
@ -1414,9 +1403,7 @@ inline void XM_CALLCONV XMStoreSInt2
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
// Write two ints
XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
#endif
}
@ -1455,9 +1442,7 @@ inline void XM_CALLCONV XMStoreUInt2
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
// Write two uints
XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
#endif
}
@ -1479,11 +1464,9 @@ inline void XM_CALLCONV XMStoreInt3
vst1_u32( pDestination, VL );
vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(pDestination), V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
__m128 z = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), z );
#endif
}
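
The matching three-element store, sketched with plain _mm_shuffle_ps in place of the XM_PERMUTE_PS macro (which expands to the same thing when AVX is not enabled); this is an illustration of the pattern, not the library code itself:

#include <emmintrin.h>

inline void StoreFloat3_Sketch(float* p, __m128 v)
{
    // x and y with one 64-bit store.
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));
    // z: broadcast lane 2 and store the low element. The aligned variants in
    // the diff reach lane 2 with _mm_movehl_ps(V, V) instead; either works.
    __m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(p + 2, z);
}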
@ -1506,9 +1489,9 @@ inline void XM_CALLCONV XMStoreInt3A
vst1_u32_ex( pDestination, VL, 64 );
vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
_mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
__m128 z = _mm_movehl_ps( V, V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), z );
#endif
}
@ -1529,12 +1512,14 @@ inline void XM_CALLCONV XMStoreFloat3
float32x2_t VL = vget_low_f32(V);
vst1_f32( reinterpret_cast<float*>(pDestination), VL );
vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE4_INTRINSICS_)
*reinterpret_cast<int*>( &pDestination->x ) = _mm_extract_ps( V, 0 );
*reinterpret_cast<int*>( &pDestination->y ) = _mm_extract_ps( V, 1 );
*reinterpret_cast<int*>( &pDestination->z ) = _mm_extract_ps( V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T1 );
_mm_store_ss( &pDestination->z, T2 );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
__m128 z = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(&pDestination->z, z);
#endif
}
@ -1556,10 +1541,13 @@ inline void XM_CALLCONV XMStoreFloat3A
float32x2_t VL = vget_low_f32(V);
vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE4_INTRINSICS_)
_mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
*reinterpret_cast<int*>( &pDestination->z ) = _mm_extract_ps( V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_ss( &pDestination->z, T );
_mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
__m128 z = _mm_movehl_ps( V, V );
_mm_store_ss( &pDestination->z, z );
#endif
}
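
For the SSE4.1 store paths above, _mm_extract_ps returns the selected lane as its raw 32-bit pattern, which is why the diff writes it through an int pointer. A sketch of the same idea, using memcpy for the bit copy (compilers reduce it to a single store):

#include <smmintrin.h>  // SSE4.1
#include <cstring>

inline void StoreFloat3_SSE41_Sketch(float* p, __m128 v)
{
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));  // x, y
    int zBits = _mm_extract_ps(v, 2);   // lane 2 as its integer bit pattern
    std::memcpy(p + 2, &zBits, sizeof(zBits));
}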
@ -1591,11 +1579,9 @@ inline void XM_CALLCONV XMStoreSInt3
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
// Write 3 uints
XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
__m128 z = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), z );
#endif
}
@ -1636,11 +1622,9 @@ inline void XM_CALLCONV XMStoreUInt3
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
// Write 3 uints
XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
__m128 z = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), z );
#endif
}

Inc/DirectXMathVector.inl

@ -7934,12 +7934,11 @@ inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
// Unaligned input
for (; i < VectorCount; i++)
{
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
pInputVector += InputStride;
XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0));
XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0));
XMVECTOR Y = XM_PERMUTE_PS(xy,_MM_SHUFFLE(1,1,1,1));
XMVECTOR X = XM_PERMUTE_PS(xy,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
@ -8254,10 +8253,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
vTemp = _mm_div_ps( vTemp, W );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
// Result 2
@ -8272,10 +8269,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
vTemp = _mm_div_ps( vTemp, W );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
i += 2;
@ -8303,10 +8298,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
vTemp = _mm_div_ps( vTemp, W );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
}
}
@ -8315,12 +8308,11 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
// Unaligned input
for (; i < VectorCount; i++)
{
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
pInputVector += InputStride;
XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
@ -8330,10 +8322,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
vTemp = _mm_div_ps( vTemp, W );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
}
}
@ -8584,10 +8574,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
vTemp = _mm_add_ps( vTemp, vTemp2 );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
// Result 2
@ -8597,10 +8585,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
vTemp = _mm_mul_ps( Y, row1 );
vTemp2 = _mm_mul_ps( X, row0 );
vTemp = _mm_add_ps( vTemp, vTemp2 );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
i += 2;
@ -8623,10 +8609,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
vTemp = _mm_add_ps( vTemp, vTemp2 );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
}
}
@ -8635,20 +8619,17 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
// Unaligned input
for (; i < VectorCount; i++)
{
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
pInputVector += InputStride;
XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
vTemp = _mm_add_ps( vTemp, vTemp2 );
vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
_mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
_mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
pOutputVector += OutputStride;
}
}
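
Finally, a reduced sketch of the stream-transform change: X and Y are now both splatted from the single 64-bit load rather than from two separately loaded scalars (again using _mm_shuffle_ps where the library uses XM_PERMUTE_PS; not the full XMVector2TransformStream loop):

#include <emmintrin.h>

inline void SplatXYFromPacked(const float* pInputVector, __m128& X, __m128& Y)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
    X = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(0, 0, 0, 0));  // broadcast x (lane 0)
    Y = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(1, 1, 1, 1));  // broadcast y (lane 1)
    // ...then vTemp = Y*row1 + X*row0 as in the hunks above, and the two-float
    // result is written back with one _mm_store_sd.
}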