mirror of https://github.com/microsoft/DirectXMath
synced 2024-11-09 22:20:08 +00:00

Optimize SSE use of mm_load/store_ss pairs by using mm_load/store_pd (#94)

parent da730052e5
commit b0dd6fc1a5
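The pattern applied throughout this commit: where the SSE paths previously moved two adjacent 32-bit lanes with a pair of _mm_load_ss/_mm_store_ss calls plus a shuffle or unpack, they now move both lanes at once through a single 64-bit _mm_load_sd/_mm_store_sd, with _mm_castpd_ps/_mm_castps_pd as cost-free bitwise casts. A minimal standalone sketch of the before/after (function names are illustrative, not part of the library):

#include <cstdio>
#include <emmintrin.h> // SSE2

// Before: two scalar loads plus an unpack (three instructions).
__m128 LoadTwoFloats_Before(const float* p)
{
    __m128 x = _mm_load_ss(p);
    __m128 y = _mm_load_ss(p + 1);
    return _mm_unpacklo_ps(x, y);
}

// After: one 64-bit load; the cast only reinterprets bits. The double value
// itself is never operated on, so no floating-point semantics apply to it.
__m128 LoadTwoFloats_After(const float* p)
{
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
}

int main()
{
    float in[2] = { 1.0f, 2.0f };
    float a[4], b[4];
    _mm_storeu_ps(a, LoadTwoFloats_Before(in));
    _mm_storeu_ps(b, LoadTwoFloats_After(in));
    printf("before: %g %g  after: %g %g\n", a[0], a[1], b[0], b[1]);
    return 0;
}

Both variants zero the upper two lanes, so the results are bit-identical.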
@@ -2068,6 +2068,11 @@ XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f,
 #pragma prefast(disable : 26495, "Union initialization confuses /analyze")
 #endif
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#endif
+
 //------------------------------------------------------------------------------
 
 inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3)
@@ -2164,6 +2169,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant)
 #include "DirectXMathMatrix.inl"
 #include "DirectXMathMisc.inl"
 
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
 #ifdef _PREFAST_
 #pragma prefast(pop)
 #endif
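The two hunks above bracket the implementation headers with a Clang-only guard so that the reinterpret_cast-based type punning inside them compiles cleanly under -Wundefined-reinterpret-cast. A sketch of the same push/ignore/pop pattern around a hypothetical punning helper:

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif

// Hypothetical helper of the kind this warning fires on.
inline float BitsAsFloat(const unsigned int* p)
{
    return *reinterpret_cast<const float*>(p);
}

#ifdef __clang__
#pragma clang diagnostic pop
#endif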
@@ -278,9 +278,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2
     uint32x2_t zero = vdup_n_u32(0);
     return vcombine_u32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
-    return _mm_unpacklo_ps( x, y );
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -305,8 +303,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2A
     uint32x2_t zero = vdup_n_u32(0);
     return vcombine_u32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
-    return _mm_castsi128_ps(V);
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -330,9 +327,7 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat2
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( &pSource->x );
-    __m128 y = _mm_load_ss( &pSource->y );
-    return _mm_unpacklo_ps( x, y );
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -357,8 +352,7 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat2A
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( x, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
-    return _mm_castsi128_ps(V);
+    return _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
 #endif
 }
 
@@ -383,9 +377,7 @@ inline XMVECTOR XM_CALLCONV XMLoadSInt2
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( v, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
-    __m128 V = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     return _mm_cvtepi32_ps(_mm_castps_si128(V));
 #endif
 }
@@ -411,9 +403,7 @@ inline XMVECTOR XM_CALLCONV XMLoadUInt2
     float32x2_t zero = vdup_n_f32(0);
     return vcombine_f32( v, zero );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
-    __m128 V = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     // For the values that are higher than 0x7FFFFFFF, a fixup is needed
     // Determine which ones need the fix.
     XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
@@ -450,11 +440,13 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3
     uint32x2_t zero = vdup_n_u32(0);
     uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
     return vcombine_u32( x, y );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    return _mm_insert_ps( xy, z, 0x20 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     return _mm_movelh_ps( xy, z );
 #endif
 }
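In the three-element loads, the new _XM_SSE4_INTRINSICS_ branch combines the 64-bit xy load with _mm_insert_ps. The immediate 0x20 selects source lane 0 (bits 7:6), destination lane 2 (bits 5:4), and zeroes nothing (bits 3:0), so z lands in lane 2 while lane 3 keeps the zero produced by _mm_load_sd. A self-contained sketch of both branches (names are illustrative):

#include <smmintrin.h> // SSE4.1 (also pulls in the SSE2 intrinsics used below)

// SSE4.1 path: insertps places z's lane 0 into lane 2 of xy (imm8 = 0x20).
__m128 LoadThreeFloats_Sse4(const float* p)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    __m128 z  = _mm_load_ss(p + 2);
    return _mm_insert_ps(xy, z, 0x20);
}

// SSE2 fallback: movelh builds { xy[0], xy[1], z[0], z[1] } = { x, y, z, 0 }.
__m128 LoadThreeFloats_Sse2(const float* p)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    __m128 z  = _mm_load_ss(p + 2);
    return _mm_movelh_ps(xy, z);
}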
@@ -479,11 +471,14 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3A
     // Reads an extra integer which is zero'd
     uint32x4_t V = vld1q_u32_ex( pSource, 128 );
     return vsetq_lane_u32( 0, V, 3 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    return _mm_insert_ps( xy, z, 0x20 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    // Reads an extra integer which is zero'd
-    __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
-    V = _mm_and_si128( V, g_XMMask3 );
-    return _mm_castsi128_ps(V);
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    return _mm_movelh_ps( xy, z );
 #endif
 }
 
@@ -507,11 +502,13 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3
     float32x2_t zero = vdup_n_f32(0);
     float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
     return vcombine_f32( x, y );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( &pSource->x );
-    __m128 y = _mm_load_ss( &pSource->y );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
+    __m128 z = _mm_load_ss( &pSource->z );
+    return _mm_insert_ps( xy, z, 0x20 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( &pSource->z );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     return _mm_movelh_ps( xy, z );
 #endif
 }
@@ -567,10 +564,8 @@ inline XMVECTOR XM_CALLCONV XMLoadSInt3
     int32x4_t v = vcombine_s32( x, y );
     return vcvtq_f32_s32( v );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     __m128 V = _mm_movelh_ps( xy, z );
     return _mm_cvtepi32_ps(_mm_castps_si128(V));
 #endif
@@ -598,10 +593,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUInt3
     uint32x4_t v = vcombine_u32( x, y );
     return vcvtq_f32_u32( v );
 #elif defined(_XM_SSE_INTRINSICS_)
-    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
-    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 xy = _mm_castpd_ps( _mm_load_sd( reinterpret_cast<const double*>( pSource ) ) );
     __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
-    __m128 xy = _mm_unpacklo_ps( x, y );
     __m128 V = _mm_movelh_ps( xy, z );
     // For the values that are higher than 0x7FFFFFFF, a fixup is needed
     // Determine which ones need the fix.
@@ -1318,9 +1311,7 @@ inline void XM_CALLCONV XMStoreInt2
     uint32x2_t VL = vget_low_u32(V);
     vst1_u32( pDestination, VL );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
 #endif
 }
 
@@ -1341,7 +1332,7 @@ inline void XM_CALLCONV XMStoreInt2A
     uint32x2_t VL = vget_low_u32(V);
     vst1_u32_ex( pDestination, VL, 64 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
 #endif
 }
 
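The store side mirrors the load side: one _mm_store_sd writes lanes 0 and 1 in a single instruction, replacing either a shuffle plus two _mm_store_ss calls (unaligned case) or an _mm_storel_epi64 (aligned case, which needlessly routed float data through the integer domain). A sketch, with _mm_shuffle_ps standing in for the non-AVX expansion of XM_PERMUTE_PS:

#include <emmintrin.h> // SSE2

// Before: splat lane 1, then two scalar stores.
void StoreTwoFloats_Before(float* p, __m128 v)
{
    __m128 t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(p, v);
    _mm_store_ss(p + 1, t);
}

// After: one 64-bit store of the low two lanes; the cast is bitwise only.
void StoreTwoFloats_After(float* p, __m128 v)
{
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));
}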
@@ -1361,9 +1352,7 @@ inline void XM_CALLCONV XMStoreFloat2
     float32x2_t VL = vget_low_f32(V);
     vst1_f32( reinterpret_cast<float*>(pDestination), VL );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
 #endif
 }
 
@@ -1384,7 +1373,7 @@ inline void XM_CALLCONV XMStoreFloat2A
     float32x2_t VL = vget_low_f32(V);
     vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
 #endif
 }
 
@@ -1414,9 +1403,7 @@ inline void XM_CALLCONV XMStoreSInt2
     vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
     vOverflow = _mm_or_ps(vOverflow,vResult);
     // Write two ints
-    XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
 #endif
 }
 
@@ -1455,9 +1442,7 @@ inline void XM_CALLCONV XMStoreUInt2
     // On those that are too large, set to 0xFFFFFFFF
     vResult = _mm_or_ps(vResult,vOverflow);
     // Write two uints
-    XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
 #endif
 }
 
@@ -1479,11 +1464,9 @@ inline void XM_CALLCONV XMStoreInt3
     vst1_u32( pDestination, VL );
     vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), z );
 #endif
 }
 
@@ -1506,9 +1489,9 @@ inline void XM_CALLCONV XMStoreInt3A
     vst1_u32_ex( pDestination, VL, 64 );
     vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
+    __m128 z = _mm_movehl_ps( V, V );
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), z );
 #endif
 }
 
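For three lanes, the rewrite pairs the 64-bit store with a single extraction of lane 2. The aligned variants use _mm_movehl_ps(V, V), which copies the high pair into the low pair without a shuffle immediate; the unaligned variants keep XM_PERMUTE_PS. A sketch of the aligned shape (name illustrative):

#include <emmintrin.h> // SSE2

// Lanes 0-1 go out in one 64-bit store; movehl brings lane 2 down to lane 0
// so one scalar store finishes the job (two stores total, down from three).
void StoreThreeFloats(float* p, __m128 v)
{
    _mm_store_sd(reinterpret_cast<double*>(p), _mm_castps_pd(v));
    __m128 z = _mm_movehl_ps(v, v);
    _mm_store_ss(p + 2, z);
}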
@@ -1529,12 +1512,14 @@ inline void XM_CALLCONV XMStoreFloat3
     float32x2_t VL = vget_low_f32(V);
     vst1_f32( reinterpret_cast<float*>(pDestination), VL );
     vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *reinterpret_cast<int*>( &pDestination->x ) = _mm_extract_ps( V, 0 );
+    *reinterpret_cast<int*>( &pDestination->y ) = _mm_extract_ps( V, 1 );
+    *reinterpret_cast<int*>( &pDestination->z ) = _mm_extract_ps( V, 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T1 );
-    _mm_store_ss( &pDestination->z, T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(&pDestination->z, z);
 #endif
 }
 
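The new _XM_SSE4_INTRINSICS_ branch in XMStoreFloat3 leans on _mm_extract_ps, which returns the raw 32-bit pattern of a lane as an int; storing that result through an int* lets the compiler emit extractps straight to memory. A sketch (function name illustrative):

#include <smmintrin.h> // SSE4.1

// Each extractps moves one lane's bit pattern directly to memory;
// no float interpretation happens on the scalar side.
void StoreThreeFloats_Sse4(float* p, __m128 v)
{
    *reinterpret_cast<int*>(p)     = _mm_extract_ps(v, 0);
    *reinterpret_cast<int*>(p + 1) = _mm_extract_ps(v, 1);
    *reinterpret_cast<int*>(p + 2) = _mm_extract_ps(v, 2);
}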
@@ -1556,10 +1541,13 @@ inline void XM_CALLCONV XMStoreFloat3A
     float32x2_t VL = vget_low_f32(V);
     vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
     vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
+    *reinterpret_cast<int*>( &pDestination->z ) = _mm_extract_ps( V, 2 );
 #elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( &pDestination->z, T );
+    _mm_store_sd( reinterpret_cast<double*>(pDestination), _mm_castps_pd(V) );
+    __m128 z = _mm_movehl_ps( V, V );
+    _mm_store_ss( &pDestination->z, z );
 #endif
 }
 
@@ -1591,11 +1579,9 @@ inline void XM_CALLCONV XMStoreSInt3
     vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
     vOverflow = _mm_or_ps(vOverflow,vResult);
     // Write 3 uints
-    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
+    __m128 z = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), z );
 #endif
 }
 
@@ -1636,11 +1622,9 @@ inline void XM_CALLCONV XMStoreUInt3
     // On those that are too large, set to 0xFFFFFFFF
     vResult = _mm_or_ps(vResult,vOverflow);
     // Write 3 uints
-    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
+    __m128 z = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), z );
 #endif
 }
 
@@ -7934,12 +7934,11 @@ inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
         // Unaligned input
         for (; i < VectorCount; i++)
         {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
             pInputVector += InputStride;
 
-            XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0));
-            XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0));
+            XMVECTOR Y = XM_PERMUTE_PS(xy,_MM_SHUFFLE(1,1,1,1));
+            XMVECTOR X = XM_PERMUTE_PS(xy,_MM_SHUFFLE(0,0,0,0));
 
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
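In the stream transforms, the unaligned loop previously loaded x and y separately and splatted each from lane 0. With both packed into one register by the 64-bit load, Y simply splats from lane 1 instead, saving a load per iteration. A sketch of the new load-and-splat step (names illustrative; _mm_shuffle_ps stands in for the non-AVX XM_PERMUTE_PS):

#include <emmintrin.h> // SSE2

// One 64-bit load puts x in lane 0 and y in lane 1; each splat is then a
// single shuffle of the same register.
void LoadAndSplatXY(const float* p, __m128* X, __m128* Y)
{
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(p)));
    *X = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(0, 0, 0, 0)); // x in all four lanes
    *Y = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(1, 1, 1, 1)); // y in all four lanes
}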
@@ -8254,10 +8253,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             // Result 2
@@ -8272,10 +8269,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             i += 2;
@@ -8303,10 +8298,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
@@ -8315,12 +8308,11 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
         // Unaligned input
         for (; i < VectorCount; i++)
         {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
             pInputVector += InputStride;
 
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
 
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
@@ -8330,10 +8322,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
 
             vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
@@ -8584,10 +8574,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
             vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             // Result 2
@@ -8597,10 +8585,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             vTemp = _mm_mul_ps( Y, row1 );
             vTemp2 = _mm_mul_ps( X, row0 );
             vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
 
             i += 2;
@@ -8623,10 +8609,8 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
            vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
@@ -8635,20 +8619,17 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
         // Unaligned input
         for (; i < VectorCount; i++)
        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
             pInputVector += InputStride;
 
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
 
             XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
             XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
             vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
 
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
             pOutputVector += OutputStride;
         }
     }
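Taken together, the stream hunks shrink each unaligned 2D iteration to one 64-bit load, two shuffles, the arithmetic, and one 64-bit store. A distilled, self-contained version of the normal-transform loop, under the simplifying assumption of a fixed two-float stride (the real code honors arbitrary InputStride/OutputStride):

#include <cstddef>
#include <emmintrin.h> // SSE2

// Transforms 2D normals by a 2x2 matrix held as two row vectors
// (row0 = { m00, m01, *, * }, row1 = { m10, m11, *, * }).
void TransformNormal2D(float* out, const float* in, std::size_t count,
                       __m128 row0, __m128 row1)
{
    for (std::size_t i = 0; i < count; ++i)
    {
        __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(in)));
        __m128 X  = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(0, 0, 0, 0));
        __m128 Y  = _mm_shuffle_ps(xy, xy, _MM_SHUFFLE(1, 1, 1, 1));
        __m128 v  = _mm_add_ps(_mm_mul_ps(Y, row1), _mm_mul_ps(X, row0));
        // Lanes 0 and 1 hold the transformed x and y; write both at once.
        _mm_store_sd(reinterpret_cast<double*>(out), _mm_castps_pd(v));
        in  += 2;
        out += 2;
    }
}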