Mirror of https://github.com/microsoft/DirectXMath (synced 2024-11-21 20:00:12 +00:00)
AVX / AVX2 optimizations
This commit is contained in:
parent
ef45bb75fa
commit
3af6a349bb
@ -38,7 +38,30 @@
|
||||
#define XM_CTOR_DEFAULT =default;
|
||||
#endif
|
||||
|
||||
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#define _XM_F16C_INTRINSICS_
|
||||
#endif
|
||||
|
||||
#ifdef _XM_F16C_INTRINSICS_
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1700)
|
||||
#error DirectX Math use of F16C intrinsics requires Visual C++ 2012 or later.
|
||||
#endif
|
||||
#ifndef _XM_AVX_INTRINSICS_
|
||||
#define _XM_AVX_INTRINSICS_
|
||||
#endif
|
||||
#endif // _XM_F16C_INTRINSICS_
|
||||
|
||||
#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#define _XM_AVX_INTRINSICS_
|
||||
#endif
|
||||
|
||||
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
|
||||
#define _XM_SSE4_INTRINSICS_
|
||||
#endif
|
||||
|
||||
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
|
||||
#define _XM_SSE_INTRINSICS_
|
||||
#endif
|
||||
|
||||
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
#if defined(_M_IX86) || defined(_M_X64)
|
||||
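The hunk above chains the opt-in macros: an AVX2 build (__AVX2__) turns on _XM_F16C_INTRINSICS_, which forces _XM_AVX_INTRINSICS_, which in turn forces _XM_SSE4_INTRINSICS_ and then _XM_SSE_INTRINSICS_. A minimal sketch of a throwaway translation unit (hypothetical, not part of the library) that asserts the chain holds for whatever /arch or -m flags are used:

#include <DirectXMath.h>

#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#error F16C should imply AVX per the cascade above
#endif
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
#error AVX should imply SSE4.1 per the cascade above
#endif
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
#error SSE4.1 should imply SSE per the cascade above
#endif

int main() { return 0; }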
@ -77,7 +100,17 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include <sal.h>
|
||||
#include <assert.h>
|
||||
@ -129,7 +162,11 @@
|
||||
#define XM_SFENCE() _mm_sfence()
|
||||
#endif
|
||||
|
||||
#if defined(_XM_AVX_INTRINSICS_)
|
||||
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
|
||||
#else
|
||||
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
|
||||
#endif
|
||||
|
||||
#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
|
||||
|
||||
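XM_PERMUTE_PS now prefers _mm_permute_ps (the single-operand AVX VPERMILPS form) and falls back to _mm_shuffle_ps(v, v, c) otherwise; the two produce identical results for any immediate, so callers do not change. A small sketch of the equivalence (assumes an AVX-capable compile):

#include <immintrin.h>

__m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);               // lanes {1, 2, 3, 4}
__m128 a = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); // {4, 3, 2, 1}
__m128 b = _mm_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));    // same {4, 3, 2, 1}, one source operand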
@ -1506,6 +1543,22 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
|
||||
|
||||
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
|
||||
#endif
|
||||
|
||||
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
|
||||
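For compile-time permutes where every lane comes straight from V1 (index i) or V2 (index i + 4), SSE4.1 _mm_blend_ps does the whole job: mask bit i is set exactly when lane i is taken from V2, which is how each immediate above is derived (for example <4,1,6,3> pulls lanes 0 and 2 from V2, giving 0x5). An illustrative constexpr helper, not part of the library, that reproduces the mapping:

constexpr int BlendMaskFromPermute(unsigned x, unsigned y, unsigned z, unsigned w)
{
    // bit i set <=> lane i comes from V2 (i.e. the index is >= 4)
    return (x >= 4 ? 0x1 : 0) | (y >= 4 ? 0x2 : 0) | (z >= 4 ? 0x4 : 0) | (w >= 4 ? 0x8 : 0);
}
static_assert(BlendMaskFromPermute(4, 1, 6, 3) == 0x5, "lanes 0 and 2 from V2");
static_assert(BlendMaskFromPermute(0, 5, 6, 7) == 0xE, "lanes 1, 2 and 3 from V2");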
@ -1570,6 +1623,10 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
|
||||
// Specialized swizzles
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
|
||||
|
||||
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
|
||||
#endif
|
||||
|
||||
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
|
||||
|
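The two swizzle specializations map onto the SSE3 duplicate instructions, which are always available when the SSE4 path is enabled: MOVSLDUP duplicates the even lanes (swizzle <0,0,2,2>) and MOVSHDUP duplicates the odd lanes (<1,1,3,3>). A short sketch:

#include <pmmintrin.h>

__m128 v  = _mm_set_ps(4.f, 3.f, 2.f, 1.f);   // lanes {1, 2, 3, 4}
__m128 lo = _mm_moveldup_ps(v);               // {1, 1, 3, 3} == XMVectorSwizzle<0,0,2,2>(v)
__m128 hi = _mm_movehdup_ps(v);               // {2, 2, 4, 4} == XMVectorSwizzle<1,1,3,3>(v)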
@ -1092,6 +1092,10 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst
|
||||
XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
|
||||
return XMVectorMultiply(P, Result);
|
||||
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f );
|
||||
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
|
||||
return _mm_mul_ps(vResult, P);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product
|
||||
XMVECTOR vDot = _mm_mul_ps(P,P);
|
||||
@ -1138,6 +1142,18 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
XMVECTOR vLength = XMVector3ReciprocalLength(P);
|
||||
return XMVectorMultiply( P, vLength );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f );
|
||||
// Prepare for the division
|
||||
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
|
||||
// Failsafe on zero (Or epsilon) length planes
|
||||
// If the length is infinity, set the elements to zero
|
||||
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
|
||||
// Reciprocal mul to perform the normalization
|
||||
vResult = _mm_div_ps(P,vResult);
|
||||
// Any that are infinity, set to zero
|
||||
vResult = _mm_and_ps(vResult,vLengthSq);
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y and z only
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
|
||||
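The SSE4 plane paths lean on _mm_dp_ps(P, P, 0x7f): the high nibble (0x7) selects which lanes enter the dot product — x, y and z only, since a plane's w holds the distance term — and the low nibble (0xF) broadcasts the single sum into every lane of the result. Spelled out in scalar form (illustration only, with px..pw standing in for the lanes of P):

float sum = px * px + py * py + pz * pz;      // high nibble 0x7: w excluded
float result[4] = { sum, sum, sum, sum };     // low nibble 0xF: broadcast to all four lanes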
@ -1967,6 +1983,35 @@ inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb )
|
||||
inline bool XMVerifyCPUSupport()
|
||||
{
|
||||
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
#if defined(_XM_F16C_INTRINSICS_) || defined(_XM_AVX_INTRINSICS_)
|
||||
int avxCPUInfo[4] = {-1};
|
||||
__cpuid( avxCPUInfo, 0 );
|
||||
|
||||
if ( avxCPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(avxCPUInfo, 1 );
|
||||
|
||||
#ifdef _XM_F16C_INTRINSICS_
|
||||
if ( (avxCPUInfo[2] & 0x38000000 ) != 0x38000000 )
|
||||
return false; // No F16C/AVX/OSXSAVE support
|
||||
#else
|
||||
if ( (avxCPUInfo[2] & 0x18000000 ) != 0x18000000 )
|
||||
return false; // No AVX/OSXSAVE support
|
||||
#endif
|
||||
#endif
|
||||
#ifdef _XM_SSE4_INTRINSICS_
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
|
||||
if ( (CPUInfo[2] & 0x80001) != 0x80001 )
|
||||
return false; // Missing SSE3 or SSE 4.1 support
|
||||
#endif
|
||||
#if defined(_M_X64)
|
||||
// The X64 processor model requires SSE2 support
|
||||
return true;
|
||||
|
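XMVerifyCPUSupport now also checks the CPUID leaf-1 ECX feature bits that the new code paths rely on. For reference, the masks used above decode as in this sketch (assumes MSVC's __cpuid from <intrin.h>):

#include <intrin.h>

// leaf 1, ECX: bit 0 SSE3, bit 19 SSE4.1        -> 0x00080001 (SSE4 path check)
//              bit 27 OSXSAVE, bit 28 AVX       -> 0x18000000 (AVX path check)
//              bit 29 F16C plus the two above   -> 0x38000000 (F16C path check)
inline bool HasAvxAndF16C()
{
    int info[4] = { -1 };
    __cpuid(info, 0);
    if (info[0] < 1)
        return false;
    __cpuid(info, 1);
    return (info[2] & 0x38000000) == 0x38000000;
}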
@ -143,6 +143,8 @@ inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr
|
||||
return vResult;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vld1q_dup_f32( pValue );
|
||||
#elif defined(_XM_AVX_INTRINSICS_)
|
||||
return _mm_broadcast_ss( pValue );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return _mm_load_ps1( pValue );
|
||||
#endif
|
||||
@ -508,6 +510,8 @@ inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
|
||||
*y = V.vector4_f32[1];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_lane_f32(y,V,1);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
*((int*)y) = _mm_extract_ps( V, 1 );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
|
||||
_mm_store_ss(y,vResult);
|
||||
@ -523,6 +527,8 @@ inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
|
||||
*z = V.vector4_f32[2];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_lane_f32(z,V,2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
*((int*)z) = _mm_extract_ps( V, 2 );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
|
||||
_mm_store_ss(z,vResult);
|
||||
@ -538,6 +544,8 @@ inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V)
|
||||
*w = V.vector4_f32[3];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_lane_f32(w,V,3);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
*((int*)w) = _mm_extract_ps( V, 3 );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
|
||||
_mm_store_ss(w,vResult);
|
||||
@ -582,6 +590,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
|
||||
return V.vector4_u32[1];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vgetq_lane_u32(V, 1);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i V1 = _mm_castps_si128( V );
|
||||
return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
|
||||
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
|
||||
@ -595,6 +606,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
|
||||
return V.vector4_u32[2];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vgetq_lane_u32(V, 2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i V1 = _mm_castps_si128( V );
|
||||
return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
|
||||
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
|
||||
@ -608,6 +622,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
|
||||
return V.vector4_u32[3];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vgetq_lane_u32(V, 3);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i V1 = _mm_castps_si128( V );
|
||||
return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
|
||||
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
|
||||
@ -657,6 +674,9 @@ inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
|
||||
*y = V.vector4_u32[1];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i V1 = _mm_castps_si128( V );
|
||||
*y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
|
||||
_mm_store_ss(reinterpret_cast<float *>(y),vResult);
|
||||
@ -672,6 +692,9 @@ inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
|
||||
*z = V.vector4_u32[2];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i V1 = _mm_castps_si128( V );
|
||||
*z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
|
||||
_mm_store_ss(reinterpret_cast<float *>(z),vResult);
|
||||
@ -687,6 +710,9 @@ inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
|
||||
*w = V.vector4_u32[3];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i V1 = _mm_castps_si128( V );
|
||||
*w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
|
||||
_mm_store_ss(reinterpret_cast<float *>(w),vResult);
|
||||
@ -749,6 +775,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
|
||||
return U;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vsetq_lane_f32(y,V,1);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vResult = _mm_set_ss(y);
|
||||
vResult = _mm_insert_ps( V, vResult, 0x10 );
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Swap y and x
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
|
||||
@ -773,6 +803,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
|
||||
return U;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vsetq_lane_f32(z,V,2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vResult = _mm_set_ss(z);
|
||||
vResult = _mm_insert_ps( V, vResult, 0x20 );
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Swap z and x
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
|
||||
@ -798,6 +832,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
|
||||
return U;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vsetq_lane_f32(w,V,3);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vResult = _mm_set_ss(w);
|
||||
vResult = _mm_insert_ps( V, vResult, 0x30 );
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Swap w and x
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
|
||||
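XMVectorSetY/Z/W use _mm_insert_ps on the SSE4 path (the integer variants below use the simpler _mm_insert_epi32). The _mm_insert_ps control byte packs three fields: bits 7:6 pick the source lane of the second operand, bits 5:4 pick the destination lane, and bits 3:0 can zero lanes — which is why the immediates step through 0x10, 0x20, 0x30. A small sketch:

__m128 v = _mm_set_ps(40.f, 30.f, 20.f, 10.f);          // lanes {10, 20, 30, 40}
__m128 y = _mm_insert_ps(v, _mm_set_ss(99.f), 0x10);    // {10, 99, 30, 40}: src lane 0 -> dst lane 1 (SetY)
__m128 w = _mm_insert_ps(v, _mm_set_ss(99.f), 0x30);    // {10, 20, 30, 99}: src lane 0 -> dst lane 3 (SetW)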
@ -998,6 +1036,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
|
||||
return U;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vsetq_lane_u32(y,V,1);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i vResult = _mm_castps_si128( V );
|
||||
vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
|
||||
return _mm_castsi128_ps( vResult );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Swap y and x
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
|
||||
@ -1023,6 +1065,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
|
||||
return U;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vsetq_lane_u32(z,V,2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i vResult = _mm_castps_si128( V );
|
||||
vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
|
||||
return _mm_castsi128_ps( vResult );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Swap z and x
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
|
||||
@ -1048,6 +1094,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
|
||||
return U;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vsetq_lane_u32(w,V,3);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128i vResult = _mm_castps_si128( V );
|
||||
vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
|
||||
return _mm_castsi128_ps( vResult );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Swap w and x
|
||||
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
|
||||
@ -1233,6 +1283,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSwizzle
|
||||
const uint8x8_t rH = vtbl2_u8( tbl, idx );
|
||||
|
||||
return vcombine_f32( rL, rH );
|
||||
#elif defined(_XM_AVX_INTRINSICS_)
|
||||
unsigned int elem[4] = { E0, E1, E2, E3 };
|
||||
__m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
|
||||
return _mm_permutevar_ps( V, vControl );
|
||||
#else
|
||||
const uint32_t *aPtr = (const uint32_t* )(&V);
|
||||
|
||||
@ -1288,6 +1342,22 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute
|
||||
const uint8x8_t rH = vtbl4_u8( tbl, idx );
|
||||
|
||||
return vcombine_f32( rL, rH );
|
||||
#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
static const XMVECTORU32 three = { 3, 3, 3, 3 };
|
||||
|
||||
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
|
||||
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
|
||||
|
||||
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
|
||||
vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
|
||||
|
||||
__m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
|
||||
__m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
|
||||
|
||||
__m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
|
||||
__m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
|
||||
|
||||
return _mm_or_ps( masked1, masked2 );
|
||||
#else
|
||||
|
||||
const uint32_t *aPtr[2];
|
||||
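The non-template XMVectorSwizzle/XMVectorPermute overloads take runtime indices, so their AVX paths switch to _mm_permutevar_ps, which reads lane indices from a vector rather than from an immediate; the permute version additionally splits each index into a V1/V2 select (index > 3) and a lane number (index & 3), then merges the two permuted results with andnot/and/or. Usage sketch for the swizzle half:

__m128  v   = _mm_set_ps(4.f, 3.f, 2.f, 1.f);    // lanes {1, 2, 3, 4}
__m128i idx = _mm_set_epi32(0, 1, 2, 3);         // result lane i = v[index in lane i]
__m128  r   = _mm_permutevar_ps(v, idx);         // {4, 3, 2, 1}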
@ -2302,6 +2372,8 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
|
||||
uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
|
||||
XMVECTOR vResult = vbslq_f32( mask, R1, V );
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128 sign = _mm_and_ps( V, g_XMNegativeZero );
|
||||
__m128 sMagic = _mm_or_ps( g_XMNoFraction, sign );
|
||||
@ -2361,6 +2433,8 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
|
||||
// All numbers less than 8388608 will use the round to int
|
||||
// All others, use the ORIGINAL value
|
||||
return vbslq_f32( vTest, vResult, V );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// To handle NAN, INF and numbers greater than 8388608, use masking
|
||||
// Get the abs value
|
||||
@ -2407,6 +2481,8 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
|
||||
// All numbers less than 8388608 will use the round to int
|
||||
// All others, use the ORIGINAL value
|
||||
return vbslq_f32( vTest, vResult, V );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_floor_ps( V );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// To handle NAN, INF and numbers greater than 8388608, use masking
|
||||
__m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
|
||||
@ -2454,6 +2530,8 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
|
||||
// All numbers less than 8388608 will use the round to int
|
||||
// All others, use the ORIGINAL value
|
||||
return vbslq_f32( vTest, vResult, V );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_ceil_ps( V );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// To handle NAN, INF and numbers greater than 8388608, use masking
|
||||
__m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
|
||||
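On SSE4.1 the round/truncate/floor/ceiling family collapses to the ROUNDPS forms instead of the 2^23 (8388608) magic-number masking used by the plain SSE path. The mapping, as a sketch:

__m128 V  = _mm_set_ps(-1.5f, 2.5f, -0.5f, 0.5f);
__m128 r0 = _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);  // XMVectorRound
__m128 r1 = _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);         // XMVectorTruncate
__m128 r2 = _mm_floor_ps(V);                                                 // XMVectorFloor
__m128 r3 = _mm_ceil_ps(V);                                                  // XMVectorCeiling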
@ -6649,6 +6727,8 @@ inline XMVECTOR XM_CALLCONV XMVector2Dot
|
||||
float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) );
|
||||
vTemp = vpadd_f32( vTemp, vTemp );
|
||||
return vcombine_f32( vTemp, vTemp );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_dp_ps( V1, V2, 0x3f );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V1,V2);
|
||||
@ -6733,6 +6813,9 @@ inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst
|
||||
// Reciprocal sqrt (estimate)
|
||||
vTemp = vrsqrte_f32( vTemp );
|
||||
return vcombine_f32( vTemp, vTemp );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
|
||||
return _mm_rsqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -6774,6 +6857,10 @@ inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength
|
||||
float32x2_t R1 = vrsqrts_f32( P1, S1 );
|
||||
float32x2_t Result = vmul_f32( S1, R1 );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
|
||||
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
|
||||
return _mm_div_ps( g_XMOne, vLengthSq );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -6814,6 +6901,9 @@ inline XMVECTOR XM_CALLCONV XMVector2LengthEst
|
||||
Result = vmul_f32( vTemp, Result );
|
||||
Result = vbsl_f32( VEqualsZero, zero, Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
|
||||
return _mm_sqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -6859,6 +6949,9 @@ inline XMVECTOR XM_CALLCONV XMVector2Length
|
||||
Result = vmul_f32( vTemp, Result );
|
||||
Result = vbsl_f32( VEqualsZero, zero, Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
|
||||
return _mm_sqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -6898,6 +6991,10 @@ inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst
|
||||
// Normalize
|
||||
float32x2_t Result = vmul_f32( VL, vTemp );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
|
||||
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
|
||||
return _mm_mul_ps(vResult, V);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -6955,6 +7052,26 @@ inline XMVECTOR XM_CALLCONV XMVector2Normalize
|
||||
Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
|
||||
Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
|
||||
// Prepare for the division
|
||||
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
|
||||
// Create zero with a single instruction
|
||||
XMVECTOR vZeroMask = _mm_setzero_ps();
|
||||
// Test for a divide by zero (Must be FP to detect -0.0)
|
||||
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
|
||||
// Failsafe on zero (Or epsilon) length planes
|
||||
// If the length is infinity, set the elements to zero
|
||||
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
|
||||
// Reciprocal mul to perform the normalization
|
||||
vResult = _mm_div_ps(V,vResult);
|
||||
// Any that are infinity, set to zero
|
||||
vResult = _mm_and_ps(vResult,vZeroMask);
|
||||
// Select qnan or result based on infinite length
|
||||
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
|
||||
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
|
||||
vResult = _mm_or_ps(vTemp1,vTemp2);
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x and y only
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
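The 2D functions use the dot-product mask 0x3f (only x and y feed the sum), and XMVector2Normalize keeps the same failsafe as the other paths: zero-length input yields zero, infinite-length input yields QNaN. A scalar restatement of that failsafe (illustration only, not the library's code):

#include <cmath>
#include <limits>

void Normalize2(float x, float y, float& nx, float& ny)
{
    float lenSq = x * x + y * y;              // _mm_dp_ps(V, V, 0x3f): x and y only
    float len   = std::sqrt(lenSq);
    nx = ny = 0.0f;                           // zero-length input stays zero
    if (len > 0.0f) { nx = x / len; ny = y / len; }
    if (std::isinf(lenSq))                    // infinite length selects QNaN
        nx = ny = std::numeric_limits<float>::quiet_NaN();
}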
@ -8902,6 +9019,8 @@ inline XMVECTOR XM_CALLCONV XMVector3Dot
|
||||
v2 = vdup_lane_f32( v2, 0 );
|
||||
v1 = vadd_f32( v1, v2 );
|
||||
return vcombine_f32( v1, v1 );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_dp_ps( V1, V2, 0x7f );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product
|
||||
XMVECTOR vDot = _mm_mul_ps(V1,V2);
|
||||
@ -9007,6 +9126,9 @@ inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst
|
||||
// Reciprocal sqrt (estimate)
|
||||
v2 = vrsqrte_f32( v1 );
|
||||
return vcombine_f32(v2, v2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
|
||||
return _mm_rsqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y and z
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -9059,6 +9181,10 @@ inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength
|
||||
float32x2_t R1 = vrsqrts_f32( P1, S1 );
|
||||
float32x2_t Result = vmul_f32( S1, R1 );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
|
||||
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
|
||||
return _mm_div_ps( g_XMOne, vLengthSq );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product
|
||||
XMVECTOR vDot = _mm_mul_ps(V,V);
|
||||
@ -9111,6 +9237,9 @@ inline XMVECTOR XM_CALLCONV XMVector3LengthEst
|
||||
Result = vmul_f32( v1, Result );
|
||||
Result = vbsl_f32( VEqualsZero, zero, Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
|
||||
return _mm_sqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y and z
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -9167,6 +9296,9 @@ inline XMVECTOR XM_CALLCONV XMVector3Length
|
||||
Result = vmul_f32( v1, Result );
|
||||
Result = vbsl_f32( VEqualsZero, zero, Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
|
||||
return _mm_sqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y and z
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -9214,6 +9346,10 @@ inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst
|
||||
v2 = vrsqrte_f32( v1 );
|
||||
// Normalize
|
||||
return vmulq_f32( V, vcombine_f32(v2,v2) );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
|
||||
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
|
||||
return _mm_mul_ps(vResult, V);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product
|
||||
XMVECTOR vDot = _mm_mul_ps(V,V);
|
||||
@ -9282,6 +9418,26 @@ inline XMVECTOR XM_CALLCONV XMVector3Normalize
|
||||
XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
|
||||
vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
|
||||
return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
|
||||
// Prepare for the division
|
||||
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
|
||||
// Create zero with a single instruction
|
||||
XMVECTOR vZeroMask = _mm_setzero_ps();
|
||||
// Test for a divide by zero (Must be FP to detect -0.0)
|
||||
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
|
||||
// Failsafe on zero (Or epsilon) length planes
|
||||
// If the length is infinity, set the elements to zero
|
||||
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
|
||||
// Divide to perform the normalization
|
||||
vResult = _mm_div_ps(V,vResult);
|
||||
// Any that are infinity, set to zero
|
||||
vResult = _mm_and_ps(vResult,vZeroMask);
|
||||
// Select qnan or result based on infinite length
|
||||
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
|
||||
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
|
||||
vResult = _mm_or_ps(vTemp1,vTemp2);
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y and z only
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -12743,6 +12899,8 @@ inline XMVECTOR XM_CALLCONV XMVector4Dot
|
||||
v2 = vpadd_f32( v2, v2 );
|
||||
v1 = vadd_f32( v1, v2 );
|
||||
return vcombine_f32( v1, v1 );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_dp_ps( V1, V2, 0xff );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vTemp2 = V2;
|
||||
XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
|
||||
@ -12940,6 +13098,9 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst
|
||||
// Reciprocal sqrt (estimate)
|
||||
v2 = vrsqrte_f32( v1 );
|
||||
return vcombine_f32(v2, v2);
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
|
||||
return _mm_rsqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y,z and w
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -12994,6 +13155,10 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength
|
||||
float32x2_t R1 = vrsqrts_f32( P1, S1 );
|
||||
float32x2_t Result = vmul_f32( S1, R1 );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
|
||||
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
|
||||
return _mm_div_ps( g_XMOne, vLengthSq );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y,z and w
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -13048,6 +13213,9 @@ inline XMVECTOR XM_CALLCONV XMVector4LengthEst
|
||||
Result = vmul_f32( v1, Result );
|
||||
Result = vbsl_f32( VEqualsZero, zero, Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
|
||||
return _mm_sqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y,z and w
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -13106,6 +13274,9 @@ inline XMVECTOR XM_CALLCONV XMVector4Length
|
||||
Result = vmul_f32( v1, Result );
|
||||
Result = vbsl_f32( VEqualsZero, zero, Result );
|
||||
return vcombine_f32( Result, Result );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
|
||||
return _mm_sqrt_ps( vTemp );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y,z and w
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -13155,6 +13326,10 @@ inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst
|
||||
v2 = vrsqrte_f32( v1 );
|
||||
// Normalize
|
||||
return vmulq_f32( V, vcombine_f32(v2,v2) );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
|
||||
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
|
||||
return _mm_mul_ps(vResult, V);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y,z and w
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
@ -13225,6 +13400,26 @@ inline XMVECTOR XM_CALLCONV XMVector4Normalize
|
||||
XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
|
||||
vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
|
||||
return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff );
|
||||
// Prepare for the division
|
||||
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
|
||||
// Create zero with a single instruction
|
||||
XMVECTOR vZeroMask = _mm_setzero_ps();
|
||||
// Test for a divide by zero (Must be FP to detect -0.0)
|
||||
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
|
||||
// Failsafe on zero (Or epsilon) length planes
|
||||
// If the length is infinity, set the elements to zero
|
||||
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
|
||||
// Divide to perform the normalization
|
||||
vResult = _mm_div_ps(V,vResult);
|
||||
// Any that are infinity, set to zero
|
||||
vResult = _mm_and_ps(vResult,vZeroMask);
|
||||
// Select qnan or result based on infinite length
|
||||
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
|
||||
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
|
||||
vResult = _mm_or_ps(vTemp1,vTemp2);
|
||||
return vResult;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Perform the dot product on x,y,z and w
|
||||
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
|
||||
|
@ -27,6 +27,11 @@ inline float PackedVector::XMConvertHalfToFloat
|
||||
HALF Value
|
||||
)
|
||||
{
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
|
||||
__m128 V2 = _mm_cvtph_ps( V1 );
|
||||
return _mm_cvtss_f32( V2 );
|
||||
#else
|
||||
uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
|
||||
|
||||
uint32_t Exponent = (Value & 0x7C00);
|
||||
@ -61,6 +66,7 @@ inline float PackedVector::XMConvertHalfToFloat
|
||||
(Mantissa << 13); // Mantissa
|
||||
|
||||
return reinterpret_cast<float*>(&Result)[0];
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
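With _XM_F16C_INTRINSICS_ defined, the scalar converters become single VCVTPH2PS/VCVTPS2PH round trips instead of the bit-twiddling fallback. A self-contained sketch of the same pattern (assumes an F16C-capable build; rounding mode 0 is round-to-nearest-even, as in the code above):

#include <immintrin.h>
#include <cstdint>

inline uint16_t FloatToHalf(float f)
{
    __m128i h = _mm_cvtps_ph(_mm_set_ss(f), 0);          // one float -> one half
    return static_cast<uint16_t>(_mm_cvtsi128_si32(h));
}

inline float HalfToFloat(uint16_t h)
{
    __m128 f = _mm_cvtph_ps(_mm_cvtsi32_si128(h));       // one half -> one float
    return _mm_cvtss_f32(f);
}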
@ -86,6 +92,139 @@ inline float* PackedVector::XMConvertHalfToFloatStream
|
||||
assert(OutputStride >= sizeof(float));
|
||||
_Analysis_assume_(OutputStride >= sizeof(float));
|
||||
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = HalfCount >> 2;
|
||||
if ( four > 0 )
|
||||
{
|
||||
if (InputStride == sizeof(HALF))
|
||||
{
|
||||
if (OutputStride == sizeof(float))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
{
|
||||
// Packed input, aligned & packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
|
||||
XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
|
||||
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
|
||||
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (OutputStride == sizeof(float))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
{
|
||||
// Scattered input, aligned & packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16( HV, H1, 0 );
|
||||
HV = _mm_insert_epi16( HV, H2, 1 );
|
||||
HV = _mm_insert_epi16( HV, H3, 2 );
|
||||
HV = _mm_insert_epi16( HV, H4, 3 );
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
|
||||
XM_STREAM_PS( reinterpret_cast<float*>(pFloat ), FV );
|
||||
pFloat += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16( HV, H1, 0 );
|
||||
HV = _mm_insert_epi16( HV, H2, 1 );
|
||||
HV = _mm_insert_epi16( HV, H3, 2 );
|
||||
HV = _mm_insert_epi16( HV, H4, 3 );
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
|
||||
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
|
||||
pFloat += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; i < HalfCount; ++i)
|
||||
{
|
||||
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
|
||||
pHalf += InputStride;
|
||||
pFloat += OutputStride;
|
||||
}
|
||||
|
||||
XM_SFENCE();
|
||||
|
||||
return pOutputStream;
|
||||
#else
|
||||
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
@ -97,6 +236,7 @@ inline float* PackedVector::XMConvertHalfToFloatStream
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -106,6 +246,11 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
|
||||
float Value
|
||||
)
|
||||
{
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
__m128 V1 = _mm_set_ss( Value );
|
||||
__m128i V2 = _mm_cvtps_ph( V1, 0 );
|
||||
return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
|
||||
#else
|
||||
uint32_t Result;
|
||||
|
||||
uint32_t IValue = reinterpret_cast<uint32_t *>(&Value)[0];
|
||||
@ -142,6 +287,7 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
|
||||
Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
|
||||
}
|
||||
return (HALF)(Result|Sign);
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -164,6 +310,134 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
|
||||
assert(OutputStride >= sizeof(HALF));
|
||||
_Analysis_assume_(OutputStride >= sizeof(HALF));
|
||||
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = FloatCount >> 2;
|
||||
if (four > 0)
|
||||
{
|
||||
if (InputStride == sizeof(float))
|
||||
{
|
||||
if (OutputStride == sizeof(HALF))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
{
|
||||
// Aligned and packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
{
|
||||
// Aligned & packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (OutputStride == sizeof(HALF))
|
||||
{
|
||||
// Scattered input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
|
||||
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
|
||||
FV = _mm_blend_ps( FV, FT, 0xC );
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; i < FloatCount; ++i)
|
||||
{
|
||||
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
|
||||
pFloat += InputStride;
|
||||
pHalf += OutputStride;
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
#else
|
||||
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
@ -174,6 +448,7 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
|
||||
pHalf += OutputStride;
|
||||
}
|
||||
return pOutputStream;
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
#pragma prefast(pop)
|
||||
@ -236,6 +511,10 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2
|
||||
)
|
||||
{
|
||||
assert(pSource);
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
__m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
|
||||
return _mm_cvtph_ps( _mm_castps_si128( V ) );
|
||||
#else
|
||||
XMVECTORF32 vResult = {
|
||||
XMConvertHalfToFloat(pSource->x),
|
||||
XMConvertHalfToFloat(pSource->y),
|
||||
@ -243,6 +522,7 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2
|
||||
0.0f
|
||||
};
|
||||
return vResult.v;
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -759,6 +1039,10 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4
|
||||
)
|
||||
{
|
||||
assert(pSource);
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
__m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
|
||||
return _mm_cvtph_ps( V );
|
||||
#else
|
||||
XMVECTORF32 vResult = {
|
||||
XMConvertHalfToFloat(pSource->x),
|
||||
XMConvertHalfToFloat(pSource->y),
|
||||
@ -766,6 +1050,7 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4
|
||||
XMConvertHalfToFloat(pSource->w)
|
||||
};
|
||||
return vResult.v;
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -1575,8 +1860,13 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf2
|
||||
)
|
||||
{
|
||||
assert(pDestination);
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
__m128i V1 = _mm_cvtps_ph( V, 0 );
|
||||
_mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
|
||||
#else
|
||||
pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
|
||||
pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -2131,6 +2421,10 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf4
|
||||
)
|
||||
{
|
||||
assert(pDestination);
|
||||
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
__m128i V1 = _mm_cvtps_ph( V, 0 );
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
|
||||
#else
|
||||
XMFLOAT4A t;
|
||||
XMStoreFloat4A(&t, V );
|
||||
|
||||
@ -2138,6 +2432,7 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf4
|
||||
pDestination->y = XMConvertFloatToHalf(t.y);
|
||||
pDestination->z = XMConvertFloatToHalf(t.z);
|
||||
pDestination->w = XMConvertFloatToHalf(t.w);
|
||||
#endif // !_XM_F16C_INTRINSICS_
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
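Because a build that enables these defines uses AVX/F16C instructions unconditionally, callers are expected to gate usage at startup with XMVerifyCPUSupport. A typical (hypothetical) guard:

#include <DirectXMath.h>
#include <cstdio>

int main()
{
    if (!DirectX::XMVerifyCPUSupport())
    {
        std::puts("CPU lacks the instruction sets this build of DirectXMath requires.");
        return 1;
    }
    // ... DirectXMath use is safe from here on ...
    return 0;
}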