mirror of https://github.com/microsoft/DirectXMath

AVX / AVX2 optimizations

Chuck Walbourn 2016-05-27 12:42:11 -07:00
parent ef45bb75fa
commit 3af6a349bb
4 changed files with 592 additions and 0 deletions

View File

@ -38,7 +38,30 @@
#define XM_CTOR_DEFAULT =default;
#endif
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif
#ifdef _XM_F16C_INTRINSICS_
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#error DirectX Math use of F16C intrinsics requires Visual C++ 2012 or later.
#endif
#ifndef _XM_AVX_INTRINSICS_
#define _XM_AVX_INTRINSICS_
#endif
#endif // _XM_F16C_INTRINSICS_
#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
#define _XM_SSE4_INTRINSICS_
#endif
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
#define _XM_SSE_INTRINSICS_
#endif
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_M_IX86) || defined(_M_X64)
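
Note on the new feature-test chain: each level implies the one below it, so defining any one macro (or building with a compiler switch that defines __AVX__ or __AVX2__) turns on the whole stack of lower-level code paths. A sketch of how this resolves, assuming standard MSVC /arch behavior:

// cl /arch:AVX2 => __AVX2__ => _XM_F16C_INTRINSICS_ => _XM_AVX_INTRINSICS_
//                           => _XM_SSE4_INTRINSICS_ => _XM_SSE_INTRINSICS_
// cl /arch:AVX  => __AVX__  => _XM_AVX_INTRINSICS_
//                           => _XM_SSE4_INTRINSICS_ => _XM_SSE_INTRINSICS_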
@ -77,7 +100,17 @@
#endif
#endif
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <smmintrin.h>
#endif
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#include <immintrin.h>
#endif
#include <sal.h>
#include <assert.h>
@ -129,7 +162,11 @@
#define XM_SFENCE() _mm_sfence()
#endif
#if defined(_XM_AVX_INTRINSICS_)
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
#else
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
#endif
#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
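
Note: _mm_permute_ps is the AVX one-source VPERMILPS form with the same immediate semantics as _mm_shuffle_ps(v, v, c), so routing every self-shuffle through XM_PERMUTE_PS lets AVX builds avoid treating the value as two shuffle operands. A quick equivalence sketch (illustrative values):

__m128 v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
__m128 a = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); // (4, 3, 2, 1)
__m128 b = _mm_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));    // same result, one source operand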
@ -1506,6 +1543,22 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
#endif
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
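
For reference, the _mm_blend_ps immediate takes element i from the second argument when bit i is set, so each mask above can be read straight off the template indices: for <4,1,6,3>, lanes 0 and 2 come from V2, giving 0b0101 = 0x5. Usage sketch:

XMVECTOR v1 = XMVectorSet(0.f, 1.f, 2.f, 3.f);
XMVECTOR v2 = XMVectorSet(4.f, 5.f, 6.f, 7.f);
XMVECTOR r  = XMVectorPermute<4, 1, 6, 3>(v1, v2); // (4, 1, 6, 3) via BLENDPS imm8 0x5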
@ -1570,6 +1623,10 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
// Specialized swizzles
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
#endif
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
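
These two specializations map to the SSE3 duplicate instructions MOVSLDUP (copy even lanes) and MOVSHDUP (copy odd lanes), which are always available on this path since SSE4.1 implies SSE3. For example:

XMVECTOR v = XMVectorSet(1.f, 2.f, 3.f, 4.f);
XMVECTOR a = XMVectorSwizzle<0, 0, 2, 2>(v); // (1, 1, 3, 3)
XMVECTOR b = XMVectorSwizzle<1, 1, 3, 3>(v); // (2, 2, 4, 4)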

View File

@ -1092,6 +1092,10 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst
XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
return XMVectorMultiply(P, Result);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, P);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(P,P);
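
Throughout these SSE4.1 paths, the _mm_dp_ps immediate encodes which lanes enter the dot product (high nibble) and which result lanes receive the broadcast sum (low nibble): 0x3f = x,y; 0x7f = x,y,z (w excluded here because a plane's w is the distance term); 0xff = all four. Worked example:

__m128 p = _mm_setr_ps(3.f, 0.f, 4.f, 5.f); // plane: normal (3,0,4), d = 5
__m128 d = _mm_dp_ps(p, p, 0x7f);           // (25, 25, 25, 25) -- w ignored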
@ -1138,6 +1142,18 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize
#elif defined(_XM_ARM_NEON_INTRINSICS_)
XMVECTOR vLength = XMVector3ReciprocalLength(P);
return XMVectorMultiply( P, vLength );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(P,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vLengthSq);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z only
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
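
A trace of the failsafe masking in the new SSE4 block, for clarity:

// vLengthSq = cmpneq(|P|^2, +inf) -> all-ones mask where the length is finite
// vResult   = P / sqrt(|P|^2)     -> the normalized plane
// and_ps(vResult, vLengthSq)      -> lanes forced to zero when the length was infinite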
@ -1967,6 +1983,35 @@ inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb )
inline bool XMVerifyCPUSupport()
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_XM_F16C_INTRINSICS_) || defined(_XM_AVX_INTRINSICS_)
int avxCPUInfo[4] = {-1};
__cpuid( avxCPUInfo, 0 );
if ( avxCPUInfo[0] < 1 )
return false;
__cpuid(avxCPUInfo, 1 );
#ifdef _XM_F16C_INTRINSICS_
if ( (avxCPUInfo[2] & 0x38000000 ) != 0x38000000 )
return false; // No F16C/AVX/OSXSAVE support
#else
if ( (avxCPUInfo[2] & 0x18000000 ) != 0x18000000 )
return false; // No AVX/OSXSAVE support
#endif
#endif
#ifdef _XM_SSE4_INTRINSICS_
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
if ( (CPUInfo[2] & 0x80001) != 0x80001 )
return false; // Missing SSE3 or SSE 4.1 support
#endif
#if defined(_M_X64)
// The X64 processor model requires SSE2 support
return true;
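
For reference, the masks test CPUID leaf 1 ECX feature bits: bit 0 = SSE3 and bit 19 = SSE4.1 (together 0x80001); bit 27 = OSXSAVE and bit 28 = AVX (0x18000000); adding bit 29 = F16C gives 0x38000000. A standalone sketch of the same AVX/F16C check using the MSVC intrinsic:

#include <intrin.h>

bool HasAvxF16C()
{
    int info[4] = { -1 };
    __cpuid(info, 0);
    if (info[0] < 1)
        return false;
    __cpuid(info, 1);
    return (info[2] & 0x38000000) == 0x38000000; // F16C + AVX + OSXSAVE
}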

View File

@ -143,6 +143,8 @@ inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr
return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vld1q_dup_f32( pValue );
#elif defined(_XM_AVX_INTRINSICS_)
return _mm_broadcast_ss( pValue );
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_load_ps1( pValue );
#endif
@ -508,6 +510,8 @@ inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
*y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_f32(y,V,1);
#elif defined(_XM_SSE4_INTRINSICS_)
*((int*)y) = _mm_extract_ps( V, 1 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
_mm_store_ss(y,vResult);
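
Note: _mm_extract_ps returns the selected lane's bit pattern as an int rather than a float, which is why the SSE4 path stores through an int*; the z and w getters below use the same idiom:

float y;
*reinterpret_cast<int*>(&y) = _mm_extract_ps(V, 1); // raw lane-1 bits, no float-to-int conversion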
@ -523,6 +527,8 @@ inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
*z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_f32(z,V,2);
#elif defined(_XM_SSE4_INTRINSICS_)
*((int*)z) = _mm_extract_ps( V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(z,vResult);
@ -538,6 +544,8 @@ inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V)
*w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_f32(w,V,3);
#elif defined(_XM_SSE4_INTRINSICS_)
*((int*)w) = _mm_extract_ps( V, 3 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
_mm_store_ss(w,vResult);
@ -582,6 +590,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vgetq_lane_u32(V, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
@ -595,6 +606,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vgetq_lane_u32(V, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
@ -608,6 +622,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vgetq_lane_u32(V, 3);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
@ -657,6 +674,9 @@ inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
*y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
*y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
_mm_store_ss(reinterpret_cast<float *>(y),vResult);
@ -672,6 +692,9 @@ inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
*z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
*z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(reinterpret_cast<float *>(z),vResult);
@ -687,6 +710,9 @@ inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
*w = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
*w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
_mm_store_ss(reinterpret_cast<float *>(w),vResult);
@ -749,6 +775,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_f32(y,V,1);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vResult = _mm_set_ss(y);
vResult = _mm_insert_ps( V, vResult, 0x10 );
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Swap y and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
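
The _mm_insert_ps immediate packs three fields: bits 7:6 select the source lane, bits 5:4 the destination lane, and bits 3:0 are a zero mask. Here 0x10 = (source lane 0) << 6 | (destination lane 1) << 4, i.e. copy the new y from lane 0 of vResult into lane 1 of V; the z and w setters below use 0x20 and 0x30 for lanes 2 and 3.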
@ -773,6 +803,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_f32(z,V,2);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vResult = _mm_set_ss(z);
vResult = _mm_insert_ps( V, vResult, 0x20 );
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Swap z and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
@ -798,6 +832,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_f32(w,V,3);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vResult = _mm_set_ss(w);
vResult = _mm_insert_ps( V, vResult, 0x30 );
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Swap w and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
@ -998,6 +1036,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_u32(y,V,1);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i vResult = _mm_castps_si128( V );
vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
return _mm_castsi128_ps( vResult );
#elif defined(_XM_SSE_INTRINSICS_)
// Swap y and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
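
Note: on SSE4.1, _mm_insert_epi32 (PINSRD) writes the new integer lane directly, replacing the swap/insert/swap shuffle sequence of the SSE2 path below; the z and w setters that follow use lane indices 2 and 3.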
@ -1023,6 +1065,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_u32(z,V,2);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i vResult = _mm_castps_si128( V );
vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
return _mm_castsi128_ps( vResult );
#elif defined(_XM_SSE_INTRINSICS_)
// Swap z and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
@ -1048,6 +1094,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_u32(w,V,3);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i vResult = _mm_castps_si128( V );
vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
return _mm_castsi128_ps( vResult );
#elif defined(_XM_SSE_INTRINSICS_)
// Swap w and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
@ -1233,6 +1283,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSwizzle
const uint8x8_t rH = vtbl2_u8( tbl, idx );
return vcombine_f32( rL, rH );
#elif defined(_XM_AVX_INTRINSICS_)
unsigned int elem[4] = { E0, E1, E2, E3 };
__m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
return _mm_permutevar_ps( V, vControl );
#else
const uint32_t *aPtr = (const uint32_t* )(&V);
@ -1288,6 +1342,22 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute
const uint8x8_t rH = vtbl4_u8( tbl, idx );
return vcombine_f32( rL, rH );
#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
static const XMVECTORU32 three = { 3, 3, 3, 3 };
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
__m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
__m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
__m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
__m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
return _mm_or_ps( masked1, masked2 );
#else
const uint32_t *aPtr[2];
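
How the new AVX path works: _mm_permutevar_ps (VPERMILPS) can only index within a single register, so the control indices are reduced modulo 4 (the AND with {3,3,3,3}), both inputs are permuted with the same control, and vSelect (index > 3) picks each lane from the V2 permutation. A lane trace for a hypothetical call:

// XMVectorPermute(V1, V2, 0, 5, 2, 7):
//   elem     = { 0, 5, 2, 7 }
//   vSelect  = { 0, ~0, 0, ~0 }  (cmpgt against {3,3,3,3})
//   vControl = { 0, 1, 2, 3 }    (indices & 3)
//   result   = { V1.x, V2.y, V1.z, V2.w }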
@ -2302,6 +2372,8 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
XMVECTOR vResult = vbslq_f32( mask, R1, V );
return vResult;
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 sign = _mm_and_ps( V, g_XMNegativeZero );
__m128 sMagic = _mm_or_ps( g_XMNoFraction, sign );
@ -2361,6 +2433,8 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
// To handle NAN, INF and numbers greater than 8388608, use masking
// Get the abs value
@ -2407,6 +2481,8 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_floor_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
// To handle NAN, INF and numbers greater than 8388608, use masking
__m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
@ -2454,6 +2530,8 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_ceil_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
// To handle NAN, INF and numbers greater than 8388608, use masking
__m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
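
All four functions (Round, Truncate, Floor, Ceiling) now collapse to a single ROUNDPS on SSE4.1. The immediate selects the rounding mode, _MM_FROUND_NO_EXC suppresses the precision exception, and _mm_floor_ps/_mm_ceil_ps are shorthand for _mm_round_ps with _MM_FROUND_TO_NEG_INF/_MM_FROUND_TO_POS_INF. Worked values:

__m128 v = _mm_setr_ps(1.5f, -1.5f, 2.5f, -2.7f);
_mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // (2, -2, 2, -3) round-half-to-even
_mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);        // (1, -1, 2, -2)
_mm_floor_ps(v);                                                // (1, -2, 2, -3)
_mm_ceil_ps(v);                                                 // (2, -1, 3, -2)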
@ -6649,6 +6727,8 @@ inline XMVECTOR XM_CALLCONV XMVector2Dot
float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) );
vTemp = vpadd_f32( vTemp, vTemp );
return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0x3f );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V1,V2);
@ -6733,6 +6813,9 @@ inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst
// Reciprocal sqrt (estimate)
vTemp = vrsqrte_f32( vTemp );
return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6774,6 +6857,10 @@ inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength
float32x2_t R1 = vrsqrts_f32( P1, S1 );
float32x2_t Result = vmul_f32( S1, R1 );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6814,6 +6901,9 @@ inline XMVECTOR XM_CALLCONV XMVector2LengthEst
Result = vmul_f32( vTemp, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6859,6 +6949,9 @@ inline XMVECTOR XM_CALLCONV XMVector2Length
Result = vmul_f32( vTemp, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6898,6 +6991,10 @@ inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst
// Normalize
float32x2_t Result = vmul_f32( VL, vTemp );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6955,6 +7052,26 @@ inline XMVECTOR XM_CALLCONV XMVector2Normalize
Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
XMVECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(V,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vZeroMask);
// Select qnan or result based on infinite length
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
vResult = _mm_or_ps(vTemp1,vTemp2);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y only
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
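
The masking after the divide reproduces the SSE2 path's handling of the two degenerate inputs; a trace of the new SSE4 block:

// |V| == 0     : vZeroMask lane is 0 (cmpneq(0, 0) is false) -> result forced to 0
// |V|^2 == +inf: vLengthSq lane is 0                         -> g_XMQNaN selected instead
// otherwise    : V / |V| passes through the andnot/and/or select unchanged

The same pattern appears in the SSE4 paths of XMVector3Normalize and XMVector4Normalize below.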
@ -8902,6 +9019,8 @@ inline XMVECTOR XM_CALLCONV XMVector3Dot
v2 = vdup_lane_f32( v2, 0 );
v1 = vadd_f32( v1, v2 );
return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0x7f );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V1,V2);
@ -9007,6 +9126,9 @@ inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst
// Reciprocal sqrt (estimate)
v2 = vrsqrte_f32( v1 );
return vcombine_f32(v2, v2);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -9059,6 +9181,10 @@ inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength
float32x2_t R1 = vrsqrts_f32( P1, S1 );
float32x2_t Result = vmul_f32( S1, R1 );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V,V);
@ -9111,6 +9237,9 @@ inline XMVECTOR XM_CALLCONV XMVector3LengthEst
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -9167,6 +9296,9 @@ inline XMVECTOR XM_CALLCONV XMVector3Length
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -9214,6 +9346,10 @@ inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst
v2 = vrsqrte_f32( v1 );
// Normalize
return vmulq_f32( V, vcombine_f32(v2,v2) );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V,V);
@ -9282,6 +9418,26 @@ inline XMVECTOR XM_CALLCONV XMVector3Normalize
XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
XMVECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vZeroMask);
// Select qnan or result based on infinite length
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
vResult = _mm_or_ps(vTemp1,vTemp2);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z only
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -12743,6 +12899,8 @@ inline XMVECTOR XM_CALLCONV XMVector4Dot
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0xff );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vTemp2 = V2;
XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
@ -12940,6 +13098,9 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst
// Reciprocal sqrt (estimate)
v2 = vrsqrte_f32( v1 );
return vcombine_f32(v2, v2);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -12994,6 +13155,10 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength
float32x2_t R1 = vrsqrts_f32( P1, S1 );
float32x2_t Result = vmul_f32( S1, R1 );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13048,6 +13213,9 @@ inline XMVECTOR XM_CALLCONV XMVector4LengthEst
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13106,6 +13274,9 @@ inline XMVECTOR XM_CALLCONV XMVector4Length
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13155,6 +13326,10 @@ inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst
v2 = vrsqrte_f32( v1 );
// Normalize
return vmulq_f32( V, vcombine_f32(v2,v2) );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13225,6 +13400,26 @@ inline XMVECTOR XM_CALLCONV XMVector4Normalize
XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
XMVECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vZeroMask);
// Select qnan or result based on infinite length
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
vResult = _mm_or_ps(vTemp1,vTemp2);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);

View File

@ -27,6 +27,11 @@ inline float PackedVector::XMConvertHalfToFloat
HALF Value
)
{
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
__m128 V2 = _mm_cvtph_ps( V1 );
return _mm_cvtss_f32( V2 );
#else
uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
uint32_t Exponent = (Value & 0x7C00);
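
VCVTPH2PS converts four packed halfs from the low 64 bits of its source, so the scalar path above widens the single HALF into lane 0 and reads lane 0 of the result back. A round-trip sketch with the matching float-to-half direction (imm8 0 = round-to-nearest-even; assumes F16C support has already been verified):

#include <immintrin.h>
#include <cstdint>

inline float HalfBitsToFloat(uint16_t h)
{
    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
}

inline uint16_t FloatToHalfBits(float f)
{
    return static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f), 0)));
}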
@ -61,6 +66,7 @@ inline float PackedVector::XMConvertHalfToFloat
(Mantissa << 13); // Mantissa
return reinterpret_cast<float*>(&Result)[0];
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -86,6 +92,139 @@ inline float* PackedVector::XMConvertHalfToFloatStream
assert(OutputStride >= sizeof(float));
_Analysis_assume_(OutputStride >= sizeof(float));
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = HalfCount >> 2;
if ( four > 0 )
{
if (InputStride == sizeof(HALF))
{
if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Packed input, aligned & packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128 FV = _mm_cvtph_ps( HV );
XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
i += 4;
}
}
else
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128 FV = _mm_cvtph_ps( HV );
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
i += 4;
}
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128 FV = _mm_cvtph_ps( HV );
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
pFloat += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Scattered input, aligned & packed output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
XM_STREAM_PS( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
i += 4;
}
}
else
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
i += 4;
}
}
}
}
for (; i < HalfCount; ++i)
{
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
pHalf += InputStride;
pFloat += OutputStride;
}
XM_SFENCE();
return pOutputStream;
#else
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
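
Note on the aligned-output branch above: XM_STREAM_PS is a non-temporal store (MOVNTPS) that bypasses the cache, so the XM_SFENCE() after the tail loop is what guarantees the streamed writes are globally visible before the function returns.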
@ -97,6 +236,7 @@ inline float* PackedVector::XMConvertHalfToFloatStream
}
return pOutputStream;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -106,6 +246,11 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
float Value
)
{
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128 V1 = _mm_set_ss( Value );
__m128i V2 = _mm_cvtps_ph( V1, 0 );
return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
#else
uint32_t Result;
uint32_t IValue = reinterpret_cast<uint32_t *>(&Value)[0];
@ -142,6 +287,7 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
}
return (HALF)(Result|Sign);
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -164,6 +310,134 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
assert(OutputStride >= sizeof(HALF));
_Analysis_assume_(OutputStride >= sizeof(HALF));
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = FloatCount >> 2;
if (four > 0)
{
if (InputStride == sizeof(float))
{
if (OutputStride == sizeof(HALF))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Aligned and packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
i += 4;
}
}
else
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
i += 4;
}
}
}
else
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Aligned & packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
pHalf += OutputStride;
i += 4;
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
pHalf += OutputStride;
i += 4;
}
}
}
}
else if (OutputStride == sizeof(HALF))
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
FV = _mm_blend_ps( FV, FT, 0xC );
__m128i HV = _mm_cvtps_ph( FV, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
i += 4;
}
}
}
for (; i < FloatCount; ++i)
{
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
pFloat += InputStride;
pHalf += OutputStride;
}
return pOutputStream;
#else
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
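
The scattered-input branch above gathers four strided floats without insert instructions by broadcasting each one and blending; a lane-by-lane trace with hypothetical inputs f1..f4:

// FV1 = (f1,  0,  0,  0)             _mm_load_ss zeroes the upper lanes
// FV2 = (f2, f2, f2, f2)             _mm_broadcast_ss (AVX)
// FV  = blend(FV1, FV2, 0x2) = (f1, f2,  0,  0)
// FT  = blend(FV3, FV4, 0x8) = (f3, f3, f3, f4)
// FV  = blend(FV,  FT,  0xC) = (f1, f2, f3, f4)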
@ -174,6 +448,7 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
pHalf += OutputStride;
}
return pOutputStream;
#endif // !_XM_F16C_INTRINSICS_
}
#pragma prefast(pop)
@ -236,6 +511,10 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2
)
{
assert(pSource);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
return _mm_cvtph_ps( _mm_castps_si128( V ) );
#else
XMVECTORF32 vResult = {
XMConvertHalfToFloat(pSource->x),
XMConvertHalfToFloat(pSource->y),
@ -243,6 +522,7 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2
0.0f
};
return vResult.v;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -759,6 +1039,10 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4
)
{
assert(pSource);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
return _mm_cvtph_ps( V );
#else
XMVECTORF32 vResult = {
XMConvertHalfToFloat(pSource->x),
XMConvertHalfToFloat(pSource->y),
@ -766,6 +1050,7 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4
XMConvertHalfToFloat(pSource->w)
};
return vResult.v;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -1575,8 +1860,13 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf2
)
{
assert(pDestination);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V1 = _mm_cvtps_ph( V, 0 );
_mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
#else
pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -2131,6 +2421,10 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf4
)
{
assert(pDestination);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V1 = _mm_cvtps_ph( V, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
#else
XMFLOAT4A t;
XMStoreFloat4A(&t, V );
@ -2138,6 +2432,7 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf4
pDestination->y = XMConvertFloatToHalf(t.y);
pDestination->z = XMConvertFloatToHalf(t.z);
pDestination->w = XMConvertFloatToHalf(t.w);
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------