mirror of https://github.com/microsoft/DirectXMath

AVX / AVX2 optimizations

Chuck Walbourn 2016-05-27 12:42:11 -07:00
parent ef45bb75fa
commit 3af6a349bb
4 changed files with 592 additions and 0 deletions

View File

@ -38,7 +38,30 @@
#define XM_CTOR_DEFAULT =default;
#endif
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif
#ifdef _XM_F16C_INTRINSICS_
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#error DirectX Math use of F16C intrinsics requires Visual C++ 2012 or later.
#endif
#ifndef _XM_AVX_INTRINSICS_
#define _XM_AVX_INTRINSICS_
#endif
#endif // _XM_F16C_INTRINSICS_
#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
#define _XM_SSE4_INTRINSICS_
#endif
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
#define _XM_SSE_INTRINSICS_
#endif
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_M_IX86) || defined(_M_X64)
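
Note on the new feature-test chain: each level implies the one below it, so defining any one macro (or building with a compiler switch that defines __AVX__ or __AVX2__) turns on the whole stack of lower-level code paths. A sketch of how this resolves, assuming standard MSVC /arch behavior:

// cl /arch:AVX2 => __AVX2__ => _XM_F16C_INTRINSICS_ => _XM_AVX_INTRINSICS_
//                           => _XM_SSE4_INTRINSICS_ => _XM_SSE_INTRINSICS_
// cl /arch:AVX  => __AVX__  => _XM_AVX_INTRINSICS_
//                           => _XM_SSE4_INTRINSICS_ => _XM_SSE_INTRINSICS_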
@ -77,7 +100,17 @@
#endif
#endif
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <smmintrin.h>
#endif
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#include <immintrin.h>
#endif
#include <sal.h>
#include <assert.h>
@ -129,7 +162,11 @@
#define XM_SFENCE() _mm_sfence()
#endif
#if defined(_XM_AVX_INTRINSICS_)
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
#else
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
#endif
#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
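
Note: _mm_permute_ps is the AVX one-source VPERMILPS form with the same immediate semantics as _mm_shuffle_ps(v, v, c), so routing every self-shuffle through XM_PERMUTE_PS lets AVX builds avoid treating the value as two shuffle operands. A quick equivalence sketch (illustrative values):

__m128 v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
__m128 a = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); // (4, 3, 2, 1)
__m128 b = _mm_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));    // same result, one source operand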
@ -1506,6 +1543,22 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
#endif
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
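
For reference, the _mm_blend_ps immediate takes element i from the second argument when bit i is set, so each mask above can be read straight off the template indices: for <4,1,6,3>, lanes 0 and 2 come from V2, giving 0b0101 = 0x5. Usage sketch:

XMVECTOR v1 = XMVectorSet(0.f, 1.f, 2.f, 3.f);
XMVECTOR v2 = XMVectorSet(4.f, 5.f, 6.f, 7.f);
XMVECTOR r  = XMVectorPermute<4, 1, 6, 3>(v1, v2); // (4, 1, 6, 3) via BLENDPS imm8 0x5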
@ -1570,6 +1623,10 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
// Specialized swizzles
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
#endif
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
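
These two specializations map to the SSE3 duplicate instructions MOVSLDUP (copy even lanes) and MOVSHDUP (copy odd lanes), which are always available on this path since SSE4.1 implies SSE3. For example:

XMVECTOR v = XMVectorSet(1.f, 2.f, 3.f, 4.f);
XMVECTOR a = XMVectorSwizzle<0, 0, 2, 2>(v); // (1, 1, 3, 3)
XMVECTOR b = XMVectorSwizzle<1, 1, 3, 3>(v); // (2, 2, 4, 4)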

View File

@ -1092,6 +1092,10 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst
XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
return XMVectorMultiply(P, Result);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, P);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(P,P);
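
Throughout these SSE4.1 paths, the _mm_dp_ps immediate encodes which lanes enter the dot product (high nibble) and which result lanes receive the broadcast sum (low nibble): 0x3f = x,y; 0x7f = x,y,z (w excluded here because a plane's w is the distance term); 0xff = all four. Worked example:

__m128 p = _mm_setr_ps(3.f, 0.f, 4.f, 5.f); // plane: normal (3,0,4), d = 5
__m128 d = _mm_dp_ps(p, p, 0x7f);           // (25, 25, 25, 25) -- w ignored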
@ -1138,6 +1142,18 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize
#elif defined(_XM_ARM_NEON_INTRINSICS_)
XMVECTOR vLength = XMVector3ReciprocalLength(P);
return XMVectorMultiply( P, vLength );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(P,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vLengthSq);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z only
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
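
A trace of the failsafe masking in the new SSE4 block, for clarity:

// vLengthSq = cmpneq(|P|^2, +inf) -> all-ones mask where the length is finite
// vResult   = P / sqrt(|P|^2)     -> the normalized plane
// and_ps(vResult, vLengthSq)      -> lanes forced to zero when the length was infinite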
@ -1967,6 +1983,35 @@ inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb )
inline bool XMVerifyCPUSupport()
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_XM_F16C_INTRINSICS_) || defined(_XM_AVX_INTRINSICS_)
int avxCPUInfo[4] = {-1};
__cpuid( avxCPUInfo, 0 );
if ( avxCPUInfo[0] < 1 )
return false;
__cpuid(avxCPUInfo, 1 );
#ifdef _XM_F16C_INTRINSICS_
if ( (avxCPUInfo[2] & 0x38000000 ) != 0x38000000 )
return false; // No F16C/AVX/OSXSAVE support
#else
if ( (avxCPUInfo[2] & 0x18000000 ) != 0x18000000 )
return false; // No AVX/OSXSAVE support
#endif
#endif
#ifdef _XM_SSE4_INTRINSICS_
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
if ( (CPUInfo[2] & 0x80001) != 0x80001 )
return false; // Missing SSE3 or SSE 4.1 support
#endif
#if defined(_M_X64)
// The X64 processor model requires SSE2 support
return true;
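
For reference, the masks test CPUID leaf 1 ECX feature bits: bit 0 = SSE3 and bit 19 = SSE4.1 (together 0x80001); bit 27 = OSXSAVE and bit 28 = AVX (0x18000000); adding bit 29 = F16C gives 0x38000000. A standalone sketch of the same AVX/F16C check using the MSVC intrinsic:

#include <intrin.h>

bool HasAvxF16C()
{
    int info[4] = { -1 };
    __cpuid(info, 0);
    if (info[0] < 1)
        return false;
    __cpuid(info, 1);
    return (info[2] & 0x38000000) == 0x38000000; // F16C + AVX + OSXSAVE
}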

View File

@ -143,6 +143,8 @@ inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr
return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vld1q_dup_f32( pValue );
#elif defined(_XM_AVX_INTRINSICS_)
return _mm_broadcast_ss( pValue );
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_load_ps1( pValue );
#endif
@ -508,6 +510,8 @@ inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
*y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_f32(y,V,1);
#elif defined(_XM_SSE4_INTRINSICS_)
*((int*)y) = _mm_extract_ps( V, 1 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
_mm_store_ss(y,vResult);
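
Note: _mm_extract_ps returns the selected lane's bit pattern as an int rather than a float, which is why the SSE4 path stores through an int*; the z and w getters below use the same idiom:

float y;
*reinterpret_cast<int*>(&y) = _mm_extract_ps(V, 1); // raw lane-1 bits, no float-to-int conversion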
@ -523,6 +527,8 @@ inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
*z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_f32(z,V,2);
#elif defined(_XM_SSE4_INTRINSICS_)
*((int*)z) = _mm_extract_ps( V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(z,vResult);
@ -538,6 +544,8 @@ inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V)
*w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_f32(w,V,3);
#elif defined(_XM_SSE4_INTRINSICS_)
*((int*)w) = _mm_extract_ps( V, 3 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
_mm_store_ss(w,vResult);
@ -582,6 +590,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vgetq_lane_u32(V, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
@ -595,6 +606,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vgetq_lane_u32(V, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
@ -608,6 +622,9 @@ inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vgetq_lane_u32(V, 3);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
#elif defined(_XM_SSE_INTRINSICS_)
__m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
@ -657,6 +674,9 @@ inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
*y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
*y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
_mm_store_ss(reinterpret_cast<float *>(y),vResult);
@ -672,6 +692,9 @@ inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
*z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
*z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(reinterpret_cast<float *>(z),vResult);
@ -687,6 +710,9 @@ inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
*w = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i V1 = _mm_castps_si128( V );
*w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
_mm_store_ss(reinterpret_cast<float *>(w),vResult);
@ -749,6 +775,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_f32(y,V,1);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vResult = _mm_set_ss(y);
vResult = _mm_insert_ps( V, vResult, 0x10 );
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Swap y and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
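
The _mm_insert_ps immediate packs three fields: bits 7:6 select the source lane, bits 5:4 the destination lane, and bits 3:0 are a zero mask. Here 0x10 = (source lane 0) << 6 | (destination lane 1) << 4, i.e. copy the new y from lane 0 of vResult into lane 1 of V; the z and w setters below use 0x20 and 0x30 for lanes 2 and 3.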
@ -773,6 +803,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_f32(z,V,2);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vResult = _mm_set_ss(z);
vResult = _mm_insert_ps( V, vResult, 0x20 );
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Swap z and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
@ -798,6 +832,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_f32(w,V,3);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vResult = _mm_set_ss(w);
vResult = _mm_insert_ps( V, vResult, 0x30 );
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Swap w and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
@ -998,6 +1036,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_u32(y,V,1);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i vResult = _mm_castps_si128( V );
vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
return _mm_castsi128_ps( vResult );
#elif defined(_XM_SSE_INTRINSICS_)
// Swap y and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
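
Note: on SSE4.1, _mm_insert_epi32 (PINSRD) writes the new integer lane directly, replacing the swap/insert/swap shuffle sequence of the SSE2 path below; the z and w setters that follow use lane indices 2 and 3.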
@ -1023,6 +1065,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_u32(z,V,2);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i vResult = _mm_castps_si128( V );
vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
return _mm_castsi128_ps( vResult );
#elif defined(_XM_SSE_INTRINSICS_)
// Swap z and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
@ -1048,6 +1094,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vsetq_lane_u32(w,V,3);
#elif defined(_XM_SSE4_INTRINSICS_)
__m128i vResult = _mm_castps_si128( V );
vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
return _mm_castsi128_ps( vResult );
#elif defined(_XM_SSE_INTRINSICS_)
// Swap w and x
XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
@ -1233,6 +1283,10 @@ inline XMVECTOR XM_CALLCONV XMVectorSwizzle
const uint8x8_t rH = vtbl2_u8( tbl, idx );
return vcombine_f32( rL, rH );
#elif defined(_XM_AVX_INTRINSICS_)
unsigned int elem[4] = { E0, E1, E2, E3 };
__m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
return _mm_permutevar_ps( V, vControl );
#else
const uint32_t *aPtr = (const uint32_t* )(&V);
@ -1288,6 +1342,22 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute
const uint8x8_t rH = vtbl4_u8( tbl, idx );
return vcombine_f32( rL, rH );
#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
static const XMVECTORU32 three = { 3, 3, 3, 3 };
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
__m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
__m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
__m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
__m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
return _mm_or_ps( masked1, masked2 );
#else
const uint32_t *aPtr[2];
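
How the new AVX path works: _mm_permutevar_ps (VPERMILPS) can only index within a single register, so the control indices are reduced modulo 4 (the AND with {3,3,3,3}), both inputs are permuted with the same control, and vSelect (index > 3) picks each lane from the V2 permutation. A lane trace for a hypothetical call:

// XMVectorPermute(V1, V2, 0, 5, 2, 7):
//   elem     = { 0, 5, 2, 7 }
//   vSelect  = { 0, ~0, 0, ~0 }  (cmpgt against {3,3,3,3})
//   vControl = { 0, 1, 2, 3 }    (indices & 3)
//   result   = { V1.x, V2.y, V1.z, V2.w }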
@ -2302,6 +2372,8 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
XMVECTOR vResult = vbslq_f32( mask, R1, V );
return vResult;
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
__m128 sign = _mm_and_ps( V, g_XMNegativeZero );
__m128 sMagic = _mm_or_ps( g_XMNoFraction, sign );
@ -2361,6 +2433,8 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
// To handle NAN, INF and numbers greater than 8388608, use masking
// Get the abs value
@ -2407,6 +2481,8 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_floor_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
// To handle NAN, INF and numbers greater than 8388608, use masking
__m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
@ -2454,6 +2530,8 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_ceil_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
// To handle NAN, INF and numbers greater than 8388608, use masking
__m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
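
All four functions (Round, Truncate, Floor, Ceiling) now collapse to a single ROUNDPS on SSE4.1. The immediate selects the rounding mode, _MM_FROUND_NO_EXC suppresses the precision exception, and _mm_floor_ps/_mm_ceil_ps are shorthand for _mm_round_ps with _MM_FROUND_TO_NEG_INF/_MM_FROUND_TO_POS_INF. Worked values:

__m128 v = _mm_setr_ps(1.5f, -1.5f, 2.5f, -2.7f);
_mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // (2, -2, 2, -3) round-half-to-even
_mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);        // (1, -1, 2, -2)
_mm_floor_ps(v);                                                // (1, -2, 2, -3)
_mm_ceil_ps(v);                                                 // (2, -1, 3, -2)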
@ -6649,6 +6727,8 @@ inline XMVECTOR XM_CALLCONV XMVector2Dot
float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) );
vTemp = vpadd_f32( vTemp, vTemp );
return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0x3f );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V1,V2);
@ -6733,6 +6813,9 @@ inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst
// Reciprocal sqrt (estimate)
vTemp = vrsqrte_f32( vTemp );
return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6774,6 +6857,10 @@ inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength
float32x2_t R1 = vrsqrts_f32( P1, S1 );
float32x2_t Result = vmul_f32( S1, R1 );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6814,6 +6901,9 @@ inline XMVECTOR XM_CALLCONV XMVector2LengthEst
Result = vmul_f32( vTemp, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6859,6 +6949,9 @@ inline XMVECTOR XM_CALLCONV XMVector2Length
Result = vmul_f32( vTemp, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6898,6 +6991,10 @@ inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst
// Normalize
float32x2_t Result = vmul_f32( VL, vTemp );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -6955,6 +7052,26 @@ inline XMVECTOR XM_CALLCONV XMVector2Normalize
Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
XMVECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(V,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vZeroMask);
// Select qnan or result based on infinite length
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
vResult = _mm_or_ps(vTemp1,vTemp2);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x and y only
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
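
The masking after the divide reproduces the SSE2 path's handling of the two degenerate inputs; a trace of the new SSE4 block:

// |V| == 0     : vZeroMask lane is 0 (cmpneq(0, 0) is false) -> result forced to 0
// |V|^2 == +inf: vLengthSq lane is 0                         -> g_XMQNaN selected instead
// otherwise    : V / |V| passes through the andnot/and/or select unchanged

The same pattern appears in the SSE4 paths of XMVector3Normalize and XMVector4Normalize below.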
@ -8902,6 +9019,8 @@ inline XMVECTOR XM_CALLCONV XMVector3Dot
v2 = vdup_lane_f32( v2, 0 );
v1 = vadd_f32( v1, v2 );
return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0x7f );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V1,V2);
@ -9007,6 +9126,9 @@ inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst
// Reciprocal sqrt (estimate)
v2 = vrsqrte_f32( v1 );
return vcombine_f32(v2, v2);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -9059,6 +9181,10 @@ inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength
float32x2_t R1 = vrsqrts_f32( P1, S1 );
float32x2_t Result = vmul_f32( S1, R1 );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V,V);
@ -9111,6 +9237,9 @@ inline XMVECTOR XM_CALLCONV XMVector3LengthEst
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -9167,6 +9296,9 @@ inline XMVECTOR XM_CALLCONV XMVector3Length
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -9214,6 +9346,10 @@ inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst
v2 = vrsqrte_f32( v1 );
// Normalize
return vmulq_f32( V, vcombine_f32(v2,v2) );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V,V);
@ -9282,6 +9418,26 @@ inline XMVECTOR XM_CALLCONV XMVector3Normalize
XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
XMVECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vZeroMask);
// Select qnan or result based on infinite length
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
vResult = _mm_or_ps(vTemp1,vTemp2);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z only
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -12743,6 +12899,8 @@ inline XMVECTOR XM_CALLCONV XMVector4Dot
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0xff );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vTemp2 = V2;
XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
@ -12940,6 +13098,9 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst
// Reciprocal sqrt (estimate)
v2 = vrsqrte_f32( v1 );
return vcombine_f32(v2, v2);
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -12994,6 +13155,10 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength
float32x2_t R1 = vrsqrts_f32( P1, S1 );
float32x2_t Result = vmul_f32( S1, R1 );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13048,6 +13213,9 @@ inline XMVECTOR XM_CALLCONV XMVector4LengthEst
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13106,6 +13274,9 @@ inline XMVECTOR XM_CALLCONV XMVector4Length
Result = vmul_f32( v1, Result );
Result = vbsl_f32( VEqualsZero, zero, Result );
return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13155,6 +13326,10 @@ inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst
v2 = vrsqrte_f32( v1 );
// Normalize
return vmulq_f32( V, vcombine_f32(v2,v2) );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);
@ -13225,6 +13400,26 @@ inline XMVECTOR XM_CALLCONV XMVector4Normalize
XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
#elif defined(_XM_SSE4_INTRINSICS_)
XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff );
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
XMVECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V,vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult,vZeroMask);
// Select qnan or result based on infinite length
XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
vResult = _mm_or_ps(vTemp1,vTemp2);
return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y,z and w
XMVECTOR vLengthSq = _mm_mul_ps(V,V);

View File

@ -27,6 +27,11 @@ inline float PackedVector::XMConvertHalfToFloat
HALF Value
)
{
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
__m128 V2 = _mm_cvtph_ps( V1 );
return _mm_cvtss_f32( V2 );
#else
uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
uint32_t Exponent = (Value & 0x7C00);
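
VCVTPH2PS converts four packed halfs from the low 64 bits of its source, so the scalar path above widens the single HALF into lane 0 and reads lane 0 of the result back. A round-trip sketch with the matching float-to-half direction (imm8 0 = round-to-nearest-even; assumes F16C support has already been verified):

#include <immintrin.h>
#include <cstdint>

inline float HalfBitsToFloat(uint16_t h)
{
    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
}

inline uint16_t FloatToHalfBits(float f)
{
    return static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f), 0)));
}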
@ -61,6 +66,7 @@ inline float PackedVector::XMConvertHalfToFloat
(Mantissa << 13); // Mantissa
return reinterpret_cast<float*>(&Result)[0];
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -86,6 +92,139 @@ inline float* PackedVector::XMConvertHalfToFloatStream
assert(OutputStride >= sizeof(float));
_Analysis_assume_(OutputStride >= sizeof(float));
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = HalfCount >> 2;
if ( four > 0 )
{
if (InputStride == sizeof(HALF))
{
if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Packed input, aligned & packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128 FV = _mm_cvtph_ps( HV );
XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
i += 4;
}
}
else
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128 FV = _mm_cvtph_ps( HV );
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
i += 4;
}
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128 FV = _mm_cvtph_ps( HV );
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
pFloat += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Scattered input, aligned & packed output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
XM_STREAM_PS( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
i += 4;
}
}
else
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
i += 4;
}
}
}
}
for (; i < HalfCount; ++i)
{
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
pHalf += InputStride;
pFloat += OutputStride;
}
XM_SFENCE();
return pOutputStream;
#else
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
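
Note on the aligned-output branch above: XM_STREAM_PS is a non-temporal store (MOVNTPS) that bypasses the cache, so the XM_SFENCE() after the tail loop is what guarantees the streamed writes are globally visible before the function returns.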
@ -97,6 +236,7 @@ inline float* PackedVector::XMConvertHalfToFloatStream
}
return pOutputStream;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -106,6 +246,11 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
float Value
)
{
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128 V1 = _mm_set_ss( Value );
__m128i V2 = _mm_cvtps_ph( V1, 0 );
return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
#else
uint32_t Result;
uint32_t IValue = reinterpret_cast<uint32_t *>(&Value)[0];
@ -142,6 +287,7 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
}
return (HALF)(Result|Sign);
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -164,6 +310,134 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
assert(OutputStride >= sizeof(HALF));
_Analysis_assume_(OutputStride >= sizeof(HALF));
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = FloatCount >> 2;
if (four > 0)
{
if (InputStride == sizeof(float))
{
if (OutputStride == sizeof(HALF))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Aligned and packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
i += 4;
}
}
else
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
i += 4;
}
}
}
else
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
{
// Aligned & packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
pHalf += OutputStride;
i += 4;
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
pHalf += OutputStride;
i += 4;
}
}
}
}
else if (OutputStride == sizeof(HALF))
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride;
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
FV = _mm_blend_ps( FV, FT, 0xC );
__m128i HV = _mm_cvtps_ph( FV, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
i += 4;
}
}
}
for (; i < FloatCount; ++i)
{
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
pFloat += InputStride;
pHalf += OutputStride;
}
return pOutputStream;
#else
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
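
The scattered-input branch above gathers four strided floats without insert instructions by broadcasting each one and blending; a lane-by-lane trace with hypothetical inputs f1..f4:

// FV1 = (f1,  0,  0,  0)             _mm_load_ss zeroes the upper lanes
// FV2 = (f2, f2, f2, f2)             _mm_broadcast_ss (AVX)
// FV  = blend(FV1, FV2, 0x2) = (f1, f2,  0,  0)
// FT  = blend(FV3, FV4, 0x8) = (f3, f3, f3, f4)
// FV  = blend(FV,  FT,  0xC) = (f1, f2, f3, f4)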
@ -174,6 +448,7 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
pHalf += OutputStride;
}
return pOutputStream;
#endif // !_XM_F16C_INTRINSICS_
}
#pragma prefast(pop)
@ -236,6 +511,10 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2
)
{
assert(pSource);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
return _mm_cvtph_ps( _mm_castps_si128( V ) );
#else
XMVECTORF32 vResult = {
XMConvertHalfToFloat(pSource->x),
XMConvertHalfToFloat(pSource->y),
@ -243,6 +522,7 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2
0.0f
};
return vResult.v;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -759,6 +1039,10 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4
)
{
assert(pSource);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
return _mm_cvtph_ps( V );
#else
XMVECTORF32 vResult = {
XMConvertHalfToFloat(pSource->x),
XMConvertHalfToFloat(pSource->y),
@ -766,6 +1050,7 @@ inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4
XMConvertHalfToFloat(pSource->w)
};
return vResult.v;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -1575,8 +1860,13 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf2
)
{
assert(pDestination);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V1 = _mm_cvtps_ph( V, 0 );
_mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
#else
pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
@ -2131,6 +2421,10 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf4
)
{
assert(pDestination);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
__m128i V1 = _mm_cvtps_ph( V, 0 );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
#else
XMFLOAT4A t;
XMStoreFloat4A(&t, V );
@ -2138,6 +2432,7 @@ inline void XM_CALLCONV PackedVector::XMStoreHalf4
pDestination->y = XMConvertFloatToHalf(t.y);
pDestination->z = XMConvertFloatToHalf(t.z);
pDestination->w = XMConvertFloatToHalf(t.w);
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------