mirror of https://github.com/microsoft/DirectXMath synced 2024-11-09 14:10:09 +00:00

AVX2/FMA3 optimization

Chuck Walbourn 2017-05-18 16:14:07 -07:00
parent 58df665c97
commit b83bff1f1c
3 changed files with 31 additions and 3 deletions


@@ -53,10 +53,22 @@
#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
#endif
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX2_INTRINSICS_
#endif
#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_FMA3_INTRINSICS_
#endif
#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif
#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
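
The net effect of this block is an implication chain: defining _XM_AVX2_INTRINSICS_ (or building with a compiler switch that sets __AVX2__) turns on the FMA3 and F16C paths, and either of those in turn turns on the AVX path. A minimal sketch, not part of the commit, of how a consumer could confirm which paths ended up active in a translation unit that includes DirectXMath.h:

#include <DirectXMath.h>
#include <cstdio>

int main()
{
#if defined(_XM_AVX2_INTRINSICS_)
    std::puts("_XM_AVX2_INTRINSICS_ enabled");
#endif
#if defined(_XM_FMA3_INTRINSICS_)
    std::puts("_XM_FMA3_INTRINSICS_ enabled (implied by AVX2)");
#endif
#if defined(_XM_F16C_INTRINSICS_)
    std::puts("_XM_F16C_INTRINSICS_ enabled (implied by AVX2)");
#endif
#if defined(_XM_AVX_INTRINSICS_)
    std::puts("_XM_AVX_INTRINSICS_ enabled (implied by FMA3/F16C)");
#endif
    return 0;
}
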
@@ -1656,6 +1668,10 @@ template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECT
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
#endif
#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps( V ); }
#endif
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
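With the specialization above, XMVectorSwizzle<0,0,0,0> compiles to a single broadcast on AVX2 (_mm_broadcastss_ps) and on ARM NEON (vdupq_lane_f32) rather than a general shuffle. A small usage sketch, not part of the commit (the helper name is illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// Replicates v.x into all four lanes: { x, x, x, x }
XMVECTOR XM_CALLCONV SplatFirstComponent(FXMVECTOR v)
{
    return XMVectorSwizzle<0, 0, 0, 0>(v);
}
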


@@ -1993,10 +1993,16 @@ inline bool XMVerifyCPUSupport()
__cpuid(CPUInfo, 1);
#ifdef __AVX2__
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
// The compiler can emit FMA3 instructions even without explicit intrinsics use
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_)
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_)
if ((CPUInfo[2] & 0x18081001) != 0x18081001)
return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_F16C_INTRINSICS_)
if ((CPUInfo[2] & 0x38080001) != 0x38080001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support
@@ -2015,7 +2021,7 @@ inline bool XMVerifyCPUSupport()
if ((CPUInfo[3] & 0x6000000) != 0x6000000)
return false; // No SSE2/SSE support
#ifdef __AVX2__
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
__cpuidex(CPUInfo, 7, 0);
if (!(CPUInfo[1] & 0x20))
return false; // No AVX2 support
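
For reference, these masks test CPUID leaf 1 ECX bits: SSE3 (bit 0), FMA3 (bit 12), SSE4.1 (bit 19), OSXSAVE (bit 27), AVX (bit 28) and F16C (bit 29); the second check tests leaf 7 EBX bit 5 for AVX2. A caller-side sketch, not part of the commit, of the intended startup check when a binary is built with these paths enabled:

#include <DirectXMath.h>
#include <cstdio>

int main()
{
    // Fail fast if the host CPU lacks any instruction set this build assumes.
    if (!DirectX::XMVerifyCPUSupport())
    {
        std::fprintf(stderr, "CPU does not support the instruction sets required by this build.\n");
        return 1;
    }
    // Safe to call DirectXMath functions from here on.
    return 0;
}
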


@@ -240,6 +240,8 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatX
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vdupq_lane_f32( vget_low_f32( V ), 0 );
#elif defined(_XM_AVX2_INTRINSICS_)
return _mm_broadcastss_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
#endif
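
XMVectorSplatX replicates the x component into all four lanes; on AVX2 it now lowers to a single vbroadcastss via _mm_broadcastss_ps instead of a shuffle. A usage sketch, not part of the commit (the helper name is illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// Scales every component of v by its own x component.
XMVECTOR XM_CALLCONV ScaleByX(FXMVECTOR v)
{
    XMVECTOR xxxx = XMVectorSplatX(v);   // { v.x, v.x, v.x, v.x }
    return XMVectorMultiply(v, xxxx);
}
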
@@ -3005,6 +3007,8 @@ inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vmlaq_f32( V3, V1, V2 );
#elif defined(_XM_FMA3_INTRINSICS_)
return _mm_fmadd_ps( V1, V2, V3 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = _mm_mul_ps( V1, V2 );
return _mm_add_ps(vResult, V3 );
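
XMVectorMultiplyAdd computes V1 * V2 + V3 per component; with _XM_FMA3_INTRINSICS_ it becomes a single fused _mm_fmadd_ps instead of the separate multiply and add of the SSE path. A sketch of a typical use, not part of the commit (DirectXMath also ships XMVectorLerp; this helper is only illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// a + t * (b - a), phrased as (b - a) * t + a so it can map to one FMA.
XMVECTOR XM_CALLCONV LerpSketch(FXMVECTOR a, FXMVECTOR b, float t)
{
    XMVECTOR delta = XMVectorSubtract(b, a);
    return XMVectorMultiplyAdd(delta, XMVectorReplicate(t), a);
}
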
@@ -3063,6 +3067,8 @@ inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vmlsq_f32( V3, V1, V2 );
#elif defined(_XM_FMA3_INTRINSICS_)
return _mm_fnmadd_ps(V1, V2, V3);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR R = _mm_mul_ps( V1, V2 );
return _mm_sub_ps( V3, R );
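
XMVectorNegativeMultiplySubtract computes V3 - V1 * V2 per component, which is exactly what _mm_fnmadd_ps (-(V1 * V2) + V3) provides as one fused instruction, matching the SSE multiply-then-subtract fallback. A sketch of a typical use, not part of the commit (DirectXMath also ships XMVector3Reflect; this helper is only illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// incident - 2 * dot(incident, normal) * normal
XMVECTOR XM_CALLCONV ReflectSketch(FXMVECTOR incident, FXMVECTOR normal)
{
    XMVECTOR dot2 = XMVectorScale(XMVector3Dot(incident, normal), 2.0f);
    return XMVectorNegativeMultiplySubtract(dot2, normal, incident);
}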