mirror of
https://github.com/microsoft/DirectXMath
synced 2024-11-09 14:10:09 +00:00
AVX2/FMA3 optimization
This commit is contained in:
parent
58df665c97
commit
b83bff1f1c
@ -53,10 +53,22 @@
|
||||
#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
|
||||
#endif
|
||||
|
||||
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
|
||||
// Instruction-set selection cascade. Each macro below is defined only when it
// is not already defined, so the cascade never redefines an existing macro.

// The compiler is targeting AVX2 (__AVX2__) and intrinsics are not disabled
// via _XM_NO_INTRINSICS_: turn on the AVX2 code paths.
#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#define _XM_AVX2_INTRINSICS_
|
||||
#endif
|
||||
|
||||
// This header treats the AVX2 code paths as implying the FMA3 code paths.
#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
#define _XM_FMA3_INTRINSICS_
|
||||
#endif
|
||||
|
||||
// Likewise, the AVX2 code paths imply the F16C code paths.
#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
#define _XM_F16C_INTRINSICS_
|
||||
#endif
|
||||
|
||||
// FMA3 code paths turn on the AVX code paths as well.
#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
|
||||
#define _XM_AVX_INTRINSICS_
|
||||
#endif
|
||||
|
||||
// F16C code paths turn on the AVX code paths as well (no-op if FMA3 already did).
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
|
||||
#define _XM_AVX_INTRINSICS_
|
||||
#endif
|
||||
@ -1656,6 +1668,10 @@ template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECT
|
||||
// Specialization for the (1,1,3,3) swizzle: implemented with a single
// _mm_movehdup_ps, which duplicates the odd-indexed lanes of V.
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
|
||||
#endif
|
||||
|
||||
// AVX2 builds only: specialization for the (0,0,0,0) swizzle, implemented
// with a single _mm_broadcastss_ps that replicates lane 0 of V to all lanes.
#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps( V ); }
|
||||
#endif
|
||||
|
||||
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
|
||||
// Specialization for the (0,0,0,0) swizzle on ARM NEON: takes the low half of V
// and duplicates its lane 0 across the whole vector.
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
|
||||
|
@ -1993,10 +1993,16 @@ inline bool XMVerifyCPUSupport()
|
||||
|
||||
__cpuid(CPUInfo, 1);
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
|
||||
// The compiler can emit FMA3 instructions even without explicit intrinsics use
|
||||
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
|
||||
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
|
||||
#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_)
|
||||
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
|
||||
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
|
||||
#elif defined(_XM_FMA3_INTRINSICS_)
|
||||
if ((CPUInfo[2] & 0x18081001) != 0x18081001)
|
||||
return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
|
||||
#elif defined(_XM_F16C_INTRINSICS_)
|
||||
if ((CPUInfo[2] & 0x38080001) != 0x38080001)
|
||||
return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support
|
||||
@ -2015,7 +2021,7 @@ inline bool XMVerifyCPUSupport()
|
||||
if ((CPUInfo[3] & 0x6000000) != 0x6000000)
|
||||
return false; // No SSE2/SSE support
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
|
||||
__cpuidex(CPUInfo, 7, 0);
|
||||
if (!(CPUInfo[1] & 0x20))
|
||||
return false; // No AVX2 support
|
||||
|
@ -240,6 +240,8 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatX
|
||||
return vResult.v;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vdupq_lane_f32( vget_low_f32( V ), 0 );
|
||||
#elif defined(_XM_AVX2_INTRINSICS_)
|
||||
return _mm_broadcastss_ps( V );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
|
||||
#endif
|
||||
@ -3005,6 +3007,8 @@ inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
|
||||
return Result.v;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vmlaq_f32( V3, V1, V2 );
|
||||
#elif defined(_XM_FMA3_INTRINSICS_)
|
||||
return _mm_fmadd_ps( V1, V2, V3 );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = _mm_mul_ps( V1, V2 );
|
||||
return _mm_add_ps(vResult, V3 );
|
||||
@ -3063,6 +3067,8 @@ inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vmlsq_f32( V3, V1, V2 );
|
||||
#elif defined(_XM_FMA3_INTRINSICS_)
|
||||
return _mm_fnmadd_ps(V1, V2, V3);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR R = _mm_mul_ps( V1, V2 );
|
||||
return _mm_sub_ps( V3, R );
|
||||
|
Loading…
Reference in New Issue
Block a user