mirror of https://github.com/microsoft/DirectXMath synced 2024-11-09 14:10:09 +00:00

AVX2/FMA3 optimization

Chuck Walbourn 2017-05-18 16:14:07 -07:00
parent 58df665c97
commit b83bff1f1c
3 changed files with 31 additions and 3 deletions


@@ -53,10 +53,22 @@
#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
#endif
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX2_INTRINSICS_
#endif
#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_FMA3_INTRINSICS_
#endif
#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif
#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
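
The net effect of this block is an implication chain: defining _XM_AVX2_INTRINSICS_ (or building with a compiler switch that sets __AVX2__) turns on the FMA3 and F16C paths, and either of those in turn turns on the AVX path. A minimal sketch, not part of the commit, of how a consumer could confirm which paths ended up active in a translation unit that includes DirectXMath.h:

#include <DirectXMath.h>
#include <cstdio>

int main()
{
#if defined(_XM_AVX2_INTRINSICS_)
    std::puts("_XM_AVX2_INTRINSICS_ enabled");
#endif
#if defined(_XM_FMA3_INTRINSICS_)
    std::puts("_XM_FMA3_INTRINSICS_ enabled (implied by AVX2)");
#endif
#if defined(_XM_F16C_INTRINSICS_)
    std::puts("_XM_F16C_INTRINSICS_ enabled (implied by AVX2)");
#endif
#if defined(_XM_AVX_INTRINSICS_)
    std::puts("_XM_AVX_INTRINSICS_ enabled (implied by FMA3/F16C)");
#endif
    return 0;
}
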
@@ -1656,6 +1668,10 @@ template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECT
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
#endif
#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps( V ); }
#endif
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
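With the specialization above, XMVectorSwizzle<0,0,0,0> compiles to a single broadcast on AVX2 (_mm_broadcastss_ps) and on ARM NEON (vdupq_lane_f32) rather than a general shuffle. A small usage sketch, not part of the commit (the helper name is illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// Replicates v.x into all four lanes: { x, x, x, x }
XMVECTOR XM_CALLCONV SplatFirstComponent(FXMVECTOR v)
{
    return XMVectorSwizzle<0, 0, 0, 0>(v);
}
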


@@ -1993,10 +1993,16 @@ inline bool XMVerifyCPUSupport()
__cpuid(CPUInfo, 1);
#ifdef __AVX2__
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
// The compiler can emit FMA3 instructions even without explicit intrinsics use
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_)
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_)
if ((CPUInfo[2] & 0x18081001) != 0x18081001)
return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_F16C_INTRINSICS_)
if ((CPUInfo[2] & 0x38080001) != 0x38080001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support
@@ -2015,7 +2021,7 @@ inline bool XMVerifyCPUSupport()
if ((CPUInfo[3] & 0x6000000) != 0x6000000)
return false; // No SSE2/SSE support
#ifdef __AVX2__
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
__cpuidex(CPUInfo, 7, 0);
if (!(CPUInfo[1] & 0x20))
return false; // No AVX2 support
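
For reference, these masks test CPUID leaf 1 ECX bits: SSE3 (bit 0), FMA3 (bit 12), SSE4.1 (bit 19), OSXSAVE (bit 27), AVX (bit 28) and F16C (bit 29); the second check tests leaf 7 EBX bit 5 for AVX2. A caller-side sketch, not part of the commit, of the intended startup check when a binary is built with these paths enabled:

#include <DirectXMath.h>
#include <cstdio>

int main()
{
    // Fail fast if the host CPU lacks any instruction set this build assumes.
    if (!DirectX::XMVerifyCPUSupport())
    {
        std::fprintf(stderr, "CPU does not support the instruction sets required by this build.\n");
        return 1;
    }
    // Safe to call DirectXMath functions from here on.
    return 0;
}
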


@@ -240,6 +240,8 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatX
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vdupq_lane_f32( vget_low_f32( V ), 0 );
#elif defined(_XM_AVX2_INTRINSICS_)
return _mm_broadcastss_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
#endif
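
XMVectorSplatX replicates the x component into all four lanes; on AVX2 it now lowers to a single vbroadcastss via _mm_broadcastss_ps instead of a shuffle. A usage sketch, not part of the commit (the helper name is illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// Scales every component of v by its own x component.
XMVECTOR XM_CALLCONV ScaleByX(FXMVECTOR v)
{
    XMVECTOR xxxx = XMVectorSplatX(v);   // { v.x, v.x, v.x, v.x }
    return XMVectorMultiply(v, xxxx);
}
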
@@ -3005,6 +3007,8 @@ inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vmlaq_f32( V3, V1, V2 );
#elif defined(_XM_FMA3_INTRINSICS_)
return _mm_fmadd_ps( V1, V2, V3 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = _mm_mul_ps( V1, V2 );
return _mm_add_ps(vResult, V3 );
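
XMVectorMultiplyAdd computes V1 * V2 + V3 per component; with _XM_FMA3_INTRINSICS_ it becomes a single fused _mm_fmadd_ps instead of the separate multiply and add of the SSE path. A sketch of a typical use, not part of the commit (DirectXMath also ships XMVectorLerp; this helper is only illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// a + t * (b - a), phrased as (b - a) * t + a so it can map to one FMA.
XMVECTOR XM_CALLCONV LerpSketch(FXMVECTOR a, FXMVECTOR b, float t)
{
    XMVECTOR delta = XMVectorSubtract(b, a);
    return XMVectorMultiplyAdd(delta, XMVectorReplicate(t), a);
}
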
@@ -3063,6 +3067,8 @@ inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vmlsq_f32( V3, V1, V2 );
#elif defined(_XM_FMA3_INTRINSICS_)
return _mm_fnmadd_ps(V1, V2, V3);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR R = _mm_mul_ps( V1, V2 );
return _mm_sub_ps( V3, R );
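
XMVectorNegativeMultiplySubtract computes V3 - V1 * V2 per component, which is exactly what _mm_fnmadd_ps (-(V1 * V2) + V3) provides as one fused instruction, matching the SSE multiply-then-subtract fallback. A sketch of a typical use, not part of the commit (DirectXMath also ships XMVector3Reflect; this helper is only illustrative):

#include <DirectXMath.h>
using namespace DirectX;

// incident - 2 * dot(incident, normal) * normal
XMVECTOR XM_CALLCONV ReflectSketch(FXMVECTOR incident, FXMVECTOR normal)
{
    XMVECTOR dot2 = XMVectorScale(XMVector3Dot(incident, normal), 2.0f);
    return XMVectorNegativeMultiplySubtract(dot2, normal, incident);
}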