/***
    Copyright (C) 2023-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: SMTYield.hpp
    Date: 2023-3-12
    Author: Reece
***/
# pragma once
2023-08-19 17:14:28 +00:00
namespace Aurora : : Threading
{
inline AuUInt32 gHasThreadLocalTimeout { } ;
2023-09-06 15:24:43 +00:00
inline thread_local AuUInt32 tlsSpinCountLocal { } ;
2023-08-19 17:14:28 +00:00
}
2024-01-02 02:49:23 +00:00
extern "C"
{
    // Returns the calling thread's APIC number (implemented in platform code
    // elsewhere); used below to index gCoreTable and find the SMT sibling.
    AuUInt32 SMTGetAPICNumber(void);
}

#include <Source/Extensions/Clocks.aarch64.hpp>

// When 1, the un-throttled spin loops issue four pauses per poll.
#define SPIN_FOUR 1
namespace Aurora : : Threading : : Primitives
{
2023-09-09 16:37:14 +00:00
namespace ThrdCfg
{
inline bool gPlatformIsSMPProcessorOptimized { } ; // to include or not to include 🤔
inline bool gEnableAggressiveScheduling { } ;
inline bool gEnableAgrSchedulingRatelimit { } ;
inline bool gPreferNtCondvarModernWinSpin { } ;
inline bool gPreferNtCondvarOlderWinSpin { } ;
inline bool gPreferNtSemaphoreSpinTryLock { } ;
inline bool gPreferNtMutexSpinTryLock { } ;
inline bool gPreferNtCondMutexSpinTryLock { } ;
inline bool gPreferLinuxSemaphoreSpinTryLock { } ;
inline bool gPreferLinuxMutexSpinTryLock { } ;
inline bool gPreferLinuxCondMutexSpinTryLock { } ;
inline bool gPreferEmulatedWakeOnAddress { } ;
inline bool gPreferWaitOnAddressAlwaysSpin { } ;
2023-10-30 14:50:28 +00:00
inline bool gPreferWaitOnAddressAlwaysSpinNative { } ;
2023-09-09 16:37:14 +00:00
inline bool gPreferRWLockReadLockSpin { } ;
inline bool gUWPNanosecondEmulationCheckFirst { } ;
inline AuUInt32 gUWPNanosecondEmulationMaxYields { } ;
inline bool gForceEnableAdaptiveSpin { } ;
inline bool gPreferEnableAdaptiveSpin { } ;
inline bool gPreferLinuxAdaptiveSpin { } ;
inline bool gPreferOldWin32AdaptiveSpin { } ;
inline bool gPreferNewWin32AdaptiveSpin { } ;
inline AuUInt32 gAdaptiveSpinCUCnt0 { } ;
inline AuUInt32 gAdaptiveSpinCUCnt4 { } ;
inline AuUInt32 gAdaptiveSpinCUCnt8 { } ;
inline AuUInt32 gAdaptiveSpinCUCnt16 { } ;
inline bool gPreferFutexRWLock { } ;
2023-09-10 13:50:59 +00:00
inline bool gPreferFutexEvent { } ;
2023-09-09 16:37:14 +00:00
inline bool gWinXpThrough7BlazeOptimizerPower { } ;
inline bool gPreferLinuxPrimitivesFutexNoSpin { } ;
inline bool gPreferUnixPrimitivesNoSpin { } ;
inline bool gAlwaysRWLockWriteBiasOnReadLock { } ;
inline bool gEnableRWLockWriteBiasOnReadLock { } ;
2024-01-02 02:49:23 +00:00
inline AuUInt8 gCountOfPCores { } ;
2023-09-09 16:37:14 +00:00
}
2024-01-02 02:49:23 +00:00
# if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
using SMTAtomic_t = AuUInt8 ;
# else
using SMTAtomic_t = AuUInt32 ;
# endif
inline SMTAtomic_t gCoreTable [ 256 ] { } ;
2023-09-09 16:37:14 +00:00
2023-08-22 11:57:47 +00:00
inline AuUInt32 gSpinAdaptiveThreshold { } ;
inline AuUInt32 gSpinAdaptiveCurrentCount { } ;
2023-09-09 17:09:22 +00:00
inline AuUInt32 gSpinAdaptiveThreadCount { } ;
2023-08-22 11:57:47 +00:00
2023-08-23 13:52:47 +00:00
inline AuUInt32 gUseFutexRWLock { } ;
2023-09-10 13:50:59 +00:00
inline AuUInt32 gPreferFutexEvent { } ;
2023-08-23 13:52:47 +00:00
2023-08-22 11:57:47 +00:00
void InitAdaptiveThreshold ( ) ;
void InitAdaptiveThresholdFirstTime ( ) ;
2023-09-09 16:37:14 +00:00
void InitCfg ( ) ;
2023-08-22 11:57:47 +00:00
2024-01-02 02:49:23 +00:00
static const bool kEnableSmartScheduling =
# if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
true ;
# else
// tbd by arch and os
false ;
# endif
2023-03-12 15:27:28 +00:00
static auline void SMPPause ( )
{
# if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
_mm_pause ( ) ;
# elif defined(AURORA_ARCH_ARM)
# if defined(AURORA_COMPILER_GCC)
asm volatile ( " yield " ) ;
# else
__yield ( ) ;
# endif
# else
// TODO: your platform here
AuThreading : : ContextYield ( ) ;
# endif
}
2024-01-02 02:49:23 +00:00
# if defined(AURORA_ARCH_ARM)
static AuUInt64 ConvertArmTicks ( AuUInt64 uCounter , AuUInt64 uFreq )
{
if ( uFreq = = 10000000 )
{
return uCounter * 100ull ;
}
else if ( uFreq = = 1000000 )
{
return uCounter * 1000ull ;
}
else if ( uFreq = = 100000 )
{
return uCounter * 10000ull ;
}
else if ( uFreq = = 100000000ull )
{
return uCounter * 10ull ;
}
else if ( uFreq = = 1000000000ull )
{
return uCounter ;
}
else
{
const long long uWhole = ( uCounter / uFreq ) * 1'000'000'000ull ;
const long long uPart = ( uCounter % uFreq ) * 1'000'000'000ull / uFreq ;
return uWhole + uPart ;
}
}
static AuUInt64 RdtscArmEmulated ( AuUInt64 uClockFreq )
{
return ConvertArmTicks ( ArmQueryClockCounter ( ) , uClockFreq ) * 4 ;
// context:
// Intel recommends we spin, considering the potential for exponential back-offs later on, with a coefficient based on the CPUID brand of the processor.
// Under most processors, RDTSC is not that of the instruction counter. That'd be worthless; modern processors are ingesting hundreds of instructions to speculate on.
// Instead, RDTSC reads back a steady system-wide clock (*). It doesn't scale per core, nor can you overclock it.
// Back to Intels recommentation, instead of spamming your processes execution pipeline with mm_pauses in a loop, you should query RDTSC to solve the ABA problem and normalize for changes in the micro-architecture.
// This does allow Intel to decrease this potentially-NOP mm_pause sleep period by changing the stated base clock.
// On the aarch side of things, we should be able to match the exact Intel behaviour by:
// * Reading the system wide clock (CNTVCT_EL0)
// * Normalizing to nanoseconds with the given frequency (CNTFRQ_EL0)
// * Divide by approx "3.6 Ghz" ops/ns
// *: Ok, techincally you can/need to verify Invariant TSC: CPUID.80000007H:EDX[8], but who actually cares?
}
# define __rdtsc() RdtscArmEmulated(uClockFreq)
# define ALT_RDT
# endif
2023-03-12 15:27:28 +00:00
template < typename T >
bool auline YieldToSharedCore ( long spin , T callback )
{
if ( callback ( ) )
{
return true ;
}
2024-01-02 02:49:23 +00:00
# if defined(AURORA_ARCH_ARM)
AuUInt64 uClockFreq { ArmQueryClockFrequency ( ) } ;
# endif
if ( kEnableSmartScheduling )
{
bool bRet { false } ;
auto uWord = SMTGetAPICNumber ( ) ;
if ( uWord < AuArraySize ( gCoreTable ) & &
uWord < ThrdCfg : : gCountOfPCores )
{
AuAtomicStore < SMTAtomic_t > ( & gCoreTable [ uWord ] , 1u ) ;
auto uNow = AuAtomicAdd ( & gSpinAdaptiveCurrentCount , 1u ) ;
if ( uNow < = gSpinAdaptiveThreshold )
{
auto uCount = spin ;
if ( AuAtomicLoad ( & gCoreTable [ uWord ^ 1 ] ) )
{
uCount / = 5 ;
}
else if ( gHasThreadLocalTimeout )
{
uCount + = tlsSpinCountLocal ;
}
# if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
auto perfCounter = __rdtsc ( ) + uCount ;
while ( __rdtsc ( ) < perfCounter )
# else
while ( uCount > 0 )
# endif
{
if ( callback ( ) )
{
bRet = true ;
break ;
}
else
{
SMPPause ( ) ;
uCount - - ;
}
}
}
AuAtomicStore < SMTAtomic_t > ( & gCoreTable [ uWord ] , 0u ) ;
AuAtomicSub ( & gSpinAdaptiveCurrentCount , 1u ) ;
}
2023-03-12 15:27:28 +00:00
2024-01-02 02:49:23 +00:00
return bRet ;
}
else if ( gSpinAdaptiveThreshold )
2023-03-12 15:27:28 +00:00
{
2023-08-22 11:57:47 +00:00
auto uNow = AuAtomicAdd ( & gSpinAdaptiveCurrentCount , 1u ) ;
if ( uNow < = gSpinAdaptiveThreshold )
{
2023-09-06 15:24:43 +00:00
auto uCount = spin ;
2024-01-02 02:49:23 +00:00
# if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
2023-09-06 15:24:43 +00:00
auto perfCounter = __rdtsc ( ) + uCount ;
2023-09-02 13:29:55 +00:00
while ( __rdtsc ( ) < perfCounter )
# else
2023-09-06 15:24:43 +00:00
while ( uCount > 0 )
2023-09-02 13:29:55 +00:00
# endif
2023-08-22 11:57:47 +00:00
{
if ( callback ( ) )
{
AuAtomicSub ( & gSpinAdaptiveCurrentCount , 1u ) ;
return true ;
}
2023-08-27 20:27:49 +00:00
else
{
2023-09-02 13:29:55 +00:00
# if defined(SPIN_FOUR) && SPIN_FOUR == 1
2023-08-27 20:27:49 +00:00
SMPPause ( ) ;
SMPPause ( ) ;
SMPPause ( ) ;
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - = 4 ;
2023-09-02 13:29:55 +00:00
# else
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - = 1 ;
2023-09-02 13:29:55 +00:00
# endif
2023-08-27 20:27:49 +00:00
}
2023-08-22 11:57:47 +00:00
}
2023-03-12 15:27:28 +00:00
2023-08-22 11:57:47 +00:00
if ( gHasThreadLocalTimeout )
{
2023-09-06 15:24:43 +00:00
auto uCount = tlsSpinCountLocal ;
2023-09-04 22:03:08 +00:00
# if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
2023-09-06 15:24:43 +00:00
auto perfCounter = __rdtsc ( ) + uCount ;
2023-09-04 22:03:08 +00:00
while ( __rdtsc ( ) < perfCounter )
# else
2023-09-06 15:24:43 +00:00
while ( uCount > 0 )
2023-09-04 22:03:08 +00:00
# endif
2023-08-22 11:57:47 +00:00
{
if ( callback ( ) )
{
AuAtomicSub ( & gSpinAdaptiveCurrentCount , 1u ) ;
return true ;
}
2023-08-27 20:27:49 +00:00
else
{
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - - ;
2023-08-27 20:27:49 +00:00
}
2023-08-22 11:57:47 +00:00
}
}
2023-03-12 15:27:28 +00:00
2023-08-22 11:57:47 +00:00
AuAtomicSub ( & gSpinAdaptiveCurrentCount , 1u ) ;
}
2023-09-19 00:38:16 +00:00
else if ( uNow < = ( gSpinAdaptiveThreadCount / 4 * 3 ) )
2023-03-12 15:27:28 +00:00
{
2023-09-06 15:24:43 +00:00
auto uCount = ( spin ) / 3 ;
2023-09-02 13:29:55 +00:00
# if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86)
2023-09-06 15:24:43 +00:00
auto perfCounter = __rdtsc ( ) + uCount ;
2023-09-02 13:29:55 +00:00
while ( __rdtsc ( ) < perfCounter )
# else
2023-09-06 15:24:43 +00:00
while ( uCount > 0 )
2023-09-02 13:29:55 +00:00
# endif
2023-08-22 11:57:47 +00:00
{
if ( callback ( ) )
{
AuAtomicSub ( & gSpinAdaptiveCurrentCount , 1u ) ;
return true ;
}
2023-08-27 20:27:49 +00:00
else
{
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - - ;
2023-08-27 20:27:49 +00:00
}
2023-08-22 11:57:47 +00:00
}
2023-03-12 15:27:28 +00:00
}
2023-08-22 11:57:47 +00:00
AuAtomicSub ( & gSpinAdaptiveCurrentCount , 1u ) ;
}
else
2023-08-19 17:14:28 +00:00
{
2023-09-06 15:24:43 +00:00
auto uCount = spin ;
2024-01-02 02:49:23 +00:00
# if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
2023-09-06 15:24:43 +00:00
auto perfCounter = __rdtsc ( ) + uCount ;
2023-09-02 13:29:55 +00:00
while ( __rdtsc ( ) < perfCounter )
# else
2023-09-06 15:24:43 +00:00
while ( uCount > 0 )
2023-09-02 13:29:55 +00:00
# endif
2023-08-19 17:14:28 +00:00
{
if ( callback ( ) )
{
return true ;
}
2023-08-27 20:27:49 +00:00
else
{
2023-09-02 13:29:55 +00:00
# if defined(SPIN_FOUR) && SPIN_FOUR == 1
2023-08-27 20:27:49 +00:00
SMPPause ( ) ;
SMPPause ( ) ;
SMPPause ( ) ;
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - = 4 ;
2023-09-02 13:29:55 +00:00
# else
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - = 1 ;
2023-09-02 13:29:55 +00:00
# endif
2023-08-27 20:27:49 +00:00
}
2023-08-19 17:14:28 +00:00
}
2023-08-22 11:57:47 +00:00
if ( gHasThreadLocalTimeout )
{
auto uCount = tlsSpinCountLocal ;
2024-01-02 02:49:23 +00:00
# if defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86) || defined(ALT_RDT)
2023-09-06 15:24:43 +00:00
auto perfCounter = __rdtsc ( ) + uCount ;
2023-09-04 22:03:08 +00:00
while ( __rdtsc ( ) < perfCounter )
# else
2023-09-06 15:24:43 +00:00
while ( uCount > 0 )
2023-09-04 22:03:08 +00:00
# endif
2023-08-22 11:57:47 +00:00
{
if ( callback ( ) )
{
return true ;
}
2023-08-27 20:27:49 +00:00
else
{
SMPPause ( ) ;
2023-09-06 15:24:43 +00:00
uCount - - ;
2023-08-27 20:27:49 +00:00
}
2023-08-22 11:57:47 +00:00
}
}
2023-08-19 17:14:28 +00:00
}
2023-03-12 15:27:28 +00:00
return callback ( ) ;
}
template < typename T >
bool auline DoTryIf ( T callback )
{
2023-09-09 16:37:14 +00:00
if ( ThrdCfg : : gPlatformIsSMPProcessorOptimized )
2023-03-12 15:27:28 +00:00
{
return YieldToSharedCore ( gRuntimeConfig . threadingConfig . uSpinLoopPowerA , callback ) ;
}
else
{
return callback ( ) ;
}
}
}