/***
    Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: AuWaitFor.cpp
    Date: 2021-6-12
    Author: Reece
***/
#include <Source/RuntimeInternal.hpp> // assumption: the runtime's internal prelude; the original include target was lost
#include "AuWaitFor.hpp"

#if defined(AURORA_IS_LINUX_DERIVED)
    #include <sched.h>
    #include <unistd.h>
    #include <time.h>
    #include <sys/time.h>
    #include <sys/resource.h>
#endif

#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
    #include <immintrin.h> // _mm_pause
#endif

// Read the local header file for this file.
// The original idea was sane.
// The implementation, not so much...
// TODO: REWRITE!
namespace Aurora::Threading
{
    static void YieldToSharedCore(long spin)
    {
        int loops = (1 << spin);
        while (loops > 0)
        {
        #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
            _mm_pause();
        #endif
            loops -= 1;
        }
    }

    void YieldToOtherThread()
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        SwitchToThread();
    #elif defined(AURORA_IS_LINUX_DERIVED)
        sched_yield();
    #else
        YieldToSharedCore(12);
    #endif
    }

    template <AuUInt32 Flags> // forcefully optimize by templating a constant argument
    static inline void _FastSnooze(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS, int &alpha, int &bravo, bool &forceSpin)
    {
        // TODO: rewrite me
        AuUInt64 now = Time::SteadyClockNS();

        // Begin least likely checks, we're getting on now
        // Ironically we need to burn off some CPU cycles
        AuUInt64 deltaNS = now - startTime;

        #define SHOULD_SWITCH_ASAP(yieldDelayThresholdNs, roundTripNs)\
            (static_cast<AuUInt32>(Flags) & kYieldFlagsContextSwitchASAP)

        // Validate we have at least one whole average of a context switch of overhead remaining
        #define HAS_ENOUGH_TIME_FOR_TIMED_SLEEP(yieldDelayThresholdNs, roundTripNs)\
            (maxStallNS >= (roundTripNs + deltaNS))

        // The point of rewriting kernel-free userland thread components is to delegate everything to userland.
        // One key reason is single-app performance. Should we not know how long to yield for, giving an undefined
        // ...amount of time to other applications might be a bad thing. fuck.
        // why can't we have rtos functionality :(
        #define HAS_ENOUGH_TIME_FOR_INFINITE_SLEEP(yieldDelayThresholdNs, roundTripNs)\
            ((static_cast<AuUInt32>(Flags) & kYieldFlagsContextSwitchForever) && (!maxStallNS))

        // Perform a good faith guess at assuming we have enough overhead for a hard context switch
        #define HAS_ENOUGH_TIME_OVERHEAD(yieldDelayThresholdNs, roundTripNs)\
            (HAS_ENOUGH_TIME_FOR_INFINITE_SLEEP(yieldDelayThresholdNs, roundTripNs) || HAS_ENOUGH_TIME_FOR_TIMED_SLEEP(yieldDelayThresholdNs, roundTripNs))

        // Validate enough time (let's say 1/3rd of the approximated time of a preemptive switch or sleep(0)) has passed
        #define HAS_ENOUGH_TIME_PASSED(yieldDelayThresholdNs, roundTripNs)\
            (deltaNS > yieldDelayThresholdNs)

        #define SHOULD_CTXSWAP(yieldDelayThresholdNs, roundTripNs)\
            if (SHOULD_SWITCH_ASAP(yieldDelayThresholdNs, roundTripNs) || (HAS_ENOUGH_TIME_PASSED(yieldDelayThresholdNs, roundTripNs) && HAS_ENOUGH_TIME_OVERHEAD(yieldDelayThresholdNs, roundTripNs)))

    #if defined(AURORA_IS_LINUX_DERIVED)
        SHOULD_CTXSWAP(kLinuxYieldTimeThresNano, kPredictedLinuxKernelTimeRTNano)
        {
            // we are not very nice :D
            setpriority(PRIO_PROCESS, 0, bravo);

            timespec fuck = {0, kLinuxYieldTimeNano}; // local, not static: nanosleep may write a remainder; never feed it back in as the next request
            nanosleep(&fuck, nullptr);

            setpriority(PRIO_PROCESS, 0, alpha);
            forceSpin = true;
            return;
        }
    #endif

    #if defined(AURORA_PLATFORM_WIN32)
        SHOULD_CTXSWAP(kPredictedNTOSSwitchTimeYDNS, kPredictedNTOSSwitchTimeRTNS)
        {
            // TODO:
            ::Sleep(1);
            return;
        }
    #endif

        // Always at least try to burn some cycles off in a spinlock-esque time waster
        YieldToOtherThread();
    }
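    // The macro ladder above reduces to a three-tier back-off decision: switch
    // immediately when the ASAP flag is set; otherwise context switch only once
    // enough time has burned AND a whole switch round-trip still fits in the
    // budget; failing that, stay on-core. A minimal sketch of the same ladder
    // follows, with illustrative constants standing in for the tuned
    // kPredicted*/kLinuxYieldTime* values (the 10us/3us figures are assumptions,
    // not measurements):
    [[maybe_unused]] static void ExampleBackoffStep(AuUInt64 startTimeNs, AuUInt64 maxStallNS, bool switchASAP)
    {
        const AuUInt64 exampleRoundTripNs = 10000; // assumed cost of one hard context switch
        const AuUInt64 exampleThresholdNs = 3000;  // assumed minimum busy time before switching

        AuUInt64 deltaNS = Time::SteadyClockNS() - startTimeNs;

        bool enoughPassed   = deltaNS > exampleThresholdNs;
        bool enoughOverhead = (!maxStallNS) ||                                // infinite budget
                              (maxStallNS >= (exampleRoundTripNs + deltaNS)); // a switch still fits

        if (switchASAP || (enoughPassed && enoughOverhead))
        {
            YieldToOtherThread(); // the real path hard-sleeps (nanosleep / ::Sleep) here
            return;
        }

        YieldToSharedCore(4);     // otherwise burn cycles without leaving the core
    }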
    template <AuUInt32 Flags> // forcefully optimize by templating a constant argument
    static void FastSnooze(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS)
    {
    #if defined(AURORA_IS_LINUX_DERIVED)
        int alpha = getpriority(PRIO_PROCESS, 0);
        int bravo = AuMin(19, AuMax(15, alpha + 5)); // clamp the deprioritized nice value into [15, 19]
    #else
        int alpha = 0, bravo = 0;
    #endif
        bool spin = false;
        _FastSnooze<Flags>(count, startTime, maxStallNS, alpha, bravo, spin);
    }

    // NB: the template arguments of the last two instantiations were lost; the two
    // flag enumerators referenced above are the plausible reconstruction
    template void FastSnooze<0>(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS);
    template void FastSnooze<kYieldFlagsContextSwitchASAP>(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS);
    template void FastSnooze<kYieldFlagsContextSwitchForever>(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS);

    template <bool permitMultipleContextSwitches>
    static inline bool YieldPollTmpl(AuUInt64 &time, AuUInt64 deadlineNs, PollCallback_cb cb)
    {
    #if defined(AURORA_IS_LINUX_DERIVED)
        int alpha = getpriority(PRIO_PROCESS, 0);
        int bravo = AuMin(19, AuMax(15, alpha + 5));
    #else
        int alpha = 0, bravo = 0;
    #endif
        bool spin = false;
        long count = 0;

        // NB: the absolute deadline is forwarded to _FastSnooze as its relative
        // stall budget, so HAS_ENOUGH_TIME_FOR_TIMED_SLEEP almost always passes.
        // The rewrite TODO above stands.
        AuUInt64 a = Time::SteadyClockNS();
        do
        {
            if (permitMultipleContextSwitches)
            {
                // no timeout budget: let the snooze context switch indefinitely
                _FastSnooze<kYieldFlagsContextSwitchForever>(count, time, deadlineNs, alpha, bravo, spin);
            }
            else
            {
                _FastSnooze<0>(count, time, deadlineNs, alpha, bravo, spin);
            }

            if (cb())
            {
                return true;
            }

            a = Time::SteadyClockNS();
        } while ((!deadlineNs) || (deadlineNs > a));

        return cb();
    }

    AUKN_SYM bool YieldPollNs(bool permitMultipleContextSwitches, AuUInt64 timeoutNs, PollCallback_cb cb)
    {
        // NB: the poll loop compares timeoutNs against the steady clock directly,
        // so this is an absolute deadline; YieldPoll below derives it from a
        // relative millisecond timeout.
        AuUInt64 time = Time::SteadyClockNS();

        if (cb())
        {
            return true;
        }

        if (timeoutNs)
        {
            // only relevant when there's no timeout; FastSnooze will do its own magic given the template parameters
            permitMultipleContextSwitches = false;
        }

        // do not trust the compiler to branch here with a mere Func(...)
        // it's far more likely the branch will be handled in our yield loop
        if (permitMultipleContextSwitches)
        {
            return YieldPollTmpl<true>(time, timeoutNs, cb);
        }
        else
        {
            return YieldPollTmpl<false>(time, timeoutNs, cb);
        }
    }
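    // Usage sketch (illustrative, not part of the runtime): poll a condition for
    // roughly two milliseconds. Per the note above, the second argument is an
    // absolute steady-clock deadline rather than a relative budget:
    [[maybe_unused]] static bool ExamplePollForTwoMs(PollCallback_cb cb)
    {
        return YieldPollNs(false, Time::SteadyClockNS() + 2000000ull, cb);
    }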
    AUKN_SYM bool YieldPoll(bool permitMultipleContextSwitches, AuUInt64 timeoutMs, PollCallback_cb cb)
    {
        AuUInt64 time = Time::SteadyClockNS();
        AuUInt64 timeoutNs = timeoutMs ? (time + (timeoutMs * 1000000)) : 0; // relative ms -> absolute ns deadline

        if (cb())
        {
            return true;
        }

        if (timeoutMs)
        {
            // only relevant when there's no timeout; FastSnooze will do its own magic given the template parameters
            permitMultipleContextSwitches = false;
        }

        // do not trust the compiler to branch here with a mere Func(...)
        // it's far more likely the branch will be handled in our yield loop
        if (permitMultipleContextSwitches)
        {
            return YieldPollTmpl<true>(time, timeoutNs, cb);
        }
        else
        {
            return YieldPollTmpl<false>(time, timeoutNs, cb);
        }
    }

    static bool WaitLogicHandledByImplementor(bool &status, IWaitable *waitable, AuUInt64 timeout)
    {
        if (!waitable->HasLockImplementation())
        {
            return false;
        }

        status = waitable->LockMS(timeout);
        return true;
    }

    static bool WaitLogicHandledByNTOS(bool &status, IWaitable *waitable, AuUInt64 timeout)
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        AuMach handle = 0;
        if (!waitable->HasOSHandle(handle))
        {
            return false;
        }

        auto win32 = reinterpret_cast<HANDLE>(handle);
        auto ret = WaitForSingleObject(win32, timeout ? static_cast<DWORD>(timeout) : INFINITE);
        SysAssert(ret != WAIT_FAILED, "Internal Win32 Error {}", GetLastError());

        status = ret != WAIT_TIMEOUT;
        return true;
    #else
        return false;
    #endif
    }

    AUKN_SYM bool WaitFor(IWaitable *waitable, AuUInt64 timeout)
    {
        bool status;

        if (WaitLogicHandledByNTOS(status, waitable, timeout))
        {
            return status;
        }

        if (WaitLogicHandledByImplementor(status, waitable, timeout))
        {
            return status;
        }

        return YieldPoll(true, timeout, [=]()
        {
            return waitable->TryLock();
        });
    }
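    // Usage sketch (illustrative, not part of the runtime): the single-object
    // WaitFor above degrades gracefully from a kernel wait (NT handle), to the
    // waitable's own LockMS, to the userland poll loop; callers never see which
    // path was taken:
    [[maybe_unused]] static bool ExampleAcquireWithTimeout(IWaitable *waitable)
    {
        return WaitFor(waitable, 500); // block for up to 500 ms
    }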
    static bool CanWin32HandleAll(const AuList<IWaitable *> &waitables)
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        for (auto &waitable : waitables)
        {
            AuMach handle = 0;
            if (!waitable->HasOSHandle(handle))
            {
                return false;
            }
        }
        return true;
    #else
        return false;
    #endif
    }

    static bool Win32HandleMultiple(const AuList<IWaitable *> &waitables, AuUInt64 timeoutMs)
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        AuList<HANDLE> winWaitables;
        winWaitables.resize(waitables.size());

        std::transform(waitables.begin(), waitables.end(), winWaitables.begin(),
            [](IWaitable *waitable) -> HANDLE
            {
                AuMach handle = 0;
                auto status = waitable->HasOSHandle(handle);
                SysAssert(status, "OS Handle was NULL");
                return reinterpret_cast<HANDLE>(handle);
            });

        auto status = WaitForMultipleObjectsEx(static_cast<DWORD>(winWaitables.size()), winWaitables.data(), TRUE, timeoutMs ? static_cast<DWORD>(timeoutMs) : INFINITE, TRUE);
        SysAssert(status != WAIT_FAILED, "Internal Win32 Error {}", GetLastError());

        return status != WAIT_TIMEOUT;
    #else
        return false;
    #endif
    }

    AUKN_SYM bool WaitFor(const AuList<IWaitable *> &waitables, AuUInt64 timeout)
    {
        if (CanWin32HandleAll(waitables))
        {
            return Win32HandleMultiple(waitables, timeout);
        }

        // I'm worried about the complexity of using a vector here:
        // we would have to hit O(n) and memcpy in the best case scenario on each object release.
        // unordered maps are glorified hash tables
        // maps are glorified binary trees
        // maps should be fast enough
        AuHashMap<AuMach, bool> releasedObjects;
        releasedObjects.reserve(waitables.size()); // pseudo reserve

        for (AuMach i = 0; i < waitables.size(); i++)
        {
            releasedObjects[i] = false;
        }

        // yield for all
        auto status = YieldPoll(true, timeout, [&]()
        {
            for (AuMach i = 0; i < waitables.size(); i++)
            {
                if (!releasedObjects[i])
                {
                    if (waitables[i]->TryLock())
                    {
                        releasedObjects[i] = true;
                    }
                    else
                    {
                        return false;
                    }
                }
            }
            return true;
        });

        // from the perspective of locks, should there be a timeout event, we need to go back and unlock whatever we acquired
        if (!status)
        {
            for (AuMach i = 0; i < waitables.size(); i++)
            {
                if (releasedObjects[i])
                {
                    waitables[i]->Unlock();
                }
            }
        }

        return status;
    }

    AUKN_SYM void ContextYield()
    {
        YieldToOtherThread();
    }
}
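// Usage sketch (illustrative, not part of the runtime): an all-of wait over a
// batch of waitables with a five second budget. On timeout, the WaitFor
// overload above has already rolled back any partially acquired locks, so the
// caller observes strictly all-or-nothing acquisition. (Assumes IWaitable is
// the Aurora::Threading interface used throughout this file.)
[[maybe_unused]] static bool ExampleWaitForAll(const AuList<Aurora::Threading::IWaitable *> &objects)
{
    return Aurora::Threading::WaitFor(objects, 5000 /* ms */);
}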