/***
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: ThreadPool.cpp
Date: 2021-10-30
Author: Reece
***/
#include <Source/RuntimeInternal.hpp>
#include "Async.hpp"
#include "ThreadPool.hpp"
#include "AsyncApp.hpp"
#include "WorkItem.hpp"
#include "Schedular.hpp"
#include "ThreadWorkerQueueShim.hpp"
#include <Source/IO/Loop/LSAsync.hpp>
namespace Aurora::Async
{
//STATIC_TLS(WorkerId_t, tlsWorkerId);
static thread_local AuWPtr<ThreadPool> gCurrentPool;
static const auto kMagicResortThreshold = 15;
static thread_local int tlsCallStack;
inline auto GetWorkerInternal(const AuSPtr<IThreadPool> &pool)
{
if (pool.get() == AuStaticCast<IAsyncApp>(gAsyncApp))
{
return AuUnsafeRaiiToShared(AuStaticCast<ThreadPool>(gAsyncApp));
}
return AuStaticPointerCast<ThreadPool>(pool);
}
AUKN_SYM WorkerPId_t GetCurrentWorkerPId()
{
auto lkPool = gCurrentPool.lock();
if (!lkPool) return {};
auto cpy = *lkPool->tlsWorkerId;
return WorkerPId_t(lkPool, cpy);
}
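// Note: when the calling thread isn't owned by any pool, the WorkerPId_t above is
// default constructed and carries a null pool. Callers can test for that the same
// way Spawn's last-hope TLS hook does (illustrative only):
//     auto pid = AuAsync::GetCurrentWorkerPId();
//     if (pid.pool) { /* running on a pool worker */ }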
//
ThreadPool::ThreadPool() : shutdownEvent_(false, false, true)
{
}
// internal pool interface
bool ThreadPool::WaitFor(WorkerId_t unlocker, const AuSPtr<Threading::IWaitable> &primitive, AuUInt32 timeoutMs)
{
return WaitFor(WorkerPId_t { AuAsync::GetCurrentWorkerPId().pool, unlocker }, primitive, timeoutMs);
}
bool ThreadPool::WaitFor(WorkerPId_t unlocker, const AuSPtr<Threading::IWaitable> &primitive, AuUInt32 timeoutMs)
{
auto curThread = GetThreadState();
if (!curThread)
{
return Threading::WaitFor(primitive.get(), timeoutMs);
}
bool workerIdMatches = (unlocker.second == curThread->id.second) || ((unlocker.second == Async::kThreadIdAny) && (GetThreadWorkersCount(unlocker.first) == 1));
if ((unlocker.first == curThread->id.first) &&
(unlocker.pool.get() == this) && // work group matches
(workerIdMatches)) // well, crap
{
bool queryAsync = false;
while (!(queryAsync ? primitive->TryLock() : Threading::WaitFor(primitive.get(), 2)))
{
queryAsync = CtxYield();
if (!queryAsync && this->shuttingdown_)
{
return false;
}
}
return true;
}
else
{
AuSPtr<ThreadState> pHandle;
{
AU_LOCK_GUARD(AuStaticCast<ThreadPool>(unlocker.pool)->rwlock_->AsReadable());
if ((pHandle = AuStaticCast<ThreadPool>(unlocker.pool)->GetThreadHandle(unlocker)))
{
AU_LOCK_GUARD(pHandle->externalFencesLock);
if (pHandle->exitingflag2)
{
return primitive->TryLock();
}
else
{
pHandle->externalFences.push_back(primitive.get());
}
}
else if (unlocker.pool.get() == this)
{
return primitive->LockMS(timeoutMs);
}
}
bool bRet = Threading::WaitFor(primitive.get(), timeoutMs);
if (pHandle)
{
AU_LOCK_GUARD(pHandle->externalFencesLock);
AuTryRemove(pHandle->externalFences, primitive.get());
}
return bRet;
}
}
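// WaitFor above takes one of three paths, roughly:
//  1. waiting on the calling worker itself: cooperatively pump work via CtxYield()
//     between short waits so re-entrant jobs can still complete;
//  2. waiting on another live worker: register the primitive in that worker's
//     externalFences list so its exit path can release us, then block normally;
//  3. no resolvable worker handle: fall back to a plain timed wait on the primitive.
// Illustrative: Barrier() below uses this as
//     WaitFor(workerId, AuUnsafeRaiiToShared(semaphore.AsPointer()), ms)
// so one worker can wait on another worker's sync semaphore without deadlocking itself.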
void ThreadPool::Run(WorkerId_t target, AuSPtr<IAsyncRunnable> runnable)
{
return this->Run(target, runnable, true);
}
void ThreadPool::Run(WorkerId_t target, AuSPtr<IAsyncRunnable> runnable, bool bIncrement)
{
auto state = GetGroup(target.first);
SysAssert(static_cast<bool>(state), "couldn't dispatch a task to an offline group");
auto pWorker = state->GetThreadByIndex(target.second);
if (!pWorker)
{
runnable->CancelAsync();
return;
}
AU_DEBUG_MEMCRUNCH;
if (bIncrement)
{
AuAtomicAdd(&this->uAtomicCounter, 1u);
}
state->workQueue.AddWorkEntry(pWorker.get(), AuMakePair(target.second, runnable));
if (target.second == Async::kThreadIdAny)
{
state->BroadCast();
}
else
{
if (AuAtomicLoad(&pWorker->cvSleepCount))
{
// Barrier:
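// Taking and releasing cvWorkMutex here pairs with PollInternal, which checks the
// queue and enters cvVariable->WaitForSignal() under the same mutex: once we have
// held the mutex, the worker is either before its queue check (and will see the new
// entry) or already parked in the wait (and will receive the Signal below), so the
// wakeup cannot be lost. cvSleepCount told us someone may be sleeping or about to.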
pWorker->cvWorkMutex->Lock();
pWorker->cvWorkMutex->Unlock();
pWorker->cvVariable->Signal();
}
if (AuAtomicTestAndSet(&pWorker->cvLSActive, 0u) == 0)
{
pWorker->eventLs->Set();
}
}
}
IThreadPool *ThreadPool::ToThreadPool()
{
return this;
}
// ithreadpool
size_t ThreadPool::GetThreadWorkersCount(ThreadGroup_t group)
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
return GetGroup(group)->workers.size();
}
void ThreadPool::SetRunningMode(bool eventRunning)
{
this->runnersRunning_ = eventRunning;
}
bool ThreadPool::Spawn(WorkerId_t workerId)
{
return Spawn(workerId, false);
}
bool ThreadPool::Create(WorkerId_t workerId)
{
return Spawn(workerId, true);
}
bool ThreadPool::InRunnerMode()
{
return this->runnersRunning_;
}
bool ThreadPool::Poll()
{
AuUInt32 uCount {};
return InternalRunOne(GetThreadStateNoWarn(), false, uCount);
}
bool ThreadPool::RunOnce()
{
AuUInt32 uCount {};
return InternalRunOne(GetThreadStateNoWarn(), true, uCount);
}
bool ThreadPool::Run()
{
bool ranOnce {};
auto pJobRunner = GetThreadStateNoWarn();
if (!pJobRunner)
{
this->shutdownEvent_->LockMS(0);
return true;
}
gCurrentPool = AuWeakFromThis();
auto auThread = AuThreads::GetThread();
while ((!auThread->Exiting()) &&
(!this->shutdown) &&
(!pJobRunner->bBreakEarly))
{
AuUInt32 uCount {};
// Do work (blocking)
if (!InternalRunOne(pJobRunner, true, uCount))
{
if (this->shutdown)
{
return ranOnce;
}
}
ranOnce = true;
}
return ranOnce;
}
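// InternalRunOne drives a single pump of a worker: it ticks the async (kernel IO)
// loop, then drains the user work queue via PollInternal. The run mode selected via
// UpdateWorkMode decides how the kernel loop is polled:
//  - eLowLatencyFreqKernel: only peek/pump the kernel queue when the rate limiter
//    elapses; otherwise poll user work non-blocking and yield if there was none;
//  - eLowLatencyYield:      always yield, never block, peek the kernel queue;
//  - eEfficient (default):  block in the kernel wait when the caller asked to block.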
bool ThreadPool::InternalRunOne(AuSPtr<ThreadState> state, bool block, AuUInt32 &uCount)
{
if (!state)
{
SysPushErrorUninitialized("Not an async thread");
return false;
}
bool success {};
auto runMode = GetCurrentThreadRunMode();
EarlyExitTick();
//do
{
auto asyncLoop = state->asyncLoop;
asyncLoop->OnFrame();
if (asyncLoop->GetSourceCount() > 1)
{
bool bShouldTrySleepForKernel {};
if (runMode == ERunMode::eLowLatencyFreqKernel)
{
if (state->rateLimiter.CheckExchangePass())
{
#if defined(AURORA_PLATFORM_WIN32)
bShouldTrySleepForKernel = asyncLoop->PumpNonblocking();
#else
bShouldTrySleepForKernel = asyncLoop->IsSignaledPeek();
#endif
}
else
{
if (!PollInternal(state, false, uCount))
{
AuThreading::ContextYield();
}
else
{
success = true;
}
}
}
else if (runMode == ERunMode::eLowLatencyYield)
{
AuThreading::ContextYield();
block = false;
#if defined(AURORA_PLATFORM_WIN32)
bShouldTrySleepForKernel = asyncLoop->PumpNonblocking();
#else
bShouldTrySleepForKernel = asyncLoop->IsSignaledPeek();
#endif
}
else if (runMode == ERunMode::eEfficient)
{
bShouldTrySleepForKernel = block;
if (!block)
{
bShouldTrySleepForKernel = asyncLoop->IsSignaledPeek();
}
}
if (bShouldTrySleepForKernel)
{
// epoll and the like can be checked without a read succeeding. kevent works on availability, not on scheduling a read like io_submit
// allow Windows to atomically pump instead of wasting time buffering the primitives' state
if ((AuBuild::kIsNtDerived && runMode == ERunMode::eEfficient) ||
(!AuBuild::kIsNtDerived))
{
AuAtomicAdd(&state->cvSleepCount, 1u);
asyncLoop->WaitAny(0);
AuAtomicSub(&state->cvSleepCount, 1u);
}
}
success = PollInternal(state, false, uCount);
}
else
{
success = PollInternal(state, block, uCount);
}
} //while (success);
EarlyExitTick();
return success;
}
void GroupWorkQueue::AddWorkEntry(ThreadState *pState, WorkEntry_t entry)
{
auto prio = (int)entry.second->GetPrio();
SysAssert(prio < AuAsync::kEWorkPrioCount, "Invalid PRIO");
AU_LOCK_GUARD(this->mutex);
this->sortedWork[prio].push_back(entry);
if (entry.first != kThreadIdAny)
{
if (auto pThat = pState->parent.lock()->GetThreadByIndex(entry.first))
{
AuAtomicAdd(&pThat->cvHasWork, 1u);
}
}
}
bool GroupWorkQueue::IsEmpty()
{
AU_LOCK_GUARD(this->mutex);
for (AU_ITERATE_N(i, AuAsync::kEWorkPrioCount))
{
if (this->sortedWork[i].size())
{
return false;
}
}
return true;
}
bool GroupWorkQueue::IsEmpty(ThreadPool *pPool, AuWorkerId_t id)
{
#if 1
auto pHandle = pPool->GetThreadHandle(id);
if (!pHandle)
{
return false;
}
return !AuAtomicLoad(&pHandle->cvHasWork);
#else
AU_LOCK_GUARD(this->mutex);
for (AU_ITERATE_N(i, AuAsync::kEWorkPrioCount))
{
for (const auto &[srcId, pA] : this->sortedWork[i])
{
if (id == srcId)
{
return false;
}
}
}
return true;
#endif
}
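// Dequeue pops up to maxPopCount entries for worker `id`, walking priorities from
// highest (kEWorkPrioMaxLegal) downwards; entries targeted at kThreadIdAny may be
// taken by any worker. Note the early break: once anything was popped from one
// priority band, lower bands are left untouched for this pass.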
void GroupWorkQueue::Dequeue(AuList<WorkEntry_t> &queue, int maxPopCount, AuAsync::ThreadId_t id)
{
AU_LOCK_GUARD(this->mutex);
for (AU_ITERATE_N(i, AuAsync::kEWorkPrioCount))
{
auto &group = this->sortedWork[(int)AuAsync::kEWorkPrioMaxLegal - i];
for (auto itr = group.begin(); ((itr != group.end()) && (queue.size() < maxPopCount)); )
{
if (itr->first == Async::kThreadIdAny)
{
queue.push_back(*itr);
itr = group.erase(itr);
continue;
}
if ((itr->first != Async::kThreadIdAny) &&
(itr->first == id))
{
queue.push_back(*itr);
itr = group.erase(itr);
continue;
}
itr++;
}
if (queue.size())
{
break;
}
}
}
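// DoThing compensates the cvHasWork hint after a dequeue: AddWorkEntry bumps it for
// entries targeted at a specific worker, and here we atomically subtract at least
// one (at most the number of items just popped), clamping so the counter never
// underflows.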
void ThreadPool::DoThing(ThreadState *pState)
{
auto uState = pState->cvHasWork;
auto uMin = AuMin(uState, pState->pendingWorkItems.size());
if (!uMin) uMin = 1;
while (uState &&
AuAtomicCompareExchange<AuUInt32>(&pState->cvHasWork, uState - uMin, uState) != uState)
{
uState = pState->cvHasWork;
if (uState < uMin)
{
uMin = uState;
}
}
}
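// PollInternal is the actual job pump for one worker, roughly:
//  1. under cvWorkMutex, pull up to multipopCount entries for this worker (or
//     kThreadIdAny) from the group queue; optionally sleep on cvVariable when
//     `block` is set and nothing is queued;
//  2. dispatch each popped runnable; state->cookie changing mid-loop (a nested pump
//     re-entered via CtxYield/Barrier) restarts iteration from the beginning;
//  3. push any undispatched leftovers back to the group queue, then, in runner mode,
//     shut the pool down once the global counter and the kernel queue are depleted.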
bool ThreadPool::PollInternal(AuSPtr<ThreadState> state, bool block, AuUInt32 &uCount)
{
if (!state)
{
SysPushErrorUninitialized("Not an async thread");
return false;
}
auto group = state->parent.lock();
//state->pendingWorkItems.clear();
{
AuAtomicAdd(&state->cvSleepCount, 1u);
AU_LOCK_GUARD(state->cvWorkMutex);
do
{
group->workQueue.Dequeue(state->pendingWorkItems, state->multipopCount, state->id.second);
this->DoThing(state.get());
// Consider blocking for more work
if (!block)
{
break;
}
// pre-wakeup thread terminating check
if (state->threadObject->Exiting())
{
break;
}
// Block if no work items are present
if (state->pendingWorkItems.empty())
{
if (this->shuttingdown_ & 2)
{
break;
}
state->cvVariable->WaitForSignal();
if (this->shuttingdown_ & 2)
{
break;
}
}
// Post-wakeup thread terminating check
if (state->threadObject->Exiting())
{
break;
}
if (state->pendingWorkItems.empty() && (
(this->GetThreadState()->asyncLoop->GetSourceCount() > 1) ||
this->GetThreadState()->asyncLoop->CommitPending())) //(this->ToKernelWorkQueue()->IsSignaledPeek()))
{
AuAtomicSub(&state->cvSleepCount, 1u);
return false;
}
}
while (state->pendingWorkItems.empty() && block);
if (!block) // quick hack: having polled externally (most likely for IO ticks, unlikely for intraprocess ticks) makes us worthy of an io reset
{
AU_LOCK_GUARD(group->workQueue.mutex); // don't atomically increment our work counters [signal under mutex group]...
AU_LOCK_GUARD(group->workersMutex); // don't atomically increment our work counters [broadcast]...
// ...these primitives are far less expensive to hit than resetting kernel primitives
// AU_LOCK_GUARD(state->cvWorkMutex) used to protect us
if (group->workQueue.IsEmpty(this, state->id))
{
state->eventLs->Reset(); // ...until we're done
AuAtomicStore(&state->cvLSActive, 0u);
}
}
AuAtomicSub(&state->cvSleepCount, 1u);
}
if (state->pendingWorkItems.empty())
{
if (InRunnerMode())
{
if ((this->uAtomicCounter == 0) &&
this->IsDepleted())
{
Shutdown();
}
}
return false;
}
int runningTasks {};
auto oldTlsHandle = AuExchange(gCurrentPool, AuSharedFromThis());
bool lowPrioCont {};
bool lowPrioContCached {};
state->cookie++;
int start = state->cookie;
// Account for
// while (AuAsync.GetCurrentPool()->runForever());
// in the first task (or deeper)
if (InRunnerMode() && tlsCallStack) // are we one call deep?
{
auto queue = ToKernelWorkQueue();
if ((this->uAtomicCounter == tlsCallStack) &&
this->IsDepleted())
{
return false;
}
}
//
for (auto itr = state->pendingWorkItems.begin(); itr != state->pendingWorkItems.end(); )
{
if (state->threadObject->Exiting() || this->shutdown)
{
break;
}
// Set the last frame time for a watchdog later down the line
state->lastFrameTime = Time::CurrentClockMS();
// Dispatch
auto oops = itr->second;
// Remove from our local job queue
itr = state->pendingWorkItems.erase(itr);
tlsCallStack++;
//SysBenchmark(fmt::format("RunAsync: {}", block));
// Dispatch
oops->RunAsync();
uCount++;
// Atomically decrement global task counter
runningTasks = AuAtomicSub(&this->uAtomicCounter, 1u);
tlsCallStack--;
if (start != state->cookie)
{
start = state->cookie;
itr = state->pendingWorkItems.begin();
}
}
gCurrentPool = oldTlsHandle;
// Return popped work back to the group's work pool when our pump loops were preempted
if (state->pendingWorkItems.size())
{
AU_LOCK_GUARD(state->cvWorkMutex);
for (const auto &item : state->pendingWorkItems)
{
group->workQueue.AddWorkEntry(state.get(), item);
}
state->pendingWorkItems.clear();
state->cvVariable->Broadcast();
state->eventLs->Set();
}
// Account for
// while (AuAsync.GetCurrentPool()->runForever());
// in the top most task
if (InRunnerMode())
{
auto queue = ToKernelWorkQueue();
if ((runningTasks == 0) &&
(this->uAtomicCounter == 0) &&
this->IsDepleted())
{
Shutdown();
}
}
return true;
}
// While much of this subsystem needs a good rewrite, under no circumstance should the shutdown process be "simplified" or "cleaned up"
// This is our expected behaviour. Any changes will likely introduce hard-to-catch bugs across various software and exit conditions.
void ThreadPool::Shutdown()
{
auto trySelfPid = AuAsync::GetCurrentWorkerPId();
// Update shutting down flag
// Specify the root-level shutdown flag for 'ok, u can work, but you're shutting down soon [microseconds, probably]'
{
if (AuAtomicTestAndSet(&this->shuttingdown_, 0) != 0)
{
return;
}
}
auto pLocalRunner = this->GetThreadStateNoWarn();
AuList<WorkerId_t> toBarrier;
{
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
for (auto pGroup : this->threadGroups_)
{
if (!pGroup)
{
continue;
}
for (auto &[id, worker] : pGroup->workers)
{
if (trySelfPid == worker->id)
{
continue;
}
toBarrier.push_back(worker->id);
}
}
}
for (const auto &id : toBarrier)
{
if (trySelfPid == id)
{
continue;
}
this->Barrier(id, 0, false, false /* no reject*/); // absolute safest point in shutdown; sync to already submitted work
}
}
{
for (const auto &id : toBarrier)
{
if (trySelfPid == id)
{
continue;
}
AuAtomicAdd(&this->uAtomicShutdownCookie, 1u);
}
}
// Time for fuckiness
// Specify the root-level shutdown flag for 'ok, u can work, but you're shutting down after sync barrier'
{
AuAtomicTestAndSet(&this->shuttingdown_, 1);
}
// Finally set the shutdown flag on all of our thread contexts
// then release them from the runners/workers list
// then release all group contexts
AuList<AuThreads::ThreadShared_t> threads;
AuList<AuSPtr<ThreadState>> states;
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
for (auto pGroup : this->threadGroups_)
{
if (!pGroup)
{
continue;
}
for (auto &[id, pState] : pGroup->workers)
{
// main loop:
if (pState && pState->cvWorkMutex && pState->cvVariable)
{
states.push_back(pState);
pState->shuttingdown = true;
}
else
{
pState->shuttingdown = true;
}
// thread object:
if (!pGroup->IsSysThread()) // bug?
{
pState->threadObject->SendExitSignal();
threads.push_back(pState->threadObject);
}
// unrefreeze signals:
auto &event = pState->running;
if (event)
{
event->Set();
}
}
}
}
{
for (const auto &pState : states)
{
AU_LOCK_GUARD(pState->cvWorkMutex);
pState->cvVariable->Broadcast();
pState->eventLs->Set();
}
}
// Final sync to exit
{
for (const auto &id : toBarrier)
{
if (trySelfPid == id)
{
continue;
}
auto handle = this->GetThreadHandle(id);
if (handle)
{
handle->rejecting = false;
handle->isDeadEvent->LockMS(250);
}
}
}
// Sync to shutdown threads to prevent a race condition whereby the async subsystem shuts down before the threads
for (const auto &thread : threads)
{
thread->Exit();
}
// Is dead flag
this->shutdown = true;
this->shutdownEvent_->Set();
if (pLocalRunner)
{
pLocalRunner->bIsKiller = true;
}
for (const auto &wOther : this->listWeakDepsParents_)
{
if (auto pThat = AuTryLockMemoryType(wOther))
{
if (pThat->InRunnerMode())
{
continue;
}
if (!pThat->IsSelfDepleted())
{
continue;
}
if (pThat->uAtomicCounter)
{
continue;
}
pThat->Shutdown();
}
}
}
bool ThreadPool::Exiting()
{
return this->shuttingdown_ ||
GetThreadState()->exiting;
}
AuUInt32 ThreadPool::PollAndCount(bool bStrict)
{
AuUInt32 uCount {};
auto bRanAtLeastOne = this->InternalRunOne(this->GetThreadStateNoWarn(), false, uCount);
return uCount ? uCount : (bStrict ? bRanAtLeastOne : 0);
}
AuUInt32 ThreadPool::RunAllPending()
{
AuUInt32 uCount {};
AuUInt32 uTotal {};
bool ranAtLeastOne {};
do
{
uCount = 0;
ranAtLeastOne |= this->InternalRunOne(this->GetThreadStateNoWarn(), false, uCount);
uTotal += uCount;
}
while (uCount);
return uTotal ? uTotal : (ranAtLeastOne ? 1u : 0u);
}
AuSPtr<IWorkItem> ThreadPool::NewWorkItem(const WorkerId_t &worker,
const AuSPtr<IWorkItemHandler> &task)
{
// Error pass-through
if (!task)
{
return {};
}
return AuMakeShared<WorkItem>(this, WorkerPId_t { this->SharedFromThis(), worker }, task);
}
AuSPtr<IWorkItem> ThreadPool::NewWorkFunction(const WorkerId_t &worker,
AuVoidFunc callback)
{
SysAssert(callback);
return AuMakeShared<FuncWorker>(this, WorkerPId_t { this->SharedFromThis(), worker }, AuMove(callback));
}
AuSPtr<IWorkItem> ThreadPool::NewFence()
{
return AuMakeShared<WorkItem>(this, AuAsync::GetCurrentWorkerPId(), AuSPtr<IWorkItemHandler>{});
}
AuThreads::ThreadShared_t ThreadPool::ResolveHandle(WorkerId_t id)
{
auto pState = GetThreadHandle(id);
if (!pState)
{
return {};
}
return pState->threadObject;
}
AuBST<ThreadGroup_t, AuList<ThreadId_t>> ThreadPool::GetThreads()
{
AU_LOCK_GUARD(rwlock_->AsReadable());
AuBST<ThreadGroup_t, AuList<ThreadId_t>> ret;
for (auto pGroup : this->threadGroups_)
{
AuList<ThreadId_t> workers;
if (!pGroup)
{
continue;
}
AuTryReserve(workers, pGroup->workers.size());
for (const auto &thread : pGroup->workers)
{
workers.push_back(thread.second->id.second);
}
ret[pGroup->group] = workers;
}
return ret;
}
WorkerId_t ThreadPool::GetCurrentThread()
{
return tlsWorkerId;
}
bool ThreadPool::Sync(WorkerId_t workerId, AuUInt32 timeoutMs, bool requireSignal)
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
auto group = GetGroup(workerId.first);
auto currentWorkerId = GetCurrentThread().second;
if (workerId.second == Async::kThreadIdAny)
{
for (auto &jobWorker : group->workers)
{
if (!Barrier(jobWorker.second->id, timeoutMs, requireSignal && jobWorker.second->id.second != currentWorkerId, false)) // BAD!, should subtract time elapsed, clamp to, i dunno, 5ms min?
{
return false;
}
}
}
else
{
return Barrier(workerId, timeoutMs, requireSignal && workerId.second != currentWorkerId, false);
}
return true;
}
void ThreadPool::Signal(WorkerId_t workerId)
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
auto group = GetGroup(workerId.first);
if (workerId.second == Async::kThreadIdAny)
{
for (auto &jobWorker : group->workers)
{
jobWorker.second->running->Set();
}
}
else
{
GetThreadHandle(workerId)->running->Set();
}
}
AuSPtr<AuLoop::ILoopSource> ThreadPool::WorkerToLoopSource(WorkerId_t workerId)
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
auto a = GetThreadHandle(workerId);
if (!a)
{
return {};
}
return a->asyncLoopSourceShared;
}
void ThreadPool::SyncAllSafe()
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
for (auto pGroup : this->threadGroups_)
{
if (!pGroup)
{
continue;
}
for (auto &jobWorker : pGroup->workers)
{
SysAssert(Barrier(jobWorker.second->id, 0, false, false));
}
}
}
void ThreadPool::AddFeature(WorkerId_t id,
AuSPtr<AuThreads::IThreadFeature> pFeature,
bool bNonBlock)
{
auto pWorkItem = DispatchOn({ this->SharedFromThis(), id }, [=]()
{
auto pState = GetThreadState();
AU_LOCK_GUARD(pState->featuresMutex);
pState->features.push_back(pFeature);
pFeature->Init();
});
if (!bNonBlock)
{
pWorkItem->BlockUntilComplete();
}
}
void ThreadPool::AssertInThreadGroup(ThreadGroup_t group)
{
SysAssert(static_cast<WorkerId_t>(tlsWorkerId).first == group);
}
void ThreadPool::AssertWorker(WorkerId_t id)
{
SysAssert(static_cast<WorkerId_t>(tlsWorkerId) == id);
}
AuSPtr<AuLoop::ILoopQueue> ThreadPool::ToKernelWorkQueue()
{
return this->GetThreadState()->asyncLoop;
}
AuSPtr<AuLoop::ILoopQueue> ThreadPool::ToKernelWorkQueue(WorkerId_t workerId)
{
auto worker = this->GetThreadHandle(workerId);
if (!worker)
{
SysPushErrorGeneric("Couldn't find requested worker");
return {};
}
return worker->asyncLoop;
}
void ThreadPool::UpdateWorkMode(WorkerId_t workerId, RunMode mode)
{
auto states = this->GetThreadHandles(workerId);
if (!states.size())
{
SysPushErrorGeneric("Couldn't find requested worker");
return;
}
for (const auto &state : states)
{
state->runMode = mode.mode;
if (mode.freqMsTick)
{
state->rateLimiter.SetNextStep(mode.freqMsTick * 1'000'000);
}
}
}
ERunMode ThreadPool::GetCurrentThreadRunMode()
{
auto state = this->GetThreadState();
if (!state)
{
return ERunMode::eEfficient;
}
return state->runMode;
}
ERunMode ThreadPool::GetThreadRunMode(WorkerId_t workerId)
{
auto worker = this->GetThreadHandle(workerId);
if (!worker)
{
SysPushErrorGeneric("Couldn't find requested worker");
return {};
}
return worker->runMode;
}
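// Depletion: a pool is "self depleted" when its kernel work queue only holds the
// bookkeeping sources (the wakeup loop source plus registered IO processors), i.e.
// no real IO is pending. IsDepleted() additionally requires every dependent pool
// registered through AddDependency() to be depleted and idle; the runner-mode
// auto-shutdown paths in PollInternal and Shutdown key off these checks.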
bool ThreadPool::IsSelfDepleted()
{
auto queue = ToKernelWorkQueue();
return (!queue || queue->GetSourceCount() <= 1 + this->uAtomicIOProcessorsWorthlessSources + this->uAtomicIOProcessors);
}
bool ThreadPool::IsDepleted()
{
if (!IsSelfDepleted())
{
return false;
}
for (const auto &wOther : this->listWeakDeps_)
{
if (auto pThat = AuTryLockMemoryType(wOther))
{
if (!pThat->IsSelfDepleted())
{
return false;
}
if (pThat->uAtomicCounter)
{
return false;
}
}
}
return true;
}
void ThreadPool::AddDependency(AuSPtr<IThreadPool> pPool)
{
if (!pPool)
{
return;
}
auto pOther = AuStaticCast<ThreadPool>(pPool);
this->listWeakDeps_.push_back(pOther);
pOther->listWeakDepsParents_.push_back(AuSharedFromThis());
}
AuSPtr<AuThreading::IWaitable> ThreadPool::GetShutdownEvent()
{
return AuSPtr<AuThreading::IWaitable>(AuSharedFromThis(), this->shutdownEvent_.AsPointer());
}
// Unimplemented fiber hooks, 'twas used for science. no longer in use
int ThreadPool::CtxPollPush()
{
// TODO (Reece): implement a context switching library
// Refer to the old implementation of this on pastebin
return 0;
}
void ThreadPool::CtxPollReturn(const AuSPtr<ThreadState> &state, int status, bool hitTask)
{
}
bool ThreadPool::CtxYield()
{
bool ranAtLeastOne = false;
// !!!
auto pA = this->GetThreadStateNoWarn();
if (this->shutdown ||
(this->shuttingdown_ & 2)) // fast
{
if (pA && pA->rejecting)
{
return false;
}
}
#if 0
return this->InternalRunOne(false, uCount);
#else
AuUInt32 uCount {};
do
{
uCount = 0;
ranAtLeastOne |= this->InternalRunOne(pA, false, uCount);
}
while (uCount);
return uCount || ranAtLeastOne;
#endif
}
//
void ThreadPool::IncrementAbortFenceOnPool()
{
AuAtomicAdd(&this->uAtomicShutdownCookie, 1u);
}
void ThreadPool::IncrementAbortFenceOnWorker(WorkerId_t workerId)
{
if (auto pState = this->GetThreadHandle(workerId))
{
AuAtomicAdd(&pState->uShutdownFence, 1u);
}
}
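// Abort fences: QueryAbortFence packs the per-worker uShutdownFence into the high
// 32 bits and the pool-wide uAtomicShutdownCookie into the low 32 bits. A caller
// snapshots the value before queuing long-running work and later hands it back to
// QueryShouldAbort; if either counter has since been bumped (IncrementAbortFence*),
// the work should be abandoned. Hypothetical usage sketch:
//     auto uFence = pPool->QueryAbortFence(myWorkerId);
//     /* ...later, inside the work item... */
//     if (pPool->QueryShouldAbort(myWorkerId, uFence)) { return; }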
AuUInt64 ThreadPool::QueryAbortFence(AuOptional<WorkerId_t> optWorkerId)
{
if (auto pState = this->GetThreadHandle(optWorkerId.value_or(GetCurrentWorkerPId())))
{
return (AuUInt64(pState->uShutdownFence) << 32ull) | AuUInt64(this->uAtomicShutdownCookie);
}
else
{
return this->uAtomicShutdownCookie;
}
}
bool ThreadPool::QueryShouldAbort(AuOptional<WorkerId_t> optWorkerId, AuUInt64 uFenceMagic)
{
auto uSelfCookie = AuBitsToLower(uFenceMagic);
if (uSelfCookie != AuAtomicLoad(&this->uAtomicShutdownCookie))
{
return true;
}
auto uThreadCookie = AuBitsToHigher(uFenceMagic);
if (!uThreadCookie)
{
return false;
}
if (auto pState = this->GetThreadHandle(optWorkerId.value_or(GetCurrentWorkerPId())))
{
return uThreadCookie != pState->uShutdownFence;
}
else
{
return false;
}
}
// internal api
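// Spawn(workerId, create): create == false spins up a brand new OS thread that runs
// Entrypoint(); create == true (the Create() path) instead adopts the *calling*
// thread as the worker, wrapping AuThreads::GetThread() in a non-owning shared
// pointer and relying on a last-hope TLS hook to run ThisExiting() when the adopted
// thread dies.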
bool ThreadPool::Spawn(WorkerId_t workerId, bool create)
{
AU_LOCK_GUARD(rwlock_->AsWritable());
if (create)
{
gCurrentPool = AuSharedFromThis();
}
AuSPtr<GroupState> pGroup;
// Try fetch or allocate group
{
if (!(pGroup = threadGroups_[workerId.first]))
{
pGroup = AuMakeShared<GroupState>();
if (!pGroup->Init())
{
SysPushErrorMemory("Not enough memory to intiialize a new group state");
return false;
}
this->threadGroups_[workerId.first] = pGroup;
}
}
// Assert worker does not already exist
{
AuSPtr<ThreadState>* ret;
if (AuTryFind(pGroup->workers, workerId.second, ret))
{
SysPushErrorGeneric("Thread ID already exists");
return false;
}
}
auto threadState = AuMakeShared<ThreadState>();
if (!threadState)
{
SysPushErrorMemory();
return {};
}
threadState->parent = pGroup;
threadState->id = workerId;
threadState->asyncLoop = AuMakeShared<AsyncLoop>();
if (!threadState->asyncLoop)
{
SysPushErrorMemory();
return {};
}
threadState->eventLs = AuLoop::NewLSAsync();
if (!threadState->eventLs)
{
SysPushErrorMemory();
return {};
}
threadState->asyncLoopSourceShared = threadState->eventLs;
threadState->asyncLoop->pParent = threadState.get();
threadState->rateLimiter.SetNextStep(1'000'000); // 1MS in nanoseconds
threadState->runMode = ERunMode::eEfficient;
if (!threadState->asyncLoop->Init())
{
SysPushErrorNested();
return {};
}
threadState->asyncLoop->SourceAdd(threadState->eventLs);
if (!create)
{
threadState->threadObject = AuThreads::ThreadShared(AuThreads::ThreadInfo(
AuMakeShared<AuThreads::IThreadVectorsFunctional>(AuThreads::IThreadVectorsFunctional::OnEntry_t(std::bind(&ThreadPool::Entrypoint, this, threadState->id)),
AuThreads::IThreadVectorsFunctional::OnExit_t{}),
gRuntimeConfig.async.threadPoolDefaultStackSize
));
if (!threadState->threadObject)
{
return {};
}
threadState->threadObject->Run();
}
else
{
threadState->threadObject = AuSPtr<AuThreads::IAuroraThread>(AuThreads::GetThread(), [](AuThreads::IAuroraThread *){});
// TODO: this is just a hack
// we should implement this properly
threadState->threadObject->AddLastHopeTlsHook(AuMakeShared<AuThreads::IThreadFeatureFunctional>([]() -> void
{
}, []() -> void
{
auto pid = GetCurrentWorkerPId();
if (pid.pool)
{
GetWorkerInternal(pid.pool)->ThisExiting();
}
}));
//
gCurrentPool = AuWeakFromThis();
tlsWorkerId = WorkerPId_t(AuSharedFromThis(), workerId);
}
pGroup->AddWorker(workerId.second, threadState);
return true;
}
// private api
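// Barrier posts a job to the target worker that releases the caller's syncSema (and,
// when requireSignal is set, parks the target on its `running` event until Signal()),
// then waits on the semaphore via WaitFor so the caller can keep pumping its own
// queue. `drop` flips the target into rejecting mode, which the worker exit path in
// Entrypoint uses to fence off further work.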
AU_NOINLINE bool ThreadPool::Barrier(WorkerId_t workerId, AuUInt32 ms, bool requireSignal, bool drop)
{
auto self = GetThreadState();
if (!self)
{
return {};
}
auto &semaphore = self->syncSema;
auto unsafeSemaphore = semaphore.AsPointer();
bool failed {};
auto work = AuMakeShared<AsyncFuncRunnable>(
[=]()
{
auto state = GetThreadState();
if (drop)
{
state->rejecting = true;
}
if (requireSignal)
{
state->running->Reset();
}
unsafeSemaphore->Unlock(1);
if (requireSignal)
{
state->running->Lock();
}
},
[&]()
{
unsafeSemaphore->Unlock(1);
failed = true;
}
);
if (!work)
{
return false;
}
Run(workerId, work);
return WaitFor(workerId, AuUnsafeRaiiToShared(semaphore.AsPointer()), ms) && !failed;
}
void ThreadPool::Entrypoint(WorkerId_t id)
{
gCurrentPool = AuWeakFromThis();
tlsWorkerId = WorkerPId_t(AuSharedFromThis(), id);
auto job = GetThreadState();
Run();
if (id != WorkerId_t {0, 0})
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
if (!this->shuttingdown_ && !job->rejecting)
{
// Pump and barrier + reject all after atomically
Barrier(id, 0, false, true);
}
}
ThisExiting();
if (id == WorkerId_t {0, 0})
{
CleanWorkerPoolReservedZeroFree();
}
}
void ThreadPool::EarlyExitTick()
{
auto jobWorker = GetThreadState();
if (!jobWorker)
{
SysPushErrorUninitialized("Not an async thread");
return;
}
auto state = jobWorker->parent.lock();
if ((this->shuttingdown_ & 2) != 2)
{
return;
}
state->BroadCast();
{
if (AuExchange(jobWorker->bAlreadyDoingExitTick, true))
{
return;
}
AuUInt32 uCount {};
do
{
uCount = 0;
this->PollInternal(jobWorker, false, uCount);
}
while (uCount);
}
AuList<AuSPtr<AuThreads::IThreadFeature>> features;
{
AU_LOCK_GUARD(jobWorker->featuresMutex);
features = AuExchange(jobWorker->features, {});
}
{
for (const auto &thread : features)
{
try
{
thread->Cleanup();
}
catch (...)
{
SysPushErrorCatch("Couldn't clean up thread feature!");
}
}
jobWorker->isDeadEvent->Set();
jobWorker->bAlreadyDoingExitTick = false;
jobWorker->bBreakEarly = true;
}
}
void ThreadPool::ThisExiting()
{
auto id = GetCurrentThread();
auto state = GetGroup(id.first);
auto pLocalState = state->GetThreadByIndex(id.second);
AuList<AuSPtr<AuThreads::IThreadFeature>> features;
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
pLocalState->isDeadEvent->Set();
CleanUpWorker(id);
TerminateSceduledTasks(this, id);
pLocalState->syncSema->Unlock(10); // prevent ::Barrier dead-locks
{
AU_LOCK_GUARD(pLocalState->externalFencesLock);
pLocalState->exitingflag2 = true;
for (const auto &pIWaitable : pLocalState->externalFences)
{
pIWaitable->Unlock();
}
pLocalState->externalFences.clear();
}
{
AU_LOCK_GUARD(pLocalState->featuresMutex);
features = AuExchange(pLocalState->features, {});
}
}
{
for (const auto &thread : features)
{
try
{
thread->Cleanup();
}
catch (...)
{
SysPushErrorConcurrentRejected("Couldn't clean up thread feature!");
}
}
features.clear();
}
{
AU_LOCK_GUARD(this->rwlock_->AsWritable());
state->Decommit(id.second);
}
}
AuSPtr<GroupState> ThreadPool::GetGroup(ThreadGroup_t type)
{
return this->threadGroups_[type];
}
AuSPtr<ThreadState> ThreadPool::GetThreadState()
{
auto thread = gCurrentPool.lock();
if (!thread)
{
return {};
}
#if defined(AU_CFG_ID_INTERNAL) || defined(AU_CFG_ID_DEBUG)
if (thread.get() != this)
{
SysPushErrorGeneric("wrong thread");
return {};
}
#endif
auto worker = *tlsWorkerId;
auto state = GetGroup(worker.first);
if (!state)
{
return {};
}
return state->GetThreadByIndex(worker.second);
}
AuSPtr<ThreadState> ThreadPool::GetThreadStateNoWarn()
{
auto thread = gCurrentPool.lock();
if (!thread)
{
return {};
}
if (thread.get() != this)
{
return {};
}
auto worker = *tlsWorkerId;
auto state = GetGroup(worker.first);
if (!state)
{
return {};
}
return state->GetThreadByIndex(worker.second);
}
AuSPtr<ThreadState> ThreadPool::GetThreadHandle(WorkerId_t id)
{
auto group = GetGroup(id.first);
if (!group)
{
return {};
}
return group->GetThreadByIndex(id.second);
}
AuList<AuSPtr<ThreadState>> ThreadPool::GetThreadHandles(WorkerId_t id)
{
AU_LOCK_GUARD(this->rwlock_->AsReadable());
auto group = GetGroup(id.first);
if (!group)
{
return {};
}
AuList<AuSPtr<ThreadState>> ret;
if (id.second != Async::kThreadIdAny)
{
if (auto pPtr = group->GetThreadByIndex(id.second))
{
ret.push_back(pPtr);
}
}
else
{
for (const auto &[key, value] : group->workers)
{
ret.push_back(value);
}
}
return ret;
}
AUKN_SYM AuSPtr<IThreadPool> NewThreadPool()
{
// apps that don't require async shouldn't be burdened with the overhead of this little spinner
StartSched();
return AuMakeShared<ThreadPool>();
}
struct KeepGroupAlive
{
KeepGroupAlive(AuSPtr<AuAsync::IThreadPool> pPool) : pPool(AuStaticCast<AuAsync::ThreadPool>(pPool))
{
AuAtomicAdd(&this->pPool->uAtomicCounter, 1u);
}
~KeepGroupAlive()
{
auto uNow = AuAtomicSub(&this->pPool->uAtomicCounter, 1u);
if (uNow == 0)
{
for (const auto &pState : this->pPool->threadGroups_)
{
if (pState)
{
pState->BroadCast();
}
}
}
}
AuSPtr<AuAsync::ThreadPool> pPool;
};
AUKN_SYM AuSPtr<void> KeepThreadPoolAlive(AuSPtr<AuAsync::IThreadPool> pPool)
{
return AuMakeSharedThrow<KeepGroupAlive>(pPool);
}
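// KeepThreadPoolAlive hands out an RAII token that bumps the pool's pending-work
// counter, preventing the runner-mode auto-shutdown in PollInternal from firing
// while the token lives; dropping the last token broadcasts the groups so sleeping
// workers re-evaluate the depletion check. Hypothetical usage:
//     auto pKeepAlive = AuAsync::KeepThreadPoolAlive(pPool);
//     /* ...queue deferred work; the pool won't self-shutdown while pKeepAlive lives... */
//     pKeepAlive.reset();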
}