/***
    Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: AuWaitFor.cpp
    Date: 2021-6-12
    Author: Reece
***/
#include <Source/RuntimeInternal.hpp> // assumption: the runtime's internal prelude; the original include target was lost
#include "AuWaitFor.hpp"

#if defined(AURORA_IS_LINUX_DERIVED)
    #include <sched.h>
    #include <unistd.h>
    #include <time.h>
    #include <sys/time.h>
    #include <sys/resource.h>
#endif

#if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
    #include <immintrin.h> // _mm_pause
#endif

// Read the local header file for this file.
// The original idea was sane.
// The implementation, not so much...
// TODO: REWRITE!
namespace Aurora::Threading
{
    static void YieldToSharedCore(long spin)
    {
        int loops = (1 << spin);
        while (loops > 0)
        {
        #if (defined(AURORA_ARCH_X64) || defined(AURORA_ARCH_X86))
            _mm_pause();
        #endif
            loops -= 1;
        }
    }

    void YieldToOtherThread()
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        SwitchToThread();
    #elif defined(AURORA_IS_LINUX_DERIVED)
        sched_yield();
    #else
        YieldToSharedCore(12);
    #endif
    }

    template <AuUInt32 Flags> // forcefully optimize by templating a constant argument
    static inline void _FastSnooze(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS, int &alpha, int &bravo, bool &forceSpin)
    {
        // TODO: rewrite me
        AuUInt64 now = Time::SteadyClockNS();

        // Begin least likely checks, we're getting on now
        // Ironically we need to burn off some CPU cycles
        AuUInt64 deltaNS = now - startTime;

        #define SHOULD_SWITCH_ASAP(yieldDelayThresholdNs, roundTripNs)\
            (static_cast<AuUInt32>(Flags) & kYieldFlagsContextSwitchASAP)

        // Validate we have at least one whole average of a context switch of overhead remaining
        #define HAS_ENOUGH_TIME_FOR_TIMED_SLEEP(yieldDelayThresholdNs, roundTripNs)\
            (maxStallNS >= (roundTripNs + deltaNS))

        // The point of rewriting kernel-free userland thread components is to delegate everything to userland.
        // One key reason is single-app performance. Should we not know how long to yield for, giving an undefined
        // ...amount of time to other applications might be a bad thing. fuck.
        // why can't we have rtos functionality :(
        #define HAS_ENOUGH_TIME_FOR_INFINITE_SLEEP(yieldDelayThresholdNs, roundTripNs)\
            ((static_cast<AuUInt32>(Flags) & kYieldFlagsContextSwitchForever) && (!maxStallNS))

        // Perform a good faith guess at assuming we have enough overhead for a hard context switch
        #define HAS_ENOUGH_TIME_OVERHEAD(yieldDelayThresholdNs, roundTripNs)\
            (HAS_ENOUGH_TIME_FOR_INFINITE_SLEEP(yieldDelayThresholdNs, roundTripNs) || HAS_ENOUGH_TIME_FOR_TIMED_SLEEP(yieldDelayThresholdNs, roundTripNs))

        // Validate enough time (let's say 1/3rd of the approximated time of a preemptive switch or sleep(0)) has passed
        #define HAS_ENOUGH_TIME_PASSED(yieldDelayThresholdNs, roundTripNs)\
            (deltaNS > yieldDelayThresholdNs)

        #define SHOULD_CTXSWAP(yieldDelayThresholdNs, roundTripNs)\
            if (SHOULD_SWITCH_ASAP(yieldDelayThresholdNs, roundTripNs) || (HAS_ENOUGH_TIME_PASSED(yieldDelayThresholdNs, roundTripNs) && HAS_ENOUGH_TIME_OVERHEAD(yieldDelayThresholdNs, roundTripNs)))

    #if defined(AURORA_IS_LINUX_DERIVED)
        SHOULD_CTXSWAP(kLinuxYieldTimeThresNano, kPredictedLinuxKernelTimeRTNano)
        {
            // we are not very nice :D
            setpriority(PRIO_PROCESS, 0, bravo);

            timespec fuck = {0, kLinuxYieldTimeNano}; // local, not static: nanosleep may write a remainder; never feed it back in as the next request
            nanosleep(&fuck, nullptr);

            setpriority(PRIO_PROCESS, 0, alpha);
            forceSpin = true;
            return;
        }
    #endif

    #if defined(AURORA_PLATFORM_WIN32)
        SHOULD_CTXSWAP(kPredictedNTOSSwitchTimeYDNS, kPredictedNTOSSwitchTimeRTNS)
        {
            // TODO:
            ::Sleep(1);
            return;
        }
    #endif

        // Always at least try to burn some cycles off in a spinlock-esque time waster
        YieldToOtherThread();
    }
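    // The macro ladder above reduces to a three-tier back-off decision: switch
    // immediately when the ASAP flag is set; otherwise context switch only once
    // enough time has burned AND a whole switch round-trip still fits in the
    // budget; failing that, stay on-core. A minimal sketch of the same ladder
    // follows, with illustrative constants standing in for the tuned
    // kPredicted*/kLinuxYieldTime* values (the 10us/3us figures are assumptions,
    // not measurements):
    [[maybe_unused]] static void ExampleBackoffStep(AuUInt64 startTimeNs, AuUInt64 maxStallNS, bool switchASAP)
    {
        const AuUInt64 exampleRoundTripNs = 10000; // assumed cost of one hard context switch
        const AuUInt64 exampleThresholdNs = 3000;  // assumed minimum busy time before switching

        AuUInt64 deltaNS = Time::SteadyClockNS() - startTimeNs;

        bool enoughPassed   = deltaNS > exampleThresholdNs;
        bool enoughOverhead = (!maxStallNS) ||                                // infinite budget
                              (maxStallNS >= (exampleRoundTripNs + deltaNS)); // a switch still fits

        if (switchASAP || (enoughPassed && enoughOverhead))
        {
            YieldToOtherThread(); // the real path hard-sleeps (nanosleep / ::Sleep) here
            return;
        }

        YieldToSharedCore(4);     // otherwise burn cycles without leaving the core
    }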
    template <AuUInt32 Flags> // forcefully optimize by templating a constant argument
    static void FastSnooze(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS)
    {
    #if defined(AURORA_IS_LINUX_DERIVED)
        int alpha = getpriority(PRIO_PROCESS, 0);
        int bravo = AuMin(19, AuMax(15, alpha + 5)); // clamp the deprioritized nice value into [15, 19]
    #else
        int alpha = 0, bravo = 0;
    #endif
        bool spin = false;
        _FastSnooze<Flags>(count, startTime, maxStallNS, alpha, bravo, spin);
    }

    // NB: the template arguments of the last two instantiations were lost; the two
    // flag enumerators referenced above are the plausible reconstruction
    template void FastSnooze<0>(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS);
    template void FastSnooze<kYieldFlagsContextSwitchASAP>(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS);
    template void FastSnooze<kYieldFlagsContextSwitchForever>(long &count, AuUInt64 &startTime, AuUInt64 maxStallNS);

    template <bool permitMultipleContextSwitches>
    static inline bool YieldPollTmpl(AuUInt64 &time, AuUInt64 deadlineNs, PollCallback_cb cb)
    {
    #if defined(AURORA_IS_LINUX_DERIVED)
        int alpha = getpriority(PRIO_PROCESS, 0);
        int bravo = AuMin(19, AuMax(15, alpha + 5));
    #else
        int alpha = 0, bravo = 0;
    #endif
        bool spin = false;
        long count = 0;

        // NB: the absolute deadline is forwarded to _FastSnooze as its relative
        // stall budget, so HAS_ENOUGH_TIME_FOR_TIMED_SLEEP almost always passes.
        // The rewrite TODO above stands.
        AuUInt64 a = Time::SteadyClockNS();
        do
        {
            if (permitMultipleContextSwitches)
            {
                // no timeout budget: let the snooze context switch indefinitely
                _FastSnooze<kYieldFlagsContextSwitchForever>(count, time, deadlineNs, alpha, bravo, spin);
            }
            else
            {
                _FastSnooze<0>(count, time, deadlineNs, alpha, bravo, spin);
            }

            if (cb())
            {
                return true;
            }

            a = Time::SteadyClockNS();
        } while ((!deadlineNs) || (deadlineNs > a));

        return cb();
    }

    AUKN_SYM bool YieldPollNs(bool permitMultipleContextSwitches, AuUInt64 timeoutNs, PollCallback_cb cb)
    {
        // NB: the poll loop compares timeoutNs against the steady clock directly,
        // so this is an absolute deadline; YieldPoll below derives it from a
        // relative millisecond timeout.
        AuUInt64 time = Time::SteadyClockNS();

        if (cb())
        {
            return true;
        }

        if (timeoutNs)
        {
            // only relevant when there's no timeout; FastSnooze will do its own magic given the template parameters
            permitMultipleContextSwitches = false;
        }

        // do not trust the compiler to branch here with a mere Func(...)
        // it's far more likely the branch will be handled in our yield loop
        if (permitMultipleContextSwitches)
        {
            return YieldPollTmpl<true>(time, timeoutNs, cb);
        }
        else
        {
            return YieldPollTmpl<false>(time, timeoutNs, cb);
        }
    }
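    // Usage sketch (illustrative, not part of the runtime): poll a condition for
    // roughly two milliseconds. Per the note above, the second argument is an
    // absolute steady-clock deadline rather than a relative budget:
    [[maybe_unused]] static bool ExamplePollForTwoMs(PollCallback_cb cb)
    {
        return YieldPollNs(false, Time::SteadyClockNS() + 2000000ull, cb);
    }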
    AUKN_SYM bool YieldPoll(bool permitMultipleContextSwitches, AuUInt64 timeoutMs, PollCallback_cb cb)
    {
        AuUInt64 time = Time::SteadyClockNS();
        AuUInt64 timeoutNs = timeoutMs ? (time + (timeoutMs * 1000000)) : 0; // relative ms -> absolute ns deadline

        if (cb())
        {
            return true;
        }

        if (timeoutMs)
        {
            // only relevant when there's no timeout; FastSnooze will do its own magic given the template parameters
            permitMultipleContextSwitches = false;
        }

        // do not trust the compiler to branch here with a mere Func(...)
        // it's far more likely the branch will be handled in our yield loop
        if (permitMultipleContextSwitches)
        {
            return YieldPollTmpl<true>(time, timeoutNs, cb);
        }
        else
        {
            return YieldPollTmpl<false>(time, timeoutNs, cb);
        }
    }

    static bool WaitLogicHandledByImplementor(bool &status, IWaitable *waitable, AuUInt64 timeout)
    {
        if (!waitable->HasLockImplementation())
        {
            return false;
        }

        status = waitable->LockMS(timeout);
        return true;
    }

    static bool WaitLogicHandledByNTOS(bool &status, IWaitable *waitable, AuUInt64 timeout)
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        AuMach handle = 0;
        if (!waitable->HasOSHandle(handle))
        {
            return false;
        }

        auto win32 = reinterpret_cast<HANDLE>(handle);
        auto ret = WaitForSingleObject(win32, timeout ? static_cast<DWORD>(timeout) : INFINITE);
        SysAssert(ret != WAIT_FAILED, "Internal Win32 Error {}", GetLastError());

        status = ret != WAIT_TIMEOUT;
        return true;
    #else
        return false;
    #endif
    }

    AUKN_SYM bool WaitFor(IWaitable *waitable, AuUInt64 timeout)
    {
        bool status;

        if (WaitLogicHandledByNTOS(status, waitable, timeout))
        {
            return status;
        }

        if (WaitLogicHandledByImplementor(status, waitable, timeout))
        {
            return status;
        }

        return YieldPoll(true, timeout, [=]()
        {
            return waitable->TryLock();
        });
    }
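    // Usage sketch (illustrative, not part of the runtime): the single-object
    // WaitFor above degrades gracefully from a kernel wait (NT handle), to the
    // waitable's own LockMS, to the userland poll loop; callers never see which
    // path was taken:
    [[maybe_unused]] static bool ExampleAcquireWithTimeout(IWaitable *waitable)
    {
        return WaitFor(waitable, 500); // block for up to 500 ms
    }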
    static bool CanWin32HandleAll(const AuList<IWaitable *> &waitables)
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        for (auto &waitable : waitables)
        {
            AuMach handle = 0;
            if (!waitable->HasOSHandle(handle))
            {
                return false;
            }
        }
        return true;
    #else
        return false;
    #endif
    }

    static bool Win32HandleMultiple(const AuList<IWaitable *> &waitables, AuUInt64 timeoutMs)
    {
    #if defined(AURORA_IS_MODERNNT_DERIVED)
        AuList<HANDLE> winWaitables;
        winWaitables.resize(waitables.size());

        std::transform(waitables.begin(), waitables.end(), winWaitables.begin(),
            [](IWaitable *waitable) -> HANDLE
            {
                AuMach handle = 0;
                auto status = waitable->HasOSHandle(handle);
                SysAssert(status, "OS Handle was NULL");
                return reinterpret_cast<HANDLE>(handle);
            });

        auto status = WaitForMultipleObjectsEx(static_cast<DWORD>(winWaitables.size()), winWaitables.data(), TRUE, timeoutMs ? static_cast<DWORD>(timeoutMs) : INFINITE, TRUE);
        SysAssert(status != WAIT_FAILED, "Internal Win32 Error {}", GetLastError());

        return status != WAIT_TIMEOUT;
    #else
        return false;
    #endif
    }

    AUKN_SYM bool WaitFor(const AuList<IWaitable *> &waitables, AuUInt64 timeout)
    {
        if (CanWin32HandleAll(waitables))
        {
            return Win32HandleMultiple(waitables, timeout);
        }

        // I'm worried about the complexity of using a vector here:
        // we would have to hit O(n) and memcpy in the best case scenario on each object release.
        // unordered maps are glorified hash tables
        // maps are glorified binary trees
        // maps should be fast enough
        AuHashMap<AuMach, bool> releasedObjects;
        releasedObjects.reserve(waitables.size()); // pseudo reserve

        for (AuMach i = 0; i < waitables.size(); i++)
        {
            releasedObjects[i] = false;
        }

        // yield for all
        auto status = YieldPoll(true, timeout, [&]()
        {
            for (AuMach i = 0; i < waitables.size(); i++)
            {
                if (!releasedObjects[i])
                {
                    if (waitables[i]->TryLock())
                    {
                        releasedObjects[i] = true;
                    }
                    else
                    {
                        return false;
                    }
                }
            }
            return true;
        });

        // from the perspective of locks, should there be a timeout event, we need to go back and unlock whatever we acquired
        if (!status)
        {
            for (AuMach i = 0; i < waitables.size(); i++)
            {
                if (releasedObjects[i])
                {
                    waitables[i]->Unlock();
                }
            }
        }

        return status;
    }

    AUKN_SYM void ContextYield()
    {
        YieldToOtherThread();
    }
}
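// Usage sketch (illustrative, not part of the runtime): an all-of wait over a
// batch of waitables with a five second budget. On timeout, the WaitFor
// overload above has already rolled back any partially acquired locks, so the
// caller observes strictly all-or-nothing acquisition. (Assumes IWaitable is
// the Aurora::Threading interface used throughout this file.)
[[maybe_unused]] static bool ExampleWaitForAll(const AuList<Aurora::Threading::IWaitable *> &objects)
{
    return Aurora::Threading::WaitFor(objects, 5000 /* ms */);
}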