AuroraRuntime/Include/Aurora/RuntimeConfig.hpp

/***
    Copyright (C) 2021-2023 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: RuntimeConfig.hpp
    Date: 2023-12-1
    Date: 2021-6-9
    Author: Reece
***/
#pragma once

namespace Aurora
{
    /// File-logging defaults: an enable flag, the directory logger limits, and
    /// the choice between the user directory and the cwd for log output.
    struct LocalLogInfo
    {
        /// Logging is enabled by default.
        bool bEnableLogging = true;

        /// Positional initializers; the trailing labels are the ones the original author gave.
        /// NOTE(review): the delete threshold is labelled "InMiB" but is initialised with
        /// 128 * 1024 * 1024, which reads like a byte count - confirm the intended unit.
        AuLog::DirectoryLogger defaultFileLogger
        {
            16,                 // uMaxLogsOrZeroBeforeCompress
            0,                  // uMaxLogsOrZeroBeforeDelete
            0,                  // uMaxCumulativeFileSizeInMiBOrZeroBeforeCompress
            128 * 1024 * 1024,  // uMaxCumulativeFileSizeInMiBOrZeroBeforeDelete
            0,                  // (unlabelled in the original)
            0                   // (unlabelled in the original)
        };

        /// true: use the user directory. false: use the cwd.
        /// Only builds that define AU_CFG_ID_SHIP default to the user directory.
    #if defined(AU_CFG_ID_SHIP)
        bool bWriteLogsToUserDir = true;
    #else
        bool bWriteLogsToUserDir = false;
    #endif
    };

    /// Telemetry configuration document; holds a single `enabled` flag that is false by default.
    struct TelemetryConfigDoc
    {
        bool enabled = false;
    };
    
    /// Telemetry start-up options.
    struct TelemetryConfig
    {
        /// False by default.
        /// NOTE(review): going by the name, presumably toggles reading a per-module JSON config - confirm against the runtime.
        bool readModNameJsonConfig = false;

        /// Default-constructed TelemetryConfigDoc (its `enabled` flag starts out false).
        TelemetryConfigDoc defaultConfig;
    };

    /// Console, stdio passthrough, and console-logger start-up options.
    struct ConsoleConfig
    {
        /// Turns on the Aurora::Console::xxxStd functions.
        /// For the default logger behaviour, defer to the enableStdXX flags.
        bool enableStdPassthrough = false;

        /// Turns on the standard, debug, and GUI consoles.
        bool enableConsole = true;

        /// Try to force a terminal emulator host when under a graphical subsystem.
        bool forceConsoleWindow = false;

        /// Try to force a GUI console when under a command line target.
        bool forceToolKitWindow = false;

        /// With enableStdPassthrough: Aurora::Console::ReadStd reads a binary stream.
        /// With !enableStdPassthrough: enables stdin cmd processing; otherwise stdin input is disabled.
        bool enableStdIn = true;

        /// With enableStdPassthrough: lets Aurora::Console::WriteStd write binary; otherwise enables the console logger.
        /// With !enableStdPassthrough: enables stdout logging.
        bool enableStdOut = true;

        /// Prefer WxWidgets whenever possible.
        bool enableWxWidgets = true;

        /// Hands stdout writes over to loops; recommended for servers.
        bool asyncWrite = true;

        /// Asynchronous debug log.
        bool asyncVSLog = false;

        /// Chooses between the full date and a mere HH MM SS prefix on stdout
        /// (going by the name, true presumably selects the short form - confirm).
        bool bStdOutShortTime = false;

        /// No description in the original.
        /// NOTE(review): presumably selects local time for stdout timestamps - confirm.
        bool bStdOutUseLocalTime = true;

        /// FIO config
        LocalLogInfo fio;

        AuString titleBrand      = "Aurora SDK Sample";

        AuString supportPublic   {"https://git.reece.sx/AuroraSupport/AuroraRuntime/issues"};
        AuString supportInternal {"https://jira.reece.sx"};

        bool consoleTTYHasPerAppHistory = true;
    };

    /// Logger start-up options; only the file (FIO) section exists so far.
    struct LoggerConfig
    {
        /// FIO (file) configuration.
        LocalLogInfo fileConfiguration;

        // Socket configuration: to be determined.
    };
    
    /// Cryptography start-up options.
    struct CryptoConfig
    {
        /// False by default.
        /// NOTE(review): going by the name, presumably prefers the system certificate store over the builtin one - confirm.
        bool    bPreferSystemCertStoreOverBuiltin { false };

        /// Reserved space. Value-initialized so that a default-initialized CryptoConfig
        /// never carries indeterminate values in these bytes; size and layout are unchanged.
        bool    bReserved[31] {};
    };
    
    /// AuAsync scheduler and thread pool start-up options.
    struct AsyncConfig
    {
        /// true: spawn the scheduler thread during the runtime initialization process.
        /// false: delegate the spawn until the very last minute.
        bool bStartSchedularOnStartup = true;

        /// Turn this on to let an async-app/singleton-threadpool SysPump tick on thread worker-id zero.
        /// Alternatively, use SetMainThreadForSysPumpScheduling once you have a thread pool and worker id.
        bool bEnableLegacyTicks = false;

        /// No description in the original; zero by default.
        AuUInt32 threadPoolDefaultStackSize { };

        /// No description in the original; defaults to AuMSToNS(2) (presumably 2 milliseconds in nanoseconds).
        AuUInt32 dwSchedulerRateLimitNS { AuMSToNS<AuUInt64>(2) };

        /// Nowadays this is used to dispatch AuConsole commands to a mainthread with AuAsync.
        AuUInt32 dwLegacyMainThreadSystemTickMS { 60 };

        /// Enables/disables co_routine support, in that the runtime can work with nested
        /// IWorkItem::BlockUntilComplete()'s and IThreadPool::[Run/Poll/RunOnce/etc]()'s.
        bool bEnableCpp20RecursiveCallstack = true;
    };

    /// File I/O start-up options.
    struct FIOConfig
    {
        AuOptional<AuString> optDefaultBrand = "Aurora SDK Sample";

        bool bForceOverlappedUtilsToDelegatedThreadPool = false;

        bool bIsIntranetTrusted = false;

        /// Note: this does not relate to the overlapped aio apis.
        /// These threads are only spawned as a fallback for the AuFS::Overlapped*** apis.
        AuUInt32 uOverlappedUtilsThreadPoolSize { 2 };
    };

    /// Debugging, exception handling, and emergency-memory start-up options.
    struct DebugConfig
    {
        /// Precache/initialize symbols for printable stack traces under binaries that are not
        /// intended for shipping to consumers.
        ///
        /// @warning true results in an artificially high commit charge under certain monitoring
        ///          applications, because application databases are precached into memory. These maps
        ///          shouldn't be duplicated across processes (if the kernel plays nice); regardless,
        ///          end users shouldn't see the hit. In the best case this is a quality of life upgrade
        ///          for stage/debug binaries that want real time stack traces with file locations cached.
        ///          Note that this isn't optional for stage win32 builds with live exception traces with
        ///          line info. In the future, debug tools should be able to parse telemetry dumps, though.
        bool bNonshipPrecachesSymbols = true;

        /// Activates the internal AddVectoredExceptionHandler handler.
        /// Might conflict with DRM and other debugging utilities.
        bool bEnableWin32RootExceptionHandler = true;

        /// (No description in the original.)
        bool bEnableInjectedExceptionHandler = true;

        /// Raises a SysPanic in place of returning null, over all allocators. This includes C++ containers,
        /// which *should* be configured to use our overloaded allocators (compile ./Source/Alloc.cpp and
        /// link it in with your application/library).
        /// If you don't like the idea of raising out of memory exceptions, and therefore compile without
        /// exceptions, this is an alternative that allows exceptions to remain enabled for truly exceptional
        /// behaviour: unrecoverable states where the best remaining option is to see whether the root tick
        /// can be restarted, at the risk of unaccounted for behaviour, because crashing over an STL parse
        /// exception or something in that ballpark would be a really terrible **[end]user**-experience.
        bool bIsMemoryErrorFatal = false;

        /// (No description in the original.)
        bool bIsExceptionThrowFatal = false;

        bool bSaveAllExceptionsAsMinidumpInBg = false;

        bool bRaiseDebuggerToAllExceptionsInStageAndDbg = true;

        bool bPrintExceptionStackTracesOut = true;

        /// Enable me to enable padding from system out of memory conditions.
        /// The catch: usually this memory is reserved for exit callbacks, internal low memory conditions,
        /// error reporting, and the like.
        ///
        /// Generally you should not exploit this without ** acknowledging this thread-local condition via
        /// AuDebug::[Add/Dec]MemoryCrunch ** (tl;dr: that is the recommended way of accessing this memory).
        ///
        /// Setting this flag enables debug buffer memory to be used at any point during any thread's
        /// execution - the moment mimalloc runs out of pre-reserved and system mappable memory. The original
        /// author wouldn't use this for anything except monolithic client/user-facing applications that are
        /// likely to run on low resource systems (low spec or heavy app), with untested/uncaught C++
        /// allocations scattered everywhere. This could be VERY useful to end users who are running into
        /// `bIsMemoryErrorFatal` crashes.
        bool bIsApplicationClientSoftwareOnJitteryMemorySystem = false;

        /// Size of the debug memory reserve.
        ///
        /// Rationale from the original author: nowadays a single v8 isolate is low sub-tens of MB of memory,
        /// executable file sizes are low MBs, and image sizes are much larger; forget the small low-footprint
        /// vms of the flex and bison yesteryear. 3MB, given our heavyish standard footprint already, is
        /// probably fine. This allows for high tens of KBs of malicious strings, overhead for doing
        /// telemetry/(structured ???)logging, overhead for stack traces, the module cache, etc. It should
        /// also be enough to give back to the user, if need be.
        ///
        /// @warning 0 = 3 * 1024 * 1024
        AuUInt32 uDebugMemoryReserveSize { 3 * 1024 * 1024 };

        /// Just to not be annoying in the future.
        bool bWin32VerifyTrustFailMissingAPI = true;
    };

    // Check out some of the members towards the top.
    // Do not adjust the bit-field entries towards the bottom.
    // Start-up tunables for the threading primitives: a few ordinary members first,
    // followed by AuUInt64 bit-fields whose declared widths add up to 63 bits.
    // Bit-field defaults are passed through AU_BIT_FIELD_INIT_AFTER_20
    // (presumably only effective when built as C++20 - confirm against the macro definition).
    struct ThreadingConfig
    {
        // WARN: these values are not final
                                                                                                 // Resets everything assuming we don't have default initialization (c++14) or we cannot bit default initialize (c++20).
                                                                                                 // This is a struct local clear bit for on init.
                                                                                                 // NOTE(review): the macro tested below is spelled AU_LANG_CPP_20_ (with a trailing underscore) - confirm that this is intended.
    #if defined(AU_LANG_CPP_20_)
        bool     bResetToRuntimeDefaults                 { false };
    #else
        bool     bResetToRuntimeDefaults                 { true };
    #endif
        bool     bNoThreadNames                          { false };
        bool     bPlatformIsSMPProcessorOptimized        { true };                               // Whether to attempt using mm_pause or a similar instruction before yielding into the kernel
        AuUInt16 uSpinLoopPowerA                         { 128 };                                // Nudgable spinloop power. This is our local userland niceness factor
                                                                                                 // This is comparable to Win32's SetCriticalSectionSpinCount applied across every single AuThreadPrimitives try-lock and lock.
                                                                                                 // Adjust this value to compensate for longer critical sections when context switching isn't preferable.
                                                                                                 // Using 128 as a default (bouncing around 64 and 512)
                                                                                                 // Reference points: Facebook used to say half this (cant find src); I used to say about 82 to 512; Windows 7's implementation of CRITICAL_SECTION and SRWLOCK says double that (256); aggressive (and now incorrect) spin mutex examples I've seen use around 2k or less; some intel reference material uses 64 as a demo max spin value; not so aggressive pause loops I've seen use 32-128-ish pauses (also incorrect); people repeating Win9x documentation and SetCriticalSectionSpinCount's example value think you need >= 4k.
                                                                                                 // Personally, I've seen this tested on 5-12th gen intel, Windows 7 through 11, Linux, and various other configurations.
                                                                                                 // Personally, I've seen this run Qt with less CPU resources than every other Qt process on Win7. I've seen this run JavaScript programs dead last on the taskmanagers detail panel, on both 10 and 7.
                                                                                                 // 128 to 512 is fine. on the upper end you, the developer, need to start asserting you are a real time application aware of your hardware requirements / have properly matched task affinity / etc, and don't mind shredding old processor power efficiency while chewing thru nop cycles
                                                                                                 // <<<<<<<<<<<<<<< (QA:)   Each application will probably need its own nudge value
        AuUInt64 bEnableAggressiveScheduling          : 1  AU_BIT_FIELD_INIT_AFTER_20( false );  // <<<<<<<<<<<<<<< (SHIP:) ENABLE ME FOR AROUND 1MS OR LESS SCHED RESOLUTION
        AuUInt64 bEnableAgrSchedulingRatelimit        : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bPreferNt51XpMutexesOver8            : 1  AU_BIT_FIELD_INIT_AFTER_20( false );  // under modern versions of windows, do not use keyedevents. use the native waitonaddress internals, then waitonaddress proper; and dont touch keyedevents paths.
        AuUInt64 bPreferNt51XpCondvarsOver8           : 1  AU_BIT_FIELD_INIT_AFTER_20( false );  // under modern versions of windows, do not use keyedevents. use the native waitonaddress internals, then waitonaddress proper; and dont touch keyedevents paths.
        AuUInt64 bPreferNtCondvarModernWinSpin        : 1  AU_BIT_FIELD_INIT_AFTER_20( false );  // very modern cpus have monitor / tpause / etc intrins. sometimes like us, microsoft will use them in userspace under waitonaddress of very modern windows builds. i wouldn't rely on that. we implement spinning ourselves for linux + old win32 for 2 decades worth of processors.
        AuUInt64 bPreferNtCondvarOlderWinSpin         : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );  // windows 7 and lower sees better CPU + power draw when we implement spinning ourselves on top of the dreaded bidirectionally blocking keyedevents. besides, msft refused to backport userland monitor (very modern chipsets) to old versions of 10 and 7.
        AuUInt64 bPreferNtSemaphoreSpinTryLock        : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bPreferNtMutexSpinTryLock            : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bPreferNtCondMutexSpinTryLock        : 1  AU_BIT_FIELD_INIT_AFTER_20( false );
        AuUInt64 bPreferLinuxSemaphoreSpinTryLock     : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bPreferLinuxMutexSpinTryLock         : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bPreferLinuxCondMutexSpinTryLock     : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
    #if 0
        AuUInt64 bPreferEmulatedWakeOnAddress         : 1  AU_BIT_FIELD_INIT_AFTER_20( false );
    #else
        AuUInt64 bPreferEmulatedWakeOnAddress         : 1  AU_BIT_FIELD_INIT_AFTER_20( !AuBuild::kIsNtDerived ); // ...,everybody else requires us to hit the kernel
    #endif
        AuUInt64 bPreferWaitOnAddressAlwaysSpin       : 1  AU_BIT_FIELD_INIT_AFTER_20( false  );                 // ..., if emulated! if double-spinning under higher level locks, disable me.
        AuUInt64 bPreferWaitOnAddressAlwaysSpinNative : 1  AU_BIT_FIELD_INIT_AFTER_20( !AuBuild::kIsNtDerived ); // ..., if not emulated! noting that most kernels and user-schedulers will spin for you. nt users can expect ntdll to spin / pause / monitor / etc, under * modern * win32 versions.
        AuUInt64 bPreferRWLockReadLockSpin            : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bUWPNanosecondEmulationCheckFirst    : 1  AU_BIT_FIELD_INIT_AFTER_20( false );
        AuUInt64 uUWPNanosecondEmulationMaxYields     : 7  AU_BIT_FIELD_INIT_AFTER_20( 12    ); 
        AuUInt64 bForceEnableAdaptiveSpin             : 1  AU_BIT_FIELD_INIT_AFTER_20( false ); // ||
        AuUInt64 bPreferEnableAdaptiveSpin            : 1  AU_BIT_FIELD_INIT_AFTER_20( true  ); //   .
        AuUInt64 bPreferLinuxAdaptiveSpin             : 1  AU_BIT_FIELD_INIT_AFTER_20( true  ); //    (&&)
        AuUInt64 bPreferOldWin32AdaptiveSpin          : 1  AU_BIT_FIELD_INIT_AFTER_20( false ); //    (&&)
        AuUInt64 bPreferNewWin32AdaptiveSpin          : 1  AU_BIT_FIELD_INIT_AFTER_20( true  ); //    (&&)               
        AuUInt64 uAdaptiveSpinCUCnt0                  : 4  AU_BIT_FIELD_INIT_AFTER_20( 0     );                  // boring thread topology assumptions
        AuUInt64 uAdaptiveSpinCUCnt4                  : 4  AU_BIT_FIELD_INIT_AFTER_20( 2     );                  // boring thread topology assumptions
        AuUInt64 uAdaptiveSpinCUCnt8                  : 4  AU_BIT_FIELD_INIT_AFTER_20( 2     );                  // boring thread topology assumptions
        AuUInt64 uAdaptiveSpinCUCnt16                 : 4  AU_BIT_FIELD_INIT_AFTER_20( 4     );
        AuUInt64 bPreferFutexRWLock                   : 1  AU_BIT_FIELD_INIT_AFTER_20( true  ); // Win10+ and Linux should use futexes inside the AuRWLock primitive, vs other dumber primitives built on similar futex abstraction, both that'll perform about the same regardless. 
                                                                                                // Once taking into account other platform specific member overhead, making this compile time isn't worth it in memory and in the CPU-overhead. Enjoy the extra compat (incl WinXP, for almost free).
                                                                                                // Considering we beat pthreads, 3 STLs, Win32 primitives in API functionality and in legacy XP compat, we're * hundreds * of bytes less than a bad STL (incl llvm and msvc), I think our RWLock is fine. 
                                                                                                // Making it any smaller would require a different API, different tooling assumptions, and different CPU branching overhead assumptions.
                                                                                                // Even the CPU branching implications of a *portable, potentially-relinkable, potentially-asm* thread id check destroys the excuse for a smaller Aurora::Threading::Waitables futex reimplementation.
        AuUInt64 bWinXpThrough7BlazeOptimizerPower    : 12 AU_BIT_FIELD_INIT_AFTER_20( 300   ); // dont worry about it. we dont care about old portables. lets try to make older win32 targets tweak the scheduling in our favor a bit.
        AuUInt64 bPreferLinuxPrimitivesFutexNoSpin    : 1  AU_BIT_FIELD_INIT_AFTER_20( false );
        AuUInt64 bPreferUnixPrimitivesNoSpin          : 1  AU_BIT_FIELD_INIT_AFTER_20( false );
        AuUInt64 bAlwaysRWLockWriteBiasOnReadLock     : 1  AU_BIT_FIELD_INIT_AFTER_20( false );
        AuUInt64 bEnableRWLockWriteBiasOnReadLock     : 1  AU_BIT_FIELD_INIT_AFTER_20( true  );
        AuUInt64 bPreferFutexEvent                    : 1  AU_BIT_FIELD_INIT_AFTER_20( true  ); // Win10+ and Linux should use a futex inside the AuEvent / AuThreadPrimitive event as the hybrid binary-semaphore/cross/event's signal flag.
    };

    /// Empty placeholder type; RuntimeStartInfo uses it for its trailing `padding` member.
    struct DummyConfig {};

    /// Linux-specific start-up options.
    struct LinuxConfig
    {
        bool bFIODisableBatching = false;

        bool bIOLinuxHasProcIpcPerms = false;

        /// Written as offsets from 64, i.e. 61 and 60 respectively.
        /// NOTE(review): presumably signal numbers, going by the names - confirm.
        AuUInt8 uSignalTerminate            { 64 - 3 };
        AuUInt8 uSignalGAIOWorkerThreadDone { 64 - 4 };
    };

    /// Process/module preload start-up options.
    /// Defaults: preloading is enabled, but neither forced nor applied to the entire class path.
    struct ProcessConfig
    {
        bool bAlwaysPreloadEntireClassPath = false;
        bool bForcePreload = false;
        bool bEnablePreload = true;
    };

    /// IO subsystem start-up options: buffer sizing, timer primitive selection, and wait-all behaviour.
    struct IOConfig
    {
        AuUInt32 uProtocolStackDefaultBufferSize { 64 * 1024 };

        /// Defaults to true only when AURORA_IS_SERVER is defined.
    #if defined(AURORA_IS_SERVER)
        bool bIsVeryLargeIOApplication = true;
    #else
        bool bIsVeryLargeIOApplication = false;
    #endif

        /// On Win32, NewLSTimer has very bad resolution. On Linux, timerfd is good enough.
        /// On other POSIX platforms, it's best to keep timers emulated in process.
        /// By default, you have to opt into a higher resolution in-process, if required by the platform.
        /// This does not bypass IO yielding or timeouts; we simply use our own semaphore and scheduler
        /// instead of the kernel's. Waking a thread only takes 8k ns to 60k ns depending on the platform,
        /// and we can hit the AuAsync scheduler without too much error, so we can do this instead of
        /// relying on historically poor IO primitives.
        /// Assigning this to true, on any platform, will bypass the kernel's timer ticker.
        bool bForceAltOSTimerPrimitives = false;

        /// If this is assigned false and we're on a platform with good IO timers, high resolution
        /// timers will be emulated in process.
        /// For niche POSIX targets and Windows, this value will not be respected.
        /// For Linux, this value can be used to experience/test/benchmark Win32-like in-process scheduling.
        bool bTrustOSTimerPrimitiveIfKnownGood = true;

        /// (No description in the original.)
        bool bUseHighResTimerUnderIOWaitableTimer = false;

        /// (No description in the original.)
        bool bAPCUseCoroutineStack = true;

        /// Decreases syscall and global lock contention if false.
        /// Enable me if you're emulating Win32 or NT as a platform.
        /// This helps mitigate waitables returning false during multiple AND-mode acquisition.
        bool bAimCloserForNTParityWaitALL = false;

        bool bUseOldIOWaitAllAlg = false;

        bool bUseSelectWaitAllStrat = true;

        /// Enable me if you're emulating Win32 or NT as a platform; leave configurable.
        /// If the emulated target can run without this, which is very very likely, this will save
        /// a great many syscalls.
        /// Affected platforms: NT-likes only. Win32-emu-on-POSIX can use in-process primitives all they like.
        bool bINeedPerfectAnds = false;
    };

    /// Win32-specific start-up options.
    struct Win32Config
    {
        /// True by default.
        /// NOTE(review): going by the name, presumably consults the Windows loader for module names first - confirm.
        bool bProcessCheckWinLdrForModNameFirst = true;
    };

    // ADHD/shipping check-list version:
    //  Go check: 
    //      ioConfig.bIsVeryLargeIOApplication          | for increased memory overhead, allows servers to open more io completion contexts. looks bad on linux client applications.
    //      fio.optDefaultBrand                         | for application branding. change this to your publishers name. used for configuration and ~home isolation.
    //      threadingConfig.bEnableAggressiveScheduling | for real time applications. required for retarded timing resolution coalescence. 0.0Xms to 0.3MS tier resolution is viable on modern PC platforms (say <250,000NS).
    //      threadingConfig.bNoThreadNames              | disable vendor libraries from specifying their thread name to an attached debugger, if not stripped from the application.
    //      threadingConfig.uSpinLoopPowerA             | increase me if the *global* context switch rate is too high. use AuThreading:: APIs if it's a per-thread issue.
    //      async.dwSchedulerRateLimitNS                | for sub 2MS AuAsync timers and Windows 7 timers. 
    //                                                  |  Real-time applications should set this to 0.
    //                                                  |  Interactive applications should lower this to 500'000 nanoseconds (.5MS) to 1'000'000ns (1MS).
    //                                                  |  GUI applications should keep this value high to prevent high idle CPU usage. 
    /// Aggregate of every start-up configuration section.
    /// Every section after `console` is wrapped in AuAlignTo<N, T>
    /// (presumably an alignment/padding wrapper keyed on N - confirm against its definition).
    /// Member order is part of the layout and must be preserved.
    struct RuntimeStartInfo
    {
        ConsoleConfig                     console;
        AuAlignTo<512, CryptoConfig>      crypto;
        AuAlignTo<512, TelemetryConfig>   telemetry;
        AuAlignTo<32,  AsyncConfig>       async;
        AuAlignTo<32,  FIOConfig>         fio;
        AuAlignTo<64,  DebugConfig>       debug;
        AuAlignTo<32,  ThreadingConfig>   threadingConfig;
        AuAlignTo<32,  LinuxConfig>       linuxConfig;
        AuAlignTo<32,  Win32Config>       win32Config;
        AuAlignTo<32,  ProcessConfig>     processConfig;
        AuAlignTo<32,  IOConfig>          ioConfig;

        /// Trailing empty section.
        AuAlignTo<32,  DummyConfig>       padding;
    };
}