AuroraRuntime/Source/Locale/Locale.cpp

/***
    Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: Locale.cpp
    Date: 2021-6-11
    Author: Reece
***/
#define I_REALLY_NEED_WIDECHAR_PUBAPI
#include <Source/RuntimeInternal.hpp>
#include "Locale.hpp"

#if !defined(AU_NO_CPPLOCALE)
    #include <locale>
    #include <codecvt>
#endif

#include <wchar.h>
#include <tuple>

namespace Aurora::Locale
{
    // Country code of the active locale; Init() and RuntimeOverloadLocality() store it upper-cased.
    static AuString gCountryCode;
    // Language code of the active locale; Init() and RuntimeOverloadLocality() store it lower-cased.
    static AuString gLanguageCode;
    // Name of the active character set as a string (for example "UTF-8" or "GB18030").
    static AuString gCodeset;
    // Enum counterpart of gCodeset; stays eEnumInvalid until a platform probe or GuessSystemECodePage() assigns it.
    static ECodePage gInternalCodePage = ECodePage::eEnumInvalid;

    // Note: [0] std::wstring_convert has been deprecated by the standard (C++17) and we do not have a replacement yet,
    //           hence the converter is left out when building as C++20 under MSVC
    //       [1] the native win32 implementation appears to be more optimized than MSVC/stl
#if !defined(AU_NO_CPPLOCALE) && !(defined(AURORA_COMPILER_MSVC) && defined(AU_LANG_CPP_20))
    static std::wstring_convert<std::codecvt_utf8<wchar_t>> gUtf8Conv;
#endif

    // Converts a null-terminated wide string into a UTF-8 encoded AuString.
    //
    // in: null-terminated wide string; a null pointer yields an empty string
    //
    // Returns the converted string, or an empty string when the conversion fails or throws
    // (in the latter case an error is pushed through SysPushErrorMem).
    AUKN_SYM AuString ConvertFromWChar(const wchar_t *in)
    {
        // wcslen on a null pointer is undefined behaviour; the catch-all below cannot intercept that
        if (!in)
        {
            return {};
        }

        try
        {
            return ConvertFromWChar(in, wcslen(in));
        }
        catch (...)
        {
            SysPushErrorMem("ConvertFromWChar failed");
            return {};
        }
    }

    // Converts `length` wide characters starting at `in` into a UTF-8 encoded AuString.
    //
    // in:     first wide character of the input; does not need to be null-terminated
    // length: number of wide characters to convert
    //
    // Returns the converted string. An empty string is returned for empty/null input, when the
    // conversion fails, when no conversion backend is compiled in, or when an exception is caught
    // (an error is pushed through SysPushErrorMem in the latter case).
    AUKN_SYM AuString ConvertFromWChar(const wchar_t *in, AuMach length)
    {
        // nothing to convert; also keeps a null pointer away from the backends
        if (!in || !length)
        {
            return {};
        }

        try
        {
        #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
            const int wideCount = static_cast<int>(length);

            // first pass: ask how many bytes the UTF-8 representation needs
            const int required = WideCharToMultiByte(CP_UTF8, 0, in, wideCount, NULL, 0, NULL, NULL);
            if (!required)
            {
                return {};
            }

            AuString ret;
            ret.resize(required);

            // second pass: perform the conversion; zero signals failure here as well
            const int written = WideCharToMultiByte(CP_UTF8, 0, in, wideCount, ret.data(), required, NULL, NULL);
            if (!written)
            {
                return {};
            }

            ret.resize(written);
            return ret;
        #elif !defined(AU_NO_CPPLOCALE)
            // honour the caller supplied length instead of re-measuring with wcslen:
            // the input is not required to be null-terminated and may be a sub-range
            return gUtf8Conv.to_bytes(in, in + length);
        #else
            SysPushErrorUnimplemented("ConvertFromWChar");
            return {};
        #endif
        }
        catch (...)
        {
            SysPushErrorMem("ConvertFromWChar failed");
            Debug::CheckErrors();
        }
        return {};
    }

    // Converts a UTF-8 encoded AuString into a wide string.
    //
    // in: UTF-8 encoded bytes
    //
    // Returns the converted wide string. An empty string is returned when the conversion fails,
    // when no conversion backend is compiled in, or when an exception is caught (an error is
    // pushed through SysPushErrorMem in the latter case).
    AUKN_SYM std::wstring ConvertFromUTF8(const AuString &in)
    {
        try
        {
        #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
            const int byteCount = static_cast<int>(in.length());

            // first pass: ask how many wide characters the input expands to
            const int required = MultiByteToWideChar(CP_UTF8, 0, in.c_str(), byteCount, NULL, 0);
            if (!required)
            {
                return {};
            }

            std::wstring ret;
            ret.resize(required);

            // second pass: perform the conversion; zero signals failure here as well,
            // in which case the zero-filled buffer must not be handed back as a result
            const int written = MultiByteToWideChar(CP_UTF8, 0, in.c_str(), byteCount, ret.data(), required);
            if (!written)
            {
                return {};
            }

            ret.resize(written);
            return ret;
        #elif !defined(AU_NO_CPPLOCALE)
            return gUtf8Conv.from_bytes(in);
        #else
            SysPushErrorUnimplemented("ConvertFromUTF8");
            return {};
        #endif
        }
        catch (...)
        {
            SysPushErrorMem("ConvertFromUTF8 failed");
            Debug::CheckErrors();
        }
        return {};
    }

    // Returns the code page enum currently cached in gInternalCodePage.
    ECodePage GetInternalCodePage()
    {
        const ECodePage current = gInternalCodePage;
        return current;
    }

    // Returns a reference to the cached codeset name (gCodeset); the reference aliases the global itself.
    AuString const &GetInternalCodePageString()
    {
        AuString const &codesetName = gCodeset;
        return codesetName;
    }

#if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)

    // Maps the active Windows ANSI code page (GetACP) onto gCodeset / gInternalCodePage.
    // Code pages without a dedicated entry are named "MS-<number>" and tagged eSysUnk.
    static void SetCodesetCommonGuessWin32()
    {
        int activeCodePage = GetACP();

        // every known code page updates the same pair of globals
        const auto adopt = [](const char *codesetName, ECodePage codePage)
        {
            gCodeset          = codesetName;
            gInternalCodePage = codePage;
        };

        if (activeCodePage == CP_CHINESE)
        {
            adopt("GB18030", ECodePage::e18030);
        }
        else if (activeCodePage == CP_UTF8)
        {
            adopt("UTF-8", ECodePage::eUTF8);
        }
        else if (activeCodePage == CP_UTF_16)
        {
            adopt("UTF-16", ECodePage::eUTF16);
        }
        else if (activeCodePage == CP_UTF_16 + 1)
        {
            // same codeset name as above, only the enum records the big endian variant
            adopt("UTF-16", ECodePage::eUTF16BE);
        }
        else if (activeCodePage == CP_LATIN_1)
        {
            adopt("Latin-1", ECodePage::eLatin1);
        }
        else if (activeCodePage == CP_2312_LIMITED_GBK)
        {
            adopt("GBK", ECodePage::eGBK);
        }
        else if (activeCodePage == 437)
        {
            // named, but there is no dedicated enum value for it
            adopt("IBM437", ECodePage::eSysUnk);
        }
        else if (activeCodePage == CP_SHIFTJIS)
        {
            adopt("SJIS", ECodePage::eSJIS);
        }
        else
        {
            gCodeset          = "MS-" + AuToString(activeCodePage);
            gInternalCodePage = ECodePage::eSysUnk;
        }
    }

    static void SetLanguageWin32()
    {
        int ret;
        wchar_t name[LOCALE_NAME_MAX_LENGTH] = { 0 };

        ret = LCIDToLocaleName(LOCALE_USER_DEFAULT, name, LOCALE_NAME_MAX_LENGTH, LOCALE_ALLOW_NEUTRAL_NAMES);
        SysAssert(ret, "Couldn't acquire win32 locale information");

        wchar_t language[LOCALE_NAME_MAX_LENGTH] = { 0 };
        ret = GetLocaleInfoEx(name, LOCALE_SISO639LANGNAME, language, LOCALE_NAME_MAX_LENGTH);
        SysAssert(ret, "Couldn't acquire win32 provided ISO 639 map of {}", ConvertFromWChar(name));

        wchar_t country[LOCALE_NAME_MAX_LENGTH] = { 0 };
        ret = GetLocaleInfoEx(name, LOCALE_SISO3166CTRYNAME, country, LOCALE_NAME_MAX_LENGTH);
        SysAssert(ret, "Couldn't acquire win32 provided ISO 3166 map of {}", ConvertFromWChar(name));

        gCountryCode  = ConvertFromWChar(country);
        gLanguageCode = ConvertFromWChar(language);

        SetCodesetCommonGuessWin32();
    }

#elif defined(AURORA_IS_POSIX_DERIVED)

    // Splits a locale string on the separators '.', '_' and '@'.
    //
    // Every segment is stored under the separator that precedes it; the leading segment, which has
    // no separator in front of it, is stored under '!'. Segments are added with insert(), so when a
    // separator occurs more than once the first segment recorded for it is the one that is kept.
    static AuHashMap<unsigned char, AuString> ParseLocaleString(const AuString &locale)
    {
        AuHashMap<unsigned char, AuString> segments;

        unsigned char segmentTag = '!';
        AuMach segmentBegin = 0;
        AuMach cursor = 0;

        while (cursor < locale.size())
        {
            const unsigned char current = locale[cursor];

            if ((current == '.') || (current == '_') || (current == '@'))
            {
                // close the segment that ends right before this separator
                segments.insert(AuMakePair(segmentTag, locale.substr(segmentBegin, cursor - segmentBegin)));
                segmentBegin = cursor + 1;
                segmentTag = current;
            }

            cursor++;
        }

        // whatever follows the last separator (or the whole string when there was none)
        segments.insert(AuMakePair(segmentTag, locale.substr(segmentBegin, locale.size() - segmentBegin)));

        return segments;
    }

    static void SetLanguageUnix()
    {
    #if 0
        // this doesn't seem to work with libc++ lol?
        auto locale = -std::--locale("").name();
    #else
        setlocale(LC_ALL, "");
        AuString locale = setlocale(LC_ALL, NULL);
    #endif

        if (locale == "C")
        {
            AuLogWarn("Improperly configured UNIX environment.");
            AuLogWarn("This localization detection code was written in 2020, please follow the `language[_territory][.codeset][@modifier]` convention for user/sys locales.");
            AuLogWarn("'C' is not a language, country, or anything with which we can discern anything meaningful from. Fix your scuffed unix operating system and try again later...");
            SysPanic("You fools");
        }

        auto parseTable = ParseLocaleString(locale);

        AuString *lc;
        if ((AuTryFind(parseTable, '!', lc)) && (lc->size()))
        {
            gLanguageCode = *lc;
        }
        else
        {
            AuLogWarn("Improperly configured UNIX environment.");
            AuLogWarn("Couldn't discern language from localization string: {}", locale);
            SysPanic("You fools");
        }

        AuString *cc;
        if ((AuTryFind(parseTable, '_', cc)) && (cc->size()))
        {
            gCountryCode = *cc;
        }
        else
        {
            gCountryCode = "GB";
        }

        AuString *cs;
        if ((AuTryFind(parseTable, '.', cs)) && (cs->size()))
        {
            gCodeset = *cs;
        }
        else
        {
            gCodeset = "UTF-8"; //also technically not true, but most UNIX/Linux applications expect UTF8 byte stirngs or UTF-32 wchar_t strings. this assumption shouldn't break anything
        }
    }
#define AURORA_HAS_UNIXLOCALE
#endif

#if defined(AURORA_PLATFORM_WIN32) || defined(AURORA_PLATFORM_LINUX) || defined(AURORA_PLATFORM_BSD)
    static void SetLanguageEnvBlock()
    {
        const char *language;
        if (language = getenv("AURORA_ENV_LANGUAGE"))
        {
            gLanguageCode = language;
        }

        const char *countryCode;
        if (countryCode = getenv("AURORA_ENV_COUNTRY"))
        {
            gCountryCode = countryCode;
        }

        // You may not overload codeset on win32 targets
        const char *codeSet;
        if (codeSet = getenv("AURORA_ENV_CODESET"))
        {
            gCodeset = codeSet;
        }
    }

    #define AURORA_HAS_ENVBLOCK
#endif

    static void GuessSystemECodePage()
    {
        if (gInternalCodePage != ECodePage::eEnumInvalid)
        {
            return;
        }

        if (gCodeset == "UTF-8")
        {
            gInternalCodePage = ECodePage::eUTF8;
        }
        else if (gCodeset == "UTF-16")
        {
            // TODO: is big endian
            gInternalCodePage = ECodePage::eUTF16;
        }
        else if (gCodeset == "UTF-32")
        {
            // TODO: is big endian
            gInternalCodePage = ECodePage::eUTF32;
        }
        else if (gCodeset == "SJIS")
        {
            gInternalCodePage = ECodePage::eSJIS;
        }
        // a history of chinese locales
        else if (gCodeset == "GB18030") // is the new legally defined standard
        {
            gInternalCodePage = ECodePage::e18030;
        }
        else if (gCodeset == "GBK") // GB18030 is derived from GBK, GBK is drived from GB2312
        {
            gInternalCodePage = ECodePage::eGBK;
        }
        else if (gCodeset == "GB2312") // GBK is drived from GB2312, GB2312 is derived from telegraph shid
        {
            gInternalCodePage = ECodePage::e2312;
        }
        else
        {
            gInternalCodePage = ECodePage::eSysUnk;
        }
    }

    // Initializes the locale subsystem:
    //   1. platform detection of language, country and codeset (Win32 or POSIX, when available)
    //   2. optional overrides from the AURORA_ENV_* environment variables
    //   3. derivation of the code page enum from the codeset name, if still unset
    //   4. case normalization: language lower-cased, country upper-cased
    //   5. Encoding::InitIConv()
    void Init()
    {
    #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
        SetLanguageWin32();
    #elif defined(AURORA_HAS_UNIXLOCALE)
        SetLanguageUnix();
    #endif

    #if defined(AURORA_HAS_ENVBLOCK)
        SetLanguageEnvBlock();
    #endif

        GuessSystemECodePage();

        gLanguageCode = AuToLower(gLanguageCode);
        gCountryCode  = AuToUpper(gCountryCode);
        // gCodeset is kept exactly as detected/overridden; the former `gCodeset = gCodeset;`
        // self-assignment had no effect and has been dropped

        Encoding::InitIConv();
    }

    // Becomes true once the locality has been overridden (RuntimeOverloadLocality) or read (GetLocale);
    // RuntimeOverloadLocality asserts that it was still false.
    static bool gLockLocale = false;

    // Replaces the detected language / country pair.
    //
    // locality: first = language code (stored lower-cased), second = country code (stored upper-cased)
    //
    // The locality is locked by this call; it asserts when the lock was already taken, be it by an
    // earlier override or by GetLocale().
    AUKN_SYM void RuntimeOverloadLocality(const AuPair<AuString, AuString> &locality)
    {
        // take the lock and remember whether somebody else held it already
        const bool alreadyLocked = AuExchange(gLockLocale, true);
        SysAssert(!alreadyLocked, "Locality has been locked");

        const auto &[language, country] = locality;
        gLanguageCode = AuToLower(language);
        gCountryCode  = AuToUpper(country);
    }

    // Returns a LocalizationInfo built from the cached language, country, codeset name and code page.
    // Reading the locale also sets gLockLocale, so a RuntimeOverloadLocality() call made afterwards
    // trips its assertion.
    AUKN_SYM LocalizationInfo GetLocale()
    {
        // lock first: once a caller has observed the locale it may no longer be overridden
        gLockLocale = true;
        return LocalizationInfo(gLanguageCode, gCountryCode, gCodeset, gInternalCodePage);
    }
}