/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: Locale.cpp Date: 2021-6-11 Author: Reece ***/ #include #include "Locale.hpp" #if !defined(AU_NO_CPPLOCALE) #include #include #endif #include #include namespace Aurora::Locale { static AuString gCountryCode; static AuString gLanguageCode; static AuString gCodeset; static ECodePage gInternalCodePage = ECodePage::eUnsupported; // Note: [0] out of touch boomers deprecated std::wstring_convert before going for a nappy. we do not have a replacement yet // [1] the native win32 implementation appears to be more optimized than MSVC/stl #if !defined(AU_NO_CPPLOCALE) static std::wstring_convert> gUtf8Conv; #endif AUKN_SYM AuString ConvertFromWChar(const wchar_t *in) { return ConvertFromWChar(in, wcslen(in)); } AUKN_SYM AuString ConvertFromWChar(const wchar_t *in, AuMach length) { #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) AuString ret; auto chars = WideCharToMultiByte(CP_UTF8, 0, in, length, NULL, 0, NULL, NULL); if (!chars) { return {}; } ret.resize(chars); WideCharToMultiByte(CP_UTF8, 0, in, length, ret.data(), ret.size(), NULL, NULL); return ret; #elif !defined(AU_NO_CPPLOCALE) return gUtf8Conv.to_bytes(std::wstring(in, wcslen(in))); #else return false; #endif } AUKN_SYM std::wstring ConvertFromUTF8(const AuString &in) { #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) std::wstring ret; auto chars = MultiByteToWideChar(CP_UTF8, 0, in.c_str(), in.length(), NULL, 0); if (!chars) { return {}; } ret.resize(chars); MultiByteToWideChar(CP_UTF8, 0, in.c_str(), in.length(), ret.data(), ret.size()); return ret; #elif !defined(AU_NO_CPPLOCALE) return gUtf8Conv.from_bytes(in); #else return false; #endif } ECodePage GetInternalCodePage() { return gInternalCodePage; } AuString const &GetInternalCodePageString() { return gCodeset; } #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) static void SetCodesetCommonGuessWin32() { int acp = GetACP(); if (acp == CP_CHINESE) { gCodeset = "GB18030"; gInternalCodePage = 
ECodePage::e18030; } else if (acp == CP_UTF8) { gCodeset = "UTF-8"; gInternalCodePage = ECodePage::eUTF8; } else if (acp == CP_UTF_16) { gCodeset = "UTF-16"; gInternalCodePage = ECodePage::eUTF16; } else if (acp == CP_UTF_16 + 1) { gCodeset = "UTF-16"; gInternalCodePage = ECodePage::eUTF16BE; } else if (acp == CP_LATIN_1) { gCodeset = "Latin-1"; gInternalCodePage = ECodePage::eLatin1; } else if (acp == CP_2312_LIMITED_GBK) { gCodeset = "GBK"; gInternalCodePage = ECodePage::eGBK; } else if (acp == 437) { gCodeset = "IBM437"; gInternalCodePage = ECodePage::eSysUnk; } else if (acp == CP_SHIFTJIS) { gCodeset = "SJIS"; gInternalCodePage = ECodePage::eSJIS; } else { gCodeset = "MS-" + std::to_string(acp); gInternalCodePage = ECodePage::eSysUnk; } } static void SetLanguageWin32() { int ret; wchar_t name[LOCALE_NAME_MAX_LENGTH] = { 0 }; ret = LCIDToLocaleName(LOCALE_USER_DEFAULT, name, LOCALE_NAME_MAX_LENGTH, LOCALE_ALLOW_NEUTRAL_NAMES); SysAssert(ret, "Couldn't acquire win32 locale information"); wchar_t language[LOCALE_NAME_MAX_LENGTH] = { 0 }; ret = GetLocaleInfoEx(name, LOCALE_SISO639LANGNAME, language, LOCALE_NAME_MAX_LENGTH); SysAssert(ret, "Couldn't acquire win32 provided ISO 639 map of {}", ConvertFromWChar(name)); wchar_t country[LOCALE_NAME_MAX_LENGTH] = { 0 }; ret = GetLocaleInfoEx(name, LOCALE_SISO3166CTRYNAME, country, LOCALE_NAME_MAX_LENGTH); SysAssert(ret, "Couldn't acquire win32 provided ISO 3166 map of {}", ConvertFromWChar(name)); gCountryCode = ConvertFromWChar(country); gLanguageCode = ConvertFromWChar(language); SetCodesetCommonGuessWin32(); } #elif defined(AURORA_IS_POSIX_DERIVED) static AuHashMap ParseLocaleString(const AuString &locale) { static auto isCharacterSplitter = [&](unsigned char ch) -> bool { static AuList characterSplitters = { '.', '_', '@' }; for (auto const splitter : characterSplitters) { if (splitter == ch) { return true; } } return false; }; AuHashMap parseTable; AuMach startingIndex = 0; unsigned char startingCharacter = '!'; for 
(AuMach i = 0; i < locale.size(); i++) { unsigned char curCh = locale[i]; if (!(isCharacterSplitter(curCh))) { continue; } parseTable.insert(AuMakePair(startingCharacter, locale.substr(startingIndex, i - startingIndex))); startingIndex = i + 1; startingCharacter = curCh; } parseTable.insert(AuMakePair(startingCharacter, locale.substr(startingIndex, locale.size() - startingIndex))); return parseTable; } static void SetLanguageUnix() { #if 0 // this doesn't seem to work with libc++ lol? auto locale = std::locale("").name(); #else setlocale(LC_ALL, ""); AuString locale = setlocale(LC_ALL, NULL); #endif if (locale == "C") { LogWarn("Improperly configured UNIX environment."); LogWarn("This localization detection code was written in 2020, please follow the `language[_territory][.codeset][@modifier]` convention for user/sys locales."); LogWarn("'C' is not a language, country, or anything with which we can discern anything meaningful from. Fix your scuffed unix operating system and try again later..."); SysPanic("You fools"); } auto parseTable = ParseLocaleString(locale); AuString *lc; if ((TryFind(parseTable, '!', lc)) && (lc->size())) { gLanguageCode = *lc; } else { LogWarn("Improperly configured UNIX environment."); LogWarn("Couldn't discern language from localization string: {}", locale); SysPanic("You fools"); } AuString *cc; if ((TryFind(parseTable, '_', cc)) && (cc->size())) { gCountryCode = *cc; } else { gCountryCode = "GB"; } AuString *cs; if ((TryFind(parseTable, '.', cs)) && (cs->size())) { gCodeset = *cs; } else { gCodeset = "UTF-8"; //also technically not true, but most UNIX/Linux applications expect UTF8 byte stirngs or UTF-32 wchar_t strings. 
this assumption shouldn't break anything } } #define AURORA_HAS_UNIXLOCALE #endif #if defined(AURORA_PLATFORM_WIN32) || defined(AURORA_PLATFORM_LINUX) || defined(AURORA_PLATFORM_BSD) static void SetLanguageEnvBlock() { const char *language; if (language = getenv("AURORA_ENV_LANGUAGE")) { gLanguageCode = language; } const char *countryCode; if (countryCode = getenv("AURORA_ENV_COUNTRY")) { gCountryCode = countryCode; } // You may not overload codeset on win32 targets const char *codeSet; if (codeSet = getenv("AURORA_ENV_CODESET")) { gCodeset = codeSet; } } #define AURORA_HAS_ENVBLOCK #endif static void GuessSystemECodePage() { if (gInternalCodePage != ECodePage::eUnsupported) { return; } if (gCodeset == "UTF-8") { gInternalCodePage = ECodePage::eUTF8; } else if (gCodeset == "UTF-16") { // TODO: is big endian gInternalCodePage = ECodePage::eUTF16; } else if (gCodeset == "UTF-32") { // TODO: is big endian gInternalCodePage = ECodePage::eUTF32; } else if (gCodeset == "SJIS") { gInternalCodePage = ECodePage::eSJIS; } // a history of chinese locales else if (gCodeset == "GB18030") // is the new legally defined standard { gInternalCodePage = ECodePage::e18030; } else if (gCodeset == "GBK") // GB18030 is derived from GBK, GBK is drived from GB2312 { gInternalCodePage = ECodePage::eGBK; } else if (gCodeset == "GB2312") // GBK is drived from GB2312, GB2312 is derived from telegraph shid { gInternalCodePage = ECodePage::e2312; } else { gInternalCodePage = ECodePage::eSysUnk; } } void Init() { #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) SetLanguageWin32(); #elif defined(AURORA_HAS_UNIXLOCALE) SetLanguageUnix(); #endif #if defined(AURORA_HAS_ENVBLOCK) SetLanguageEnvBlock(); #endif GuessSystemECodePage(); gLanguageCode = AuToLower(gLanguageCode); gCountryCode = AuToUpper(gCountryCode); gCodeset = AuToUpper(gCodeset); LogDbg("Initialized localization information (language: {}, country: {}, codeset: {})", gLanguageCode, gCountryCode, gCodeset); } static bool gLockLocale = false; 
// Replaces the detected language/country pair with `locality` (first = language, stored
// lower-case; second = country, stored upper-case).
// May be used at most once and only before GetLocale() has been called: the call locks the
// locale, and asserts if it was already locked.
// NOTE(review): the template arguments of AuPair were missing from the reviewed copy;
// <AuString, AuString> is inferred from how .first/.second are used - confirm against Locale.hpp.
AUKN_SYM void RuntimeOverloadLocality(const AuPair<AuString, AuString> &locality)
{
    // Take the lock outside of the assertion so that locking never depends on whether the
    // assertion macro evaluates its argument.
    const bool alreadyLocked = std::exchange(gLockLocale, true);
    SysAssert(!alreadyLocked, "Locality has been locked");

    gLanguageCode = AuToLower(locality.first);
    gCountryCode = AuToUpper(locality.second);
}

// Returns the current localization info and locks the locale against later overrides.
AUKN_SYM LocalizationInfo GetLocale()
{
    gLockLocale = true;
    return LocalizationInfo(gLanguageCode, gCountryCode, gCodeset, gInternalCodePage);
}

}