AuroraRuntime/Source/Locale/Locale.cpp

394 lines
11 KiB
C++
Raw Normal View History

2021-06-27 21:25:29 +00:00
/***
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: Locale.cpp
Date: 2021-6-11
Author: Reece
***/
#define I_REALLY_NEED_WIDECHAR_PUBAPI
2021-09-30 14:57:41 +00:00
#include <Source/RuntimeInternal.hpp>
2021-06-27 21:25:29 +00:00
#include "Locale.hpp"
2021-09-06 10:58:08 +00:00
#if !defined(AU_NO_CPPLOCALE)
#include <locale>
#include <codecvt>
#endif
2021-06-27 21:25:29 +00:00
#include <wchar.h>
2021-09-06 10:58:08 +00:00
#include <tuple>
2021-06-27 21:25:29 +00:00
namespace Aurora::Locale
{
2021-09-06 10:58:08 +00:00
// Country/territory code of the active locale; upper-cased by Init and RuntimeOverloadLocality
static AuString gCountryCode;
// Language code of the active locale; lower-cased by Init and RuntimeOverloadLocality
static AuString gLanguageCode;
// Name of the character set in use, e.g. "UTF-8"; exposed through GetInternalCodePageString
static AuString gCodeset;
[*/+/-] MEGA COMMIT. ~2 weeks compressed. The intention is to quickly improve and add util apis, enhance functionality given current demands, go back to the build pipeline, finish that, publish runtime tests, and then use what we have to go back to to linux support with a more stable api. [+] AuMakeSharedArray [+] Technet ArgvQuote [+] Grug subsystem (UNIX signal thread async safe ipc + telemetry flusher + log flusher.) [+] auEndianness -> Endian swap utils [+] AuGet<N>(...) [*] AUE_DEFINE conversion for ECompresionType, EAnsiColor, EHashType, EStreamError, EHexDump [+] ConsoleMessage ByteBuffer serialization [+] CmdLine subsystem for parsing command line arguments and simple switch/flag checks [*] Split logger from console subsystem [+] StartupParameters -> A part of a clean up effort under Process [*] Refactor SysErrors header + get caller hack [+] Atomic APIs [+] popcnt [+] Ring Buffer sink [+] Added more standard errors Catch, Submission, LockError, NoAccess, ResourceMissing, ResourceLocked, MalformedData, InSandboxContext, ParseError [+] Added ErrorCategorySet, ErrorCategoryClear, GetStackTrace [+] IExitSubscriber, ETriggerLevel [*] Write bias the high performance RWLockImpl read-lock operation operation [+] ExitHandlerAdd/ExitHandlerRemove (exit subsystem) [*] Updated API style Digests [+] CpuId::CpuBitCount [+] GetUserProgramsFolder [+] GetPackagePath [*] Split IStreamReader with an inl file [*] BlobWriter/BlobReader/BlobArbitraryReader can now take shared pointers to bytebuffers. 
default constructor allocates a new scalable bytebuffer [+] ICharacterProvider [+] ICharacterProviderEx [+] IBufferedCharacterConsumer [+] ProviderFromSharedString [+] ProviderFromString [+] BufferConsumerFromProvider [*] Parse Subsystem uses character io bufferer [*] Rewritten NT's high perf semaphore to use userland SRW/ConVars [like mutex, based on generic semaphore] [+] ByteBuffer::ResetReadPointer [*] Bug fix bytebuffer base not reset on free and some scaling issues [+] ProcessMap -> Added kSectionNameStack, kSectionNameFile, kSectionNameHeap for Section [*] ProcessMap -> Refactor Segment to Section. I was stupid for keeping a type conflict hack API facing [+] Added 64 *byte* fast RNG seeds [+] File Advisorys/File Lock Awareness [+] Added extended IAuroraThread from OS identifier caches for debug purposes [*] Tweaked how memory is reported on Windows. Better consistency of what values mean across functions. [*] Broke AuroraUtils/Typedefs out into a separate library [*] Update build script [+] Put some more effort into adding detail to the readme before rewriting it, plus, added some media [*] Improved public API documentation [*] Bug fix `SetConsoleCtrlHandler` [+] Locale TimeDateToFileNameISO8601 [+] Console config stdOutShortTime [*] Begin using internal UTF8/16 decoders when platform support isnt available (instead of stl) [*] Bug fixes in decoders [*] Major bug fix, AuMax [+] RateLimiter [+] Binary file sink [+] Log directory sink [*] Data header usability (more operators) [+] AuRemoveRange [+] AuRemove [+] AuTryRemove [+] AuTryRemoveRange [+] auCastUtils [+] Finish NewLSWin32Source [+] AuTryFindByTupleN, AuTryRemoveByTupleN [+] Separated AuRead/Write types, now in auTypeUtils [+] Added GetPosition/SetPosition to FileWriter [*] Fix stupid AuMin in place of AuMax in SpawnThread.Unix.Cpp [*] Refactored Arbitrary readers to SeekingReaders (as in, they could be atomic and/or parallelized, and accept an arbitrary position as a work parameter -> not Seekable, as 
in, you can simply set the position) [*] Hack back in the sched deinit [+] File AIO loop source interop [+] Begin to prototype a LoopQueue object I had in mind for NT, untested btw [+] Stub code for networking [+] Compression BaseStream/IngestableStreamBase [*] Major: read/write locks now support write-entrant read routines. [*] Compression subsystem now uses the MemoryView concept [*] Rewrite the base stream compressions, made them less broken [*] Update hashing api [*] WriterTryGoForward and ReaderTryGoForward now revert to the previous relative index instead of panicing [+] Added new AuByteBuffer apis Trim, Pad, WriteFrom, WriteString, [TODO: ReadString] [+] Added ByteBufferPushReadState [+] Added ByteBufferPushWriteState [*] Move from USC-16 to full UTF-16. Win32 can handle full UTF-16. [*] ELogLevel is now an Aurora enum [+] Raised arbitrary limit in header to 255, the max filter buffer [+] Explicit GZip support [+] Explicit Zip support [+] Added [some] compressors et al
2022-02-17 00:11:40 +00:00
// Code page that corresponds to gCodeset; stays eEnumInvalid until one of the detection routines assigns it
static ECodePage gInternalCodePage = ECodePage::eEnumInvalid;
2021-09-06 10:58:08 +00:00
// Note: [0] std::wstring_convert was deprecated in C++17 without a standard replacement; we do not have our own replacement yet
//       [1] the native win32 implementation appears to be more optimized than MSVC/stl
#if !defined(AU_NO_CPPLOCALE) && !(defined(AURORA_COMPILER_MSVC) && defined(AU_LANG_CPP_20))
2021-06-27 21:25:29 +00:00
// Shared UTF-8 <-> wchar_t converter used by the conversion routines when the Win32 NLS path is unavailable
static std::wstring_convert<std::codecvt_utf8<wchar_t>> gUtf8Conv;
2021-09-06 10:58:08 +00:00
#endif
2021-06-27 21:25:29 +00:00
/**
 * Converts a null-terminated wide character string into a UTF-8 encoded string.
 *
 * @param in null-terminated wide string; a null pointer yields an empty string
 * @return the UTF-8 representation, or an empty string on failure
 */
AUKN_SYM AuString ConvertFromWChar(const wchar_t *in)
{
    // wcslen on a null pointer is undefined behaviour; the catch block below cannot recover from it
    if (!in)
    {
        return {};
    }

    try
    {
        return ConvertFromWChar(in, wcslen(in));
    }
    catch (...)
    {
        SysPushErrorMem("ConvertFromWChar failed");
        return {};
    }
}
/**
 * Converts a wide character buffer of a given length into a UTF-8 encoded string.
 *
 * @param in     pointer to the wide characters; a null pointer yields an empty string
 * @param length number of wide characters available at `in`
 * @return the UTF-8 representation, or an empty string when the input is empty or the conversion fails
 */
AUKN_SYM AuString ConvertFromWChar(const wchar_t *in, AuMach length)
{
    // Nothing to convert; also keeps wcsnlen / the Win32 API away from a null pointer
    if (!in || !length)
    {
        return {};
    }

    try
    {
    #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
        const auto wideLength = static_cast<int>(length);

        // First pass: ask how many UTF-8 bytes are required
        const auto required = WideCharToMultiByte(CP_UTF8, 0, in, wideLength, NULL, 0, NULL, NULL);
        if (required <= 0)
        {
            return {};
        }

        AuString ret;
        ret.resize(required);

        // Second pass: perform the conversion. A failure here must not hand back a buffer of zeroed bytes
        const auto written = WideCharToMultiByte(CP_UTF8, 0, in, wideLength, ret.data(), static_cast<int>(ret.size()), NULL, NULL);
        if (written <= 0)
        {
            return {};
        }

        ret.resize(written);
        return ret;
    #elif !defined(AU_NO_CPPLOCALE)
        // wcsnlen stops at the first terminator should the buffer be shorter than `length`
        return gUtf8Conv.to_bytes(std::wstring(in, wcsnlen(in, length)));
    #else
        SysPushErrorUnimplemented("ConvertFromWChar");
        return {};
    #endif
    }
    catch (...)
    {
        SysPushErrorMem("ConvertFromWChar failed");
        Debug::CheckErrors();
    }

    return {};
}
/**
 * Converts a UTF-8 encoded string into a wide character string.
 *
 * @param in UTF-8 encoded input
 * @return the wide representation, or an empty string when the input is empty or the conversion fails
 */
AUKN_SYM std::wstring ConvertFromUTF8(const AuString &in)
{
    if (in.empty())
    {
        return {};
    }

    try
    {
    #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
        const auto byteLength = static_cast<int>(in.length());

        // First pass: ask how many wide characters are required
        const auto required = MultiByteToWideChar(CP_UTF8, 0, in.c_str(), byteLength, NULL, 0);
        if (required <= 0)
        {
            return {};
        }

        std::wstring ret;
        ret.resize(required);

        // Second pass: perform the conversion. A failure here must not hand back a buffer of zeroed characters
        const auto written = MultiByteToWideChar(CP_UTF8, 0, in.c_str(), byteLength, ret.data(), static_cast<int>(ret.size()));
        if (written <= 0)
        {
            return {};
        }

        ret.resize(written);
        return ret;
    #elif !defined(AU_NO_CPPLOCALE)
        return gUtf8Conv.from_bytes(in);
    #else
        SysPushErrorUnimplemented("ConvertFromUTF8");
        return {};
    #endif
    }
    catch (...)
    {
        SysPushErrorMem("ConvertFromUTF8 failed");
        Debug::CheckErrors();
    }

    return {};
}
2021-09-06 10:58:08 +00:00
// Returns the code page currently stored in gInternalCodePage (eEnumInvalid until detection has assigned one)
ECodePage GetInternalCodePage()
{
return gInternalCodePage;
}
2021-06-27 21:25:29 +00:00
2021-09-06 10:58:08 +00:00
// Returns a reference to the global codeset name (gCodeset); the reference tracks any later change to that global
AuString const &GetInternalCodePageString()
{
return gCodeset;
}
#if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
static void SetCodesetCommonGuessWin32()
{
int acp = GetACP();
2021-06-27 21:25:29 +00:00
2021-09-06 10:58:08 +00:00
if (acp == CP_CHINESE)
{
gCodeset = "GB18030";
gInternalCodePage = ECodePage::e18030;
}
else if (acp == CP_UTF8)
{
gCodeset = "UTF-8";
gInternalCodePage = ECodePage::eUTF8;
}
else if (acp == CP_UTF_16)
{
gCodeset = "UTF-16";
gInternalCodePage = ECodePage::eUTF16;
}
else if (acp == CP_UTF_16 + 1)
{
gCodeset = "UTF-16";
gInternalCodePage = ECodePage::eUTF16BE;
}
else if (acp == CP_LATIN_1)
{
gCodeset = "Latin-1";
gInternalCodePage = ECodePage::eLatin1;
}
else if (acp == CP_2312_LIMITED_GBK)
{
gCodeset = "GBK";
gInternalCodePage = ECodePage::eGBK;
}
else if (acp == 437)
{
gCodeset = "IBM437";
gInternalCodePage = ECodePage::eSysUnk;
}
else if (acp == CP_SHIFTJIS)
{
gCodeset = "SJIS";
gInternalCodePage = ECodePage::eSJIS;
}
else
{
2022-01-19 17:08:13 +00:00
gCodeset = "MS-" + AuToString(acp);
2021-09-06 10:58:08 +00:00
gInternalCodePage = ECodePage::eSysUnk;
}
}
2021-06-27 21:25:29 +00:00
static void SetLanguageWin32()
{
int ret;
wchar_t name[LOCALE_NAME_MAX_LENGTH] = { 0 };
ret = LCIDToLocaleName(LOCALE_USER_DEFAULT, name, LOCALE_NAME_MAX_LENGTH, LOCALE_ALLOW_NEUTRAL_NAMES);
SysAssert(ret, "Couldn't acquire win32 locale information");
wchar_t language[LOCALE_NAME_MAX_LENGTH] = { 0 };
ret = GetLocaleInfoEx(name, LOCALE_SISO639LANGNAME, language, LOCALE_NAME_MAX_LENGTH);
SysAssert(ret, "Couldn't acquire win32 provided ISO 639 map of {}", ConvertFromWChar(name));
wchar_t country[LOCALE_NAME_MAX_LENGTH] = { 0 };
ret = GetLocaleInfoEx(name, LOCALE_SISO3166CTRYNAME, country, LOCALE_NAME_MAX_LENGTH);
SysAssert(ret, "Couldn't acquire win32 provided ISO 3166 map of {}", ConvertFromWChar(name));
gCountryCode = ConvertFromWChar(country);
gLanguageCode = ConvertFromWChar(language);
2021-09-06 10:58:08 +00:00
SetCodesetCommonGuessWin32();
2021-06-27 21:25:29 +00:00
}
2021-09-06 10:58:08 +00:00
#elif defined(AURORA_IS_POSIX_DERIVED)
2021-06-27 21:25:29 +00:00
/**
 * Splits a locale string of the form `language[_territory][.codeset][@modifier]` into its parts.
 *
 * Each part is keyed by the delimiter that precedes it ('_', '.' or '@'); the leading part, which
 * has no delimiter in front of it, is keyed by '!'.
 *
 * @param locale locale string to split
 * @return table mapping the preceding delimiter to the text that follows it
 */
static AuHashMap<unsigned char, AuString> ParseLocaleString(const AuString &locale)
{
    const auto isDelimiter = [](unsigned char ch) -> bool
    {
        switch (ch)
        {
        case '.':
        case '_':
        case '@':
            return true;
        default:
            return false;
        }
    };

    AuHashMap<unsigned char, AuString> components;
    AuMach tokenBegin = 0;
    unsigned char tokenKey = '!';

    for (AuMach cursor = 0; cursor < locale.size(); cursor++)
    {
        unsigned char ch = locale[cursor];
        if (isDelimiter(ch))
        {
            // Close the token that ends right before this delimiter and open the next one
            components.insert(AuMakePair(tokenKey, locale.substr(tokenBegin, cursor - tokenBegin)));
            tokenBegin = cursor + 1;
            tokenKey = ch;
        }
    }

    // Whatever follows the last delimiter (or the whole string when there was none)
    components.insert(AuMakePair(tokenKey, locale.substr(tokenBegin, locale.size() - tokenBegin)));
    return components;
}
static void SetLanguageUnix()
{
#if 0
// this doesn't seem to work with libc++ lol?
2022-01-19 17:08:13 +00:00
auto locale = -std::--locale("").name();
2021-06-27 21:25:29 +00:00
#else
setlocale(LC_ALL, "");
AuString locale = setlocale(LC_ALL, NULL);
#endif
if (locale == "C")
{
AuLogWarn("Improperly configured UNIX environment.");
AuLogWarn("This localization detection code was written in 2020, please follow the `language[_territory][.codeset][@modifier]` convention for user/sys locales.");
AuLogWarn("'C' is not a language, country, or anything with which we can discern anything meaningful from. Fix your scuffed unix operating system and try again later...");
2021-06-27 21:25:29 +00:00
SysPanic("You fools");
}
auto parseTable = ParseLocaleString(locale);
AuString *lc;
if ((AuTryFind(parseTable, '!', lc)) && (lc->size()))
2021-06-27 21:25:29 +00:00
{
gLanguageCode = *lc;
}
else
{
AuLogWarn("Improperly configured UNIX environment.");
AuLogWarn("Couldn't discern language from localization string: {}", locale);
2021-06-27 21:25:29 +00:00
SysPanic("You fools");
}
AuString *cc;
if ((AuTryFind(parseTable, '_', cc)) && (cc->size()))
2021-06-27 21:25:29 +00:00
{
gCountryCode = *cc;
}
2021-09-06 10:58:08 +00:00
else
{
gCountryCode = "GB";
}
2021-06-27 21:25:29 +00:00
AuString *cs;
if ((AuTryFind(parseTable, '.', cs)) && (cs->size()))
2021-06-27 21:25:29 +00:00
{
gCodeset = *cs;
}
else
{
gCodeset = "UTF-8"; //also technically not true, but most UNIX/Linux applications expect UTF8 byte stirngs or UTF-32 wchar_t strings. this assumption shouldn't break anything
}
}
#define AURORA_HAS_UNIXLOCALE
#endif
2021-09-06 10:58:08 +00:00
#if defined(AURORA_PLATFORM_WIN32) || defined(AURORA_PLATFORM_LINUX) || defined(AURORA_PLATFORM_BSD)
2021-06-27 21:25:29 +00:00
static void SetLanguageEnvBlock()
{
const char *language;
if (language = getenv("AURORA_ENV_LANGUAGE"))
{
gLanguageCode = language;
}
const char *countryCode;
if (countryCode = getenv("AURORA_ENV_COUNTRY"))
{
gCountryCode = countryCode;
}
2021-09-06 10:58:08 +00:00
// You may not overload codeset on win32 targets
2021-06-27 21:25:29 +00:00
const char *codeSet;
if (codeSet = getenv("AURORA_ENV_CODESET"))
{
gCodeset = codeSet;
}
}
#define AURORA_HAS_ENVBLOCK
#endif
2021-09-06 10:58:08 +00:00
static void GuessSystemECodePage()
{
[*/+/-] MEGA COMMIT. ~2 weeks compressed. The intention is to quickly improve and add util apis, enhance functionality given current demands, go back to the build pipeline, finish that, publish runtime tests, and then use what we have to go back to to linux support with a more stable api. [+] AuMakeSharedArray [+] Technet ArgvQuote [+] Grug subsystem (UNIX signal thread async safe ipc + telemetry flusher + log flusher.) [+] auEndianness -> Endian swap utils [+] AuGet<N>(...) [*] AUE_DEFINE conversion for ECompresionType, EAnsiColor, EHashType, EStreamError, EHexDump [+] ConsoleMessage ByteBuffer serialization [+] CmdLine subsystem for parsing command line arguments and simple switch/flag checks [*] Split logger from console subsystem [+] StartupParameters -> A part of a clean up effort under Process [*] Refactor SysErrors header + get caller hack [+] Atomic APIs [+] popcnt [+] Ring Buffer sink [+] Added more standard errors Catch, Submission, LockError, NoAccess, ResourceMissing, ResourceLocked, MalformedData, InSandboxContext, ParseError [+] Added ErrorCategorySet, ErrorCategoryClear, GetStackTrace [+] IExitSubscriber, ETriggerLevel [*] Write bias the high performance RWLockImpl read-lock operation operation [+] ExitHandlerAdd/ExitHandlerRemove (exit subsystem) [*] Updated API style Digests [+] CpuId::CpuBitCount [+] GetUserProgramsFolder [+] GetPackagePath [*] Split IStreamReader with an inl file [*] BlobWriter/BlobReader/BlobArbitraryReader can now take shared pointers to bytebuffers. 
default constructor allocates a new scalable bytebuffer [+] ICharacterProvider [+] ICharacterProviderEx [+] IBufferedCharacterConsumer [+] ProviderFromSharedString [+] ProviderFromString [+] BufferConsumerFromProvider [*] Parse Subsystem uses character io bufferer [*] Rewritten NT's high perf semaphore to use userland SRW/ConVars [like mutex, based on generic semaphore] [+] ByteBuffer::ResetReadPointer [*] Bug fix bytebuffer base not reset on free and some scaling issues [+] ProcessMap -> Added kSectionNameStack, kSectionNameFile, kSectionNameHeap for Section [*] ProcessMap -> Refactor Segment to Section. I was stupid for keeping a type conflict hack API facing [+] Added 64 *byte* fast RNG seeds [+] File Advisorys/File Lock Awareness [+] Added extended IAuroraThread from OS identifier caches for debug purposes [*] Tweaked how memory is reported on Windows. Better consistency of what values mean across functions. [*] Broke AuroraUtils/Typedefs out into a separate library [*] Update build script [+] Put some more effort into adding detail to the readme before rewriting it, plus, added some media [*] Improved public API documentation [*] Bug fix `SetConsoleCtrlHandler` [+] Locale TimeDateToFileNameISO8601 [+] Console config stdOutShortTime [*] Begin using internal UTF8/16 decoders when platform support isnt available (instead of stl) [*] Bug fixes in decoders [*] Major bug fix, AuMax [+] RateLimiter [+] Binary file sink [+] Log directory sink [*] Data header usability (more operators) [+] AuRemoveRange [+] AuRemove [+] AuTryRemove [+] AuTryRemoveRange [+] auCastUtils [+] Finish NewLSWin32Source [+] AuTryFindByTupleN, AuTryRemoveByTupleN [+] Separated AuRead/Write types, now in auTypeUtils [+] Added GetPosition/SetPosition to FileWriter [*] Fix stupid AuMin in place of AuMax in SpawnThread.Unix.Cpp [*] Refactored Arbitrary readers to SeekingReaders (as in, they could be atomic and/or parallelized, and accept an arbitrary position as a work parameter -> not Seekable, as 
in, you can simply set the position) [*] Hack back in the sched deinit [+] File AIO loop source interop [+] Begin to prototype a LoopQueue object I had in mind for NT, untested btw [+] Stub code for networking [+] Compression BaseStream/IngestableStreamBase [*] Major: read/write locks now support write-entrant read routines. [*] Compression subsystem now uses the MemoryView concept [*] Rewrite the base stream compressions, made them less broken [*] Update hashing api [*] WriterTryGoForward and ReaderTryGoForward now revert to the previous relative index instead of panicing [+] Added new AuByteBuffer apis Trim, Pad, WriteFrom, WriteString, [TODO: ReadString] [+] Added ByteBufferPushReadState [+] Added ByteBufferPushWriteState [*] Move from USC-16 to full UTF-16. Win32 can handle full UTF-16. [*] ELogLevel is now an Aurora enum [+] Raised arbitrary limit in header to 255, the max filter buffer [+] Explicit GZip support [+] Explicit Zip support [+] Added [some] compressors et al
2022-02-17 00:11:40 +00:00
if (gInternalCodePage != ECodePage::eEnumInvalid)
2021-09-06 10:58:08 +00:00
{
return;
}
if (gCodeset == "UTF-8")
{
gInternalCodePage = ECodePage::eUTF8;
}
else if (gCodeset == "UTF-16")
{
// TODO: is big endian
gInternalCodePage = ECodePage::eUTF16;
}
else if (gCodeset == "UTF-32")
{
// TODO: is big endian
gInternalCodePage = ECodePage::eUTF32;
}
else if (gCodeset == "SJIS")
{
gInternalCodePage = ECodePage::eSJIS;
}
// a history of chinese locales
else if (gCodeset == "GB18030") // is the new legally defined standard
{
gInternalCodePage = ECodePage::e18030;
}
else if (gCodeset == "GBK") // GB18030 is derived from GBK, GBK is drived from GB2312
{
gInternalCodePage = ECodePage::eGBK;
}
else if (gCodeset == "GB2312") // GBK is drived from GB2312, GB2312 is derived from telegraph shid
{
gInternalCodePage = ECodePage::e2312;
}
else
{
gInternalCodePage = ECodePage::eSysUnk;
}
}
2021-06-27 21:25:29 +00:00
void Init()
{
2021-09-06 10:58:08 +00:00
#if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT)
2021-06-27 21:25:29 +00:00
SetLanguageWin32();
#elif defined(AURORA_HAS_UNIXLOCALE)
SetLanguageUnix();
#endif
#if defined(AURORA_HAS_ENVBLOCK)
SetLanguageEnvBlock();
#endif
2021-09-06 10:58:08 +00:00
GuessSystemECodePage();
gLanguageCode = AuToLower(gLanguageCode);
gCountryCode = AuToUpper(gCountryCode);
gCodeset = gCodeset;
Encoding::InitIConv();
2021-06-27 21:25:29 +00:00
}
2021-09-06 10:58:08 +00:00
// Becomes true once the locality has been overridden or read; a later override attempt then asserts
static bool gLockLocale = false;

/**
 * Replaces the detected language and country codes.
 *
 * Asserts when the locality is already locked, i.e. after a previous override or after GetLocale.
 *
 * @param locality pair of (language code, country code); stored lower-cased / upper-cased respectively
 */
AUKN_SYM void RuntimeOverloadLocality(const AuPair<AuString, AuString> &locality)
{
    SysAssert(!AuExchange(gLockLocale, true), "Locality has been locked");

    const auto &[language, country] = locality;
    gLanguageCode = AuToLower(language);
    gCountryCode = AuToUpper(country);
}
2021-06-27 21:25:29 +00:00
/**
 * Returns the current localization info (language, country, codeset name and code page).
 * Calling this locks the locality: RuntimeOverloadLocality asserts from then on.
 */
AUKN_SYM LocalizationInfo GetLocale()
{
    // Once handed out, the locale must not change underneath its consumers
    gLockLocale = true;

    return LocalizationInfo(gLanguageCode, gCountryCode, gCodeset, gInternalCodePage);
}
}