/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: EncoderNSL.cpp Date: 2021-8-19 Author: Reece ***/ #include #include "../Locale.hpp" #include "Encoding.hpp" #include "EncoderNSL.hpp" namespace Aurora::Locale::Encoding { static AuStreamReadWrittenPair_t Win32ConvertCpAToCPB(AuUInt32 cpA, AuUInt32 cpB, const void *in, AuUInt32 inLength, void *cpBlob, AuUInt32 cpLen) { #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) if (!in) { return {}; } // Get the UTF-16 character count of the cpA string in/inLength auto chars = MultiByteToWideChar(cpA, 0, (LPCCH)in, inLength, NULL, 0); if (!chars) { return {}; } // Allocate a temp utf-16/widechar string buffer auto ret = _new wchar_t[chars]; if (!ret) { return {}; } // Convert the cpA buffer to UTF-16 MultiByteToWideChar(cpA, 0, (LPCCH)in, inLength, ret, chars); AuUInt32 utf16Recalc = chars; if (!utf16Recalc) { return {}; } // convert the shortened string with invalid surrogates back into a cpA length AuUInt32 read = inLength; // always calc for now // handle **end of stream** and error trailing bytes #if 0 if (slowPath) #endif { // Assumes A1 -> B -> A2 (A1 == A2), would GB2312 survive the round trip? // I know edge cases don't always map 1:1 with unicode // However, with all that struggle to maintain a codepage, would you have duplicate characters? // googling `GB2312 duplicate characters` and how translating works specifically is taking too long // this shoule be fine, i cope. worst case scenario, we end up with a stream jump x bytes bug // TODO: REVIEW ME read = WideCharToMultiByte(cpA, 0, ret, utf16Recalc, NULL, 0, NULL, NULL); // skipping is one thing, but this would be really bad read = std::min(AuUInt32(read), AuUInt32(inLength)); } AuUInt32 cpLength; if (!cpBlob) { cpLength = WideCharToMultiByte(cpB, 0, ret, utf16Recalc, NULL, NULL, NULL, NULL); } else { cpLength = WideCharToMultiByte(cpB, 0, ret, utf16Recalc, (LPSTR)cpBlob, cpLen, NULL, NULL); } delete[] ret; return {read, cpLength}; #else return {}; #endif } AuStreamReadWrittenPair_t Win32ConvertFromCPToUTF8(AuUInt32 cp, const void *in, AuUInt length, void *utf8, AuUInt32 utf8Len) { return Win32ConvertCpAToCPB(cp, CP_UTF8, in, length, utf8, utf8Len); } AuStreamReadWrittenPair_t Win32ConvertFromUTF8ToCp(AuUInt32 cp, const void *utf8, AuUInt utf8Length, void *cpBlob, AuUInt32 cpLen) { return Win32ConvertCpAToCPB(CP_UTF8, cp, utf8, utf8Length, cpBlob, cpLen); } AuStreamReadWrittenPair_t Win32ConvertFromUTF16ToUTF8(const void *in, AuUInt32 inLength, void *utf8, AuUInt32 utf8Len) { #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) if (!in) { return {}; } auto pWideChars = (const wchar_t *)in; AuUInt32 iCChars = inLength / sizeof(wchar_t); AuUInt32 lastIdx = 0; AuUInt32 lastEnd = 0; AuUInt32 curIdx = 0; for (curIdx = 0; curIdx < iCChars; curIdx++) { if (IS_LOW_SURROGATE(pWideChars[curIdx])) { if (curIdx + 2 > iCChars) { break; } if (!IS_HIGH_SURROGATE(pWideChars[curIdx + 1])) { break; } lastEnd = curIdx + 2; } else { lastEnd = curIdx + 1; } lastIdx = curIdx; } AuUInt32 cpLength; if (!utf8) { cpLength = WideCharToMultiByte(CP_UTF8, 0, (const wchar_t *)in, lastEnd, NULL, NULL, NULL, NULL); return {lastEnd, cpLength}; } else { cpLength = WideCharToMultiByte(CP_UTF8, 0, (const wchar_t *)in, lastEnd, (LPSTR)utf8, utf8Len, NULL, NULL); #if 0 auto actLen = lastEnd; #else auto actLen = MultiByteToWideChar(CP_UTF8, 0, (LPSTR)utf8, utf8Len, NULL, 0); // this might be worth it in the long run #endif return {actLen, cpLength}; } #else return {}; #endif } AuStreamReadWrittenPair_t Win32ConvertFromUTF8ToUTF16(const void *in, AuUInt32 inLength, void *utf16, AuUInt32 utf16Len) { #if defined(AU_HAS_MSFT_NATIONALLANGSUPPORT) if (!in) { return {}; } // Get the UTF-16 character count of the cpA string in/inLength auto chars = MultiByteToWideChar(CP_UTF8, 0, (LPCCH)in, inLength, NULL, 0); if (!chars) { return {}; } if (chars > utf16Len) { if (!utf16) { return {0, utf16Len}; } else { return {}; } } // Convert the CP_UTF8 buffer to UTF-16 MultiByteToWideChar(CP_UTF8, 0, (LPCCH)in, inLength, (LPWSTR)utf16, chars); // convert the shortened string with invalid surrogates back into a CP_UTF8 length AuUInt32 read = inLength; #if 0 if (slowPath) #endif { read = WideCharToMultiByte(CP_UTF8, 0, (LPWSTR)utf16, chars, NULL, 0, NULL, NULL); } return {read, chars}; #else return {}; #endif } // TODO(reece): Consider implementing bigendian when I can be bothered AuStreamReadWrittenPair_t Win32CPToUTF8(ECodePage page, void *in, AuUInt length, void *utf8, AuUInt32 utf8Max) { AuStreamReadWrittenPair_t ret {}; switch (page) { default: case ECodePage::eUnsupported: return {}; case ECodePage::e18030: ret = Win32ConvertFromCPToUTF8(CP_CHINESE, in, length, utf8, utf8Max); break; case ECodePage::eSysUnk: ret = Win32ConvertFromCPToUTF8(CP_ACP, in, length, utf8, utf8Max); break; case ECodePage::eLatin1: ret = Win32ConvertFromCPToUTF8(CP_LATIN_1, in, length, utf8, utf8Max); break; case ECodePage::eUTF7: ret = Win32ConvertFromCPToUTF8(CP_UTF7, in, length, utf8, utf8Max); break; case ECodePage::e2312: ret = Win32ConvertFromCPToUTF8(CP_2312_LIMITED_GBK, in, length, utf8, utf8Max); break; case ECodePage::eGBK: ret = Win32ConvertFromCPToUTF8(CP_2312_LIMITED_GBK, in, length, utf8, utf8Max); break; case ECodePage::eSJIS: ret = Win32ConvertFromCPToUTF8(CP_SHIFTJIS, in, length, utf8, utf8Max); break; case ECodePage::eUTF16: ret = Win32ConvertFromUTF16ToUTF8(in, length, utf8, utf8Max); break; } return ret; } AuStreamReadWrittenPair_t Win32CPToUTF8(ECodePage page, const void *in, AuUInt length, void *utf8, AuUInt32 utf8Max) { AuStreamReadWrittenPair_t ret {}; switch (page) { default: case ECodePage::eUnsupported: return {}; case ECodePage::e18030: ret = Win32ConvertFromUTF8ToCp(CP_CHINESE, in, length, utf8, utf8Max); break; case ECodePage::eSysUnk: ret = Win32ConvertFromCPToUTF8(CP_ACP, in, length, utf8, utf8Max); break; case ECodePage::eLatin1: ret = Win32ConvertFromCPToUTF8(CP_LATIN_1, in, length, utf8, utf8Max); break; case ECodePage::eUTF7: ret = Win32ConvertFromCPToUTF8(CP_UTF7, in, length, utf8, utf8Max); break; case ECodePage::e2312: ret = Win32ConvertFromCPToUTF8(CP_2312_LIMITED_GBK, in, length, utf8, utf8Max); break; case ECodePage::eGBK: ret = Win32ConvertFromCPToUTF8(CP_2312_LIMITED_GBK, in, length, utf8, utf8Max); break; case ECodePage::eSJIS: ret = Win32ConvertFromCPToUTF8(CP_SHIFTJIS, in, length, utf8, utf8Max); break; case ECodePage::eUTF16: ret = Win32ConvertFromUTF16ToUTF8(in, length, utf8, utf8Max); break; } return ret; } AuStreamReadWrittenPair_t Win32UTF8ToCp(ECodePage page, const void *utf8, AuUInt32 utf8Length, void *cp, AuUInt32 cpLen) { AuStreamReadWrittenPair_t ret {}; switch (page) { default: case ECodePage::eUnsupported: return {}; case ECodePage::eSysUnk: ret = Win32ConvertFromUTF8ToCp(CP_ACP, utf8, utf8Length, cp, cpLen); break; case ECodePage::eUTF7: ret = Win32ConvertFromUTF8ToCp(CP_UTF7, utf8, utf8Length, cp, cpLen); break; case ECodePage::eLatin1: ret = Win32ConvertFromUTF8ToCp(CP_LATIN_1, utf8, utf8Length, cp, cpLen); break; case ECodePage::e18030: ret = Win32ConvertFromUTF8ToCp(CP_CHINESE, utf8, utf8Length, cp, cpLen); break; case ECodePage::eSJIS: ret = Win32ConvertFromUTF8ToCp(CP_SHIFTJIS, utf8, utf8Length, cp, cpLen); break; case ECodePage::e2312: case ECodePage::eGBK: ret = Win32ConvertFromUTF8ToCp(CP_2312_LIMITED_GBK, utf8, utf8Length, cp, cpLen); break; case ECodePage::eUTF16: ret = Win32ConvertFromUTF8ToUTF16(utf8, utf8Length, cp, cpLen); break; } return ret; } }