/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: Encoding.hpp Date: 2021-8-18 Author: Reece ***/ #pragma once namespace Aurora::Locale::Encoding { AuStreamReadWrittenPair_t DecodeUTF8(void *binary, AuUInt32 binaryLength, AuString &out, ECodePage page = ECodePage::eUnsupported); template struct TextStreamDecoderImpl { bool readHeader {}; ECodePage page = ECodePage::eUnsupported; ECodePage defaultPage = ECodePage::eUnsupported; EncoderAdapter state; TextStreamDecoderImpl(ECodePage page = ECodePage::eSysUnk) : defaultPage(page) {} using TypeIn_t = std::conditional_t; using TypeCast_t = std::conditional_t; static int GetLenSJISCodePoint(const AuUInt8 *in, AuUInt32 len) { if (len == 0) return 0; auto b = in[0]; if (b >= 0x80) { if (b <= 0xDF) { if (len < 2) return 0; else return 2; } else if (b <= 0xEF) { if (len < 3) return 0; else return 3; } else { if (len < 4) return 0; else return 4; } } return 1; } static int GetLenSJISString(const AuUInt8 *in, AuUInt32 len) { AuUInt32 i; for (i = 0; i < len; ) { auto next = GetLenSJISCodePoint(in + i, len - i); if (next == 0) return i; i += next; } return i; } static int GetLenGBKCodePoint(const AuUInt8 *in, AuUInt32 len) { if (len == 0) return 0; auto b = in[0]; if (b >= 0x80) { if (len < 2) return 0; else return 2; } return 1; } static int GetLenGBKString(const AuUInt8 *in, AuUInt32 len) { AuUInt32 i; for (i = 0; i < len; ) { auto next = GetLenGBKCodePoint(in + i, len - i); if (next == 0) return i; i += next; } return i; } #define AU_HIGH_SURROGATE_START 0xd800 #define AU_HIGH_SURROGATE_END 0xdbff #define AU_LOW_SURROGATE_START 0xdc00 #define AU_LOW_SURROGATE_END 0xdfff #define AU_IS_HIGH_SURROGATE(wch) (((wch) >= AU_HIGH_SURROGATE_START) && ((wch) <= AU_HIGH_SURROGATE_END)) #define AU_IS_LOW_SURROGATE(wch) (((wch) >= AU_LOW_SURROGATE_START) && ((wch) <= AU_LOW_SURROGATE_END)) static int GetLenUC2CodePoint(const AuUInt8 *in, AuUInt32 len) { if (len < 2) return 0; auto a = *reinterpret_cast(in); if (!AU_IS_HIGH_SURROGATE(a)) return 2; if (len < 4) return 0; auto b = *reinterpret_cast(in + 2); return AU_IS_LOW_SURROGATE(b) ? 4 : 0; } static int GetLenUC2String(const AuUInt8 *in, AuUInt32 len) { AuUInt32 i; for (i = 0; i < len; ) { auto next = GetLenUC2CodePoint(in + i, len - i); if (next == 0) return i; i += next; } return i; } AuStreamReadWrittenPair_t EncodeUTF8(TypeIn_t binary, AuUInt32 binaryLength, void *utf8, AuUInt32 utfLen) { int offset = 0; if (!binary) return {}; if (!binaryLength) return {}; if (!std::exchange(readHeader, true)) { if (page == ECodePage::eUnsupported) { auto header = DecodeBOM(binary, binaryLength); if (header) { page = header->first; offset = header->second; } else { if ((defaultPage != ECodePage::eUnsupported)) { page = defaultPage; } else { page = GetInternalCodePage(); } } state.Init(page, true); } } if (page == ECodePage::eUnsupported) { return {}; } binaryLength = binaryLength - offset; if (page == ECodePage::eGBK) { binaryLength = GetLenGBKString(reinterpret_cast(binary) + offset, binaryLength); } else if (page == ECodePage::eSJIS) { binaryLength = GetLenSJISString(reinterpret_cast(binary) + offset, binaryLength); } else if ((page == ECodePage::eUTF16) || (page == ECodePage::eUTF16BE)) { binaryLength = GetLenUC2String(reinterpret_cast(binary) + offset, binaryLength); } else if ((page == ECodePage::eUTF32) || (page == ECodePage::eUTF32BE)) { binaryLength &= ~3; } auto real = state.CPToUTF8(reinterpret_cast(binary) + offset, binaryLength, utf8, utfLen); return AuMakePair(real.first + offset, real.second); } AuStreamReadWrittenPair_t EncodeUTF8(TypeIn_t binary, AuUInt32 binaryLength, AuString &out) { auto preemptive = EncodeUTF8(binary, binaryLength, nullptr, 0); if (!AuTryResize(out, preemptive.second)) return {}; auto main = EncodeUTF8(binary, preemptive.second, out.data(), out.size()); if (main.second == 0) return {}; if (!AuTryResize(out, main.second)) return {}; out.shrink_to_fit(); return main; } }; struct TextStreamEncoder { ECodePage page; EncoderAdapter state; TextStreamEncoder(ECodePage page = ECodePage::eUTF32) : page(page) { state.Init(page, false); } AuStreamReadWrittenPair_t DecodeUTF8(const void *utf8In, AuUInt32 length, AuString &out) { if (page == ECodePage::eUnsupported) { return {}; } if (!utf8In) { return {}; } if (!length) { return {}; } auto preemptive = state.UTF8ToCp(utf8In, length, nullptr, 0); auto written = state.UTF8ToCp(utf8In, preemptive.second, out.data(), AuUInt32(out.size())); out.resize(written.second); out.shrink_to_fit(); return written; } AuStreamReadWrittenPair_t DecodeUTF8(const void *utf8In, AuUInt32 utf8Length, void *binaryOut, AuUInt32 binaryLength) { if (page == ECodePage::eUnsupported) { return {}; } return state.UTF8ToCp(utf8In, utf8Length, binaryOut, binaryLength); } }; /// 'TextStreamProcessor', a stateful wrapper around DecodeUTF8 /// Using this you can handle a stateful, optionally bom prefixed, stream /// Initialization (ie: setting a default codepage) is optional using TextStreamProcessor = typename TextStreamDecoderImpl; using TextStreamProcessorInternal = typename TextStreamDecoderImpl; }