/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: Encoding.hpp Date: 2021-8-20 Author: Reece ***/ #pragma once namespace Aurora::Locale::Encoding { struct BOM { ECodePage page; AuUInt8 length; }; // Attempt to guess the code page signature of the string view provided by the binary view AUKN_SYM BOM DecodeBOM(const Memory::MemoryViewRead &binary); // General purpose arbitrary page to UTF8 (AuStrings are UTF-8 - not 16 or 32; bite me) AUKN_SYM AuStreamReadWrittenPair_t EncodeUTF8(const Memory::MemoryViewRead &utf8, const Memory::MemoryViewWrite &binary, ECodePage page = ECodePage::eEnumInvalid); AUKN_SYM AuStreamReadWrittenPair_t DecodeUTF8(const Memory::MemoryViewRead &binary, const Memory::MemoryViewWrite &utf8, ECodePage page = ECodePage::eEnumInvalid); AUKN_SYM AuStreamReadWrittenPair_t DecodeUTF8(const Memory::MemoryViewRead &binary, AuString &out, ECodePage page = ECodePage::eEnumInvalid); // Optimized UTF translation functions // Note: these functions support full 6-byte UTF8 encoding. 5&6 are reserved. AUKN_SYM AuStreamReadWrittenPair_t ReadUTF32IntoUTF8ByteString(const Memory::MemoryViewRead &utf32, const Memory::MemoryViewWrite &utf8); AUKN_SYM AuStreamReadWrittenPair_t ReadUTF8IntoUTF32ByteString(const Memory::MemoryViewRead &utf8, const Memory::MemoryViewWrite &utf32); // Endianswap functions, could be subject to SIMD optimizations AUKN_SYM void SwapUTF32(const Memory::MemoryViewWrite &utf32); AUKN_SYM void SwapUTF16(const Memory::MemoryViewWrite &utf16); // Counst the amount of codepoints in a buffer, breaking when the stream is incomplete, giving you the accurate amount of bytes or relevant codepoints in a stream view AUKN_SYM AuUInt32 CountUTF32Length (const Memory::MemoryViewRead &utf32, bool bytes = false); // codepoint = U32 encoded; always 4 bytes per codepoint AUKN_SYM AuUInt32 CountUTF16Length (const Memory::MemoryViewRead &utf16, bool bytes = false); // codepoint = U32 encoded; at most: 4 bytes per codepoint, usual: 2 bytes AUKN_SYM AuUInt32 CountUTF16BELength(const Memory::MemoryViewRead &utf16, bool bytes = false); // codepoint = U32 encoded; at most: 4 bytes per codepoint, usual: 2 bytes AUKN_SYM AuUInt32 CountUTF8Length (const Memory::MemoryViewRead &utf8, bool bytes = false); // codepoint = U32 encoded; at most: 6 bytes per codepoint AUKN_SYM AuUInt32 CountSJISLength (const Memory::MemoryViewRead &sjis, bool bytes = false); // codepoint = one character AUKN_SYM AuUInt32 CountGBK16Length (const Memory::MemoryViewRead &gbk, bool bytes = false); // codepoint = at most; one GBK byte pair AUKN_SYM AuUInt32 CountEncodedStringLength(ECodePage page, const Memory::MemoryViewRead &view, bool bytes = false); AUKN_SYM AuUInt32 IterateUTF32 (const Memory::MemoryViewRead &utf32); AUKN_SYM AuUInt32 IterateUTF16 (const Memory::MemoryViewRead &utf16); AUKN_SYM AuUInt32 IterateUTF16BE(const Memory::MemoryViewRead &utf16); AUKN_SYM AuUInt32 IterateUTF8 (const Memory::MemoryViewRead &utf8); AUKN_SYM AuUInt32 IterateSJIS (const Memory::MemoryViewRead &sjis); AUKN_SYM AuUInt32 IterateGBK16 (const Memory::MemoryViewRead &gbk); AUKN_SYM AuUInt32 IterateEncodedString(ECodePage page, const Memory::MemoryViewRead &view); }