From fe19a47a2ad03a8ae34a044753645dec8dc0b625 Mon Sep 17 00:00:00 2001
From: Reece Wilson
Date: Sat, 13 Aug 2022 06:01:41 +0100
Subject: [PATCH] [+] Missing external class if iterative/codepoint length locale apis [+] (internal) UTF8::IterateUTF8

---
 Include/Aurora/Locale/Encoding/Encoding.hpp | 10 +++-
 Source/Locale/Encoding/Encoding.cpp         | 55 ++++++++++++++++++++++
 Source/Locale/Encoding/UTFn/AuUTF8.hpp      | 52 +++++++++++++++++++++
 3 files changed, 116 insertions(+), 1 deletion(-)

diff --git a/Include/Aurora/Locale/Encoding/Encoding.hpp b/Include/Aurora/Locale/Encoding/Encoding.hpp
index 71d43d97..af38d2b8 100644
--- a/Include/Aurora/Locale/Encoding/Encoding.hpp
+++ b/Include/Aurora/Locale/Encoding/Encoding.hpp
@@ -40,6 +40,14 @@ namespace Aurora::Locale::Encoding
 
     AUKN_SYM AuUInt32 CountSJISLength (const Memory::MemoryViewRead &sjis, bool bytes = false); // codepoint = one character
     AUKN_SYM AuUInt32 CountGBK16Length (const Memory::MemoryViewRead &gbk, bool bytes = false); // codepoint = at most; one GBK byte pair
-    
     AUKN_SYM AuUInt32 CountEncodedStringLength(ECodePage page, const Memory::MemoryViewRead &view, bool bytes = false);
+
+    AUKN_SYM AuUInt32 IterateUTF32  (const Memory::MemoryViewRead &utf32);
+    AUKN_SYM AuUInt32 IterateUTF16  (const Memory::MemoryViewRead &utf16);
+    AUKN_SYM AuUInt32 IterateUTF16BE(const Memory::MemoryViewRead &utf16);
+    AUKN_SYM AuUInt32 IterateUTF8   (const Memory::MemoryViewRead &utf8);
+    AUKN_SYM AuUInt32 IterateSJIS   (const Memory::MemoryViewRead &sjis);
+    AUKN_SYM AuUInt32 IterateGBK16  (const Memory::MemoryViewRead &gbk);
+
+    AUKN_SYM AuUInt32 IterateEncodedString(ECodePage page, const Memory::MemoryViewRead &view);
 }
\ No newline at end of file
diff --git a/Source/Locale/Encoding/Encoding.cpp b/Source/Locale/Encoding/Encoding.cpp
index 6f23255e..a0f5f435 100644
--- a/Source/Locale/Encoding/Encoding.cpp
+++ b/Source/Locale/Encoding/Encoding.cpp
@@ -132,4 +132,59 @@ namespace Aurora::Locale::Encoding
             return {};
         }
     }
+
+    // Returns 4 (the fixed size of a UTF-32 code unit), or 0 when the view holds fewer than 4 bytes.
+    AUKN_SYM AuUInt32 IterateUTF32(const Memory::MemoryViewRead &utf32)
+    {
+        return utf32.length < 4 ? 0 : 4;
+    }
+
+    AUKN_SYM AuUInt32 IterateUTF16(const Memory::MemoryViewRead &utf16)
+    {
+        return UTF16::GetLenUC2CodePointLE(utf16.ToPointer(), utf16.length);
+    }
+
+    AUKN_SYM AuUInt32 IterateUTF16BE(const Memory::MemoryViewRead &utf16)
+    {
+        return UTF16::GetLenUC2CodePointBE(utf16.ToPointer(), utf16.length);
+    }
+
+    AUKN_SYM AuUInt32 IterateUTF8(const Memory::MemoryViewRead &utf8)
+    {
+        return UTF8::IterateUTF8(utf8);
+    }
+
+    AUKN_SYM AuUInt32 IterateSJIS(const Memory::MemoryViewRead &sjis)
+    {
+        return SJIS::GetLenSJISCodePoint(sjis.ToPointer(), sjis.length);
+    }
+
+    AUKN_SYM AuUInt32 IterateGBK16(const Memory::MemoryViewRead &gbk)
+    {
+        return GBK::GetLenGBKCodePoint(gbk.ToPointer(), gbk.length);
+    }
+
+    // Forwards to the Iterate* routine that matches `page`; any other code page yields 0.
+    AUKN_SYM AuUInt32 IterateEncodedString(ECodePage page, const Memory::MemoryViewRead &view)
+    {
+        switch (page)
+        {
+        case ECodePage::eGBK:
+            return IterateGBK16(view);
+        case ECodePage::eUTF8:
+            return IterateUTF8(view);
+        case ECodePage::eSJIS:
+            return IterateSJIS(view);
+        case ECodePage::eUTF32:
+        case ECodePage::eUTF32BE:
+            // IterateUTF32 only checks the byte count, so one routine serves both byte orders.
+            return IterateUTF32(view);
+        case ECodePage::eUTF16:
+            return IterateUTF16(view);
+        case ECodePage::eUTF16BE:
+            return IterateUTF16BE(view);
+        default:
+            return {};
+        }
+    }
 }
\ No newline at end of file
diff --git a/Source/Locale/Encoding/UTFn/AuUTF8.hpp b/Source/Locale/Encoding/UTFn/AuUTF8.hpp
index c97bc636..bf19a491 100644
--- a/Source/Locale/Encoding/UTFn/AuUTF8.hpp
+++ b/Source/Locale/Encoding/UTFn/AuUTF8.hpp
@@ -170,4 +170,56 @@ namespace Aurora::Locale::Encoding::UTF8
         else
             return 0; // I've seen 7 char support in some libs, i thought we should only go up to 6? i haven't seen a coeffient of 0x80000000 x 200h [-1] used in any of them
     }
+
+    // Returns the byte length (1-6) announced by the lead byte at the start of `utf8`.
+    // Returns 0 when the view is empty, when the first byte is not a lead byte
+    // (a 10xxxxxx continuation byte, 0xFE or 0xFF), or when the view is shorter than
+    // the announced length. Only the first byte is inspected; trailing bytes are not validated.
+    static AuUInt32 IterateUTF8(const Memory::MemoryViewRead &utf8)
+    {
+        if (utf8.length == 0)
+        {
+            return 0; // nothing to classify; never dereference an empty view
+        }
+
+        const char *pItr = utf8.Begin();
+        const auto ch = static_cast<unsigned char>(*pItr);
+        AuUInt32 nby = 0;
+
+        if ((ch & 0x80) == 0x00)      // 0xxxxxxx
+        {
+            nby = 1;
+        }
+        else if ((ch & 0xE0) == 0xC0) // 110xxxxx
+        {
+            nby = 2;
+        }
+        else if ((ch & 0xF0) == 0xE0) // 1110xxxx
+        {
+            nby = 3;
+        }
+        else if ((ch & 0xF8) == 0xF0) // 11110xxx
+        {
+            nby = 4;
+        }
+        else if ((ch & 0xFC) == 0xF8) // 111110xx, legacy 5-byte form
+        {
+            nby = 5;
+        }
+        else if ((ch & 0xFE) == 0xFC) // 1111110x, legacy 6-byte form
+        {
+            nby = 6;
+        }
+        else
+        {
+            return 0;
+        }
+
+        if (nby > utf8.length)
+        {
+            return 0;
+        }
+
+        return nby;
+    }
 }
\ No newline at end of file