/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: AuUTF16.hpp Date: 2021-10-31 Author: Reece ***/ #pragma once #include "AuUTF8.hpp" namespace Aurora::Locale::Encoding::UTF16 { static void SwapU16(void *base, AuUInt32 count) { count *= 2; for (int i = 0; i < count; i += 2) { AuWriteU16BE(base, i, AuReadU16LE(base, i)); } } static const AuUInt16 kHighSurrogateStart = 0xd800; static const AuUInt16 kHighSurrogateEnd = 0xdbff; static constexpr bool IsHighSurrogate(const AuUInt16 i) { return (i & AuUInt16(0xfffffc00)) == kHighSurrogateStart; } static const AuUInt16 kLowSurrogateStart = 0xdc00; static const AuUInt16 kLowSurrogateEnd = 0xdfff; static constexpr bool IsLowSurrogate(const AuUInt16 i) { return (i & AuUInt16(0xfffffc00)) == kLowSurrogateStart; } static int GetLenUC2CodePoint(const AuUInt8 *in, AuUInt32 len) { // Check for at least one U16 word if (len < 2) { return 0; } // Neeto, we found a codepoint in range if (!IsHighSurrogate(AuReadU16LE(in, 0))) { return 2; } // Check we have enough stream overhead to consume 4 bytes if (len < 4) { return 0; } return IsLowSurrogate(AuReadU16LE(in, 2)) ? 4 : 0; } static int GetLenUC2CodePoint(AuUInt32 &codepoint, const AuUInt8 *in, AuUInt32 len) { // Check for at least one U16 word if (len < 2) { return 0; } // Neeto, we found a codepoint in range auto high = AuReadU16LE(in, 0); if (!IsHighSurrogate(high)) { codepoint = high; return 2; } // Check we have enough stream overhead to consume 4 bytes if (len < 4) { return 0; } auto low = AuReadU16LE(in, 2); if (!IsLowSurrogate(low)) { return 0; } codepoint = ((AuUInt32(high & 0x3FFU) << 10) | AuUInt32(low & 0x3FFU)) + 0x10000; return 4; } static AuStreamReadWrittenPair_t CPToUTF8(const AuMemoryViewRead &utf16, const AuMemoryViewWrite &utf8) { AuUInt32 cp {}; AuUInt nextOffset {}; AuUInt totalOffset {}; AuUInt32 writeOffset {}; const AuUInt8 *pItr = utf16.Begin(); //reinterpret_cast(in); auto pcUtf8Itr = utf8.Begin(); auto length = utf16.length; if (utf8) { while (nextOffset = GetLenUC2CodePoint(cp, pItr, length)) { length -= nextOffset; pItr += nextOffset; totalOffset += nextOffset; if (!UTF8::WriteCp(cp, pcUtf8Itr, writeOffset, utf8.length)) { break; } } } else { while (nextOffset = GetLenUC2CodePoint(cp, pItr, length)) { length -= nextOffset; pItr += nextOffset; totalOffset += nextOffset; writeOffset += UTF8::CountU8Overhead(cp); } } return {totalOffset, writeOffset}; } static AuStreamReadWrittenPair_t UTF8ToCp(const AuMemoryViewRead &utf8, const AuMemoryViewWrite &utf16) { auto pair = ReadUTF8IntoUTF32ByteString(utf8, {}); auto bytes = pair.first; auto codepoints = pair.second / sizeof(AuUInt32); if (!(pair.first & pair.second)) { return {}; } auto utf32Array = AuMakeSharedArray(codepoints); if (!utf32Array) { return {}; } auto pair2 = ReadUTF8IntoUTF32ByteString(utf8, {utf32Array.get(), codepoints}); if (!(pair2.first & pair2.second)) { return {}; } if (!utf16) { AuUInt cpOffset {}; auto ptr = utf32Array.get(); for (AuUInt i = 0; i < codepoints; i++) { if (ptr[i] <= 0xFFFFU) { cpOffset += 2; } else { cpOffset += 4; } } return {pair2.first, cpOffset}; } auto out = utf16.Begin(); auto end = utf16.End(); AuUInt cpOffset {}; for (AuUInt i = 0; i < codepoints; i++) { auto c = utf32Array.get()[i]; if (c <= 0xFFFFU) { if ((out + 2) > end) { break; } *(out++) = c; cpOffset += 2; } else { c -= 0x10000U; auto high = AuUInt16(0xD800U | ((c >> 10) & 0x3FFU)); auto low = AuUInt16(0xDC00U | (c & 0x3FFU)); if ((out + 4) > end) { break; } *(out++) = high; cpOffset += 2; *(out++) = low; cpOffset += 2; } } return {pair2.first, cpOffset}; } static int Count16(const void *base, AuUInt32 length, bool bytes = false) { AuUInt32 i {}, cps {}; for (; i < length; ) { auto next = GetLenUC2CodePoint(((const AuUInt8 *)base) + i, length - i); if (next == 0) { return bytes ? i : cps; } if (i + next > length) { return bytes ? i : cps; } i += next; cps++; } return bytes ? i : cps; } }