/*** Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved. File: AuUTF8.hpp Date: 2021-10-31 Author: Reece ***/ #pragma once namespace Aurora::Locale::Encoding::UTF8 { static bool WriteCp(AuUInt32 cp, char *&result, AuUInt32 &counter, AuUInt32 max) { if (cp < 0x80) { if (counter + 1 > max) { return false; } counter += 1; *(result++) = static_cast(cp); } else if (cp < 0x800) { if (counter + 2 > max) { return false; } counter += 2; *(result++) = static_cast((cp >> 6) | 0xc0); *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp < 0x10000) { if (counter + 3 > max) { return false; } counter += 3; *(result++) = static_cast((cp >> 12) | 0xe0); *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp < 0x200000) { if (counter + 4 > max) { return false; } counter += 4; *(result++) = static_cast((cp >> 18) | 0xf0); *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp < 0x4000000) { if (counter + 5 > max) { return false; } counter += 5; *(result++) = static_cast((cp >> 24) | 0xf8); *(result++) = static_cast(((cp >> 18) & 0x3f) | 0x80); *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp < 0x80000000) { if (counter + 6 > max) { return false; } counter += 6; *(result++) = static_cast((cp >> 30) | 0xfc); *(result++) = static_cast(((cp >> 24) & 0x3f) | 0x80); *(result++) = static_cast(((cp >> 18) & 0x3f) | 0x80); *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); *(result++) = static_cast((cp & 0x3f) | 0x80); } else { return false; } return true; } static bool ReadUtf8ByteString(AuUInt32 *&cp, AuUInt32 *cpEnd, const char *&it, const char *end) { if (!it) { return false; } bool bUpdate = cp; while ((it < end) && // can consooooome ((bUpdate && cp < cpEnd) || !bUpdate)) { AuUInt32 c = 0; if (*it <= 0x7FU) { c = *it; ++it; } else { if ((*it & 0xC0U) != 0xC0U) { return false; } AuUInt32 nby = 0; for (AuUInt8 b = *it; (b & 0x80U) != 0; b <<= 1, ++nby) { } if (nby > 6) { return false; } if (AuUInt(end - it) < AuUInt(nby)) { return false; } c = *it & (AuUInt8(0xFFU) >> (nby + 1)); for (AuUInt32 i = 1; i < nby; ++i) { if ((it[i] & 0xC0U) != 0x80U) { return false; } c = (c << 6) | (it[i] & 0x3FU); } it += nby; } if (bUpdate) { *cp = c; } cp++; } return true; } static AuUInt32 CountU8Overhead(AuUInt32 cp) { if (cp < 0x80) return 1; else if (cp < 0x800) return 2; else if (cp < 0x10000) return 3; else if (cp < 0x200000) return 4; else if (cp < 0x400000) return 5; // diasbled/reserved range (was legal) else if (cp < 0x80000000) return 6; // disabled/reserved range else return 0; // I've seen 7 char support in some libs, i thought we should only go up to 6? i haven't seen a coeffient of 0x80000000 x 200h [-1] used in any of them } static AuUInt32 IterateUTF8(const Memory::MemoryViewRead &utf8) { const char *pItr = utf8.Begin(); AuUInt32 nby = 0; auto ch = *pItr; unsigned int result = (ch & 0xF0); if ((ch & 0x80) == 0) { nby = 1; } else if ((ch & 0xE0) == 0xC0) { nby = 2; } else if (result == 0xE0) { nby = 3; } else if (result == 0xF0) { if ((ch & 0x08) == 0x08) { nby = 5; } else if ((ch & 0x0c) == 0x0c) { nby = 6; } else { nby = 4; } } else { return 0; } if (nby > utf8.length) { return 0; } return nby; } }