AuroraRuntime/Source/Locale/Encoding/UTFn/AuUTF8.hpp
2022-08-13 06:03:24 +01:00

221 lines
5.7 KiB
C++

/***
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: AuUTF8.hpp
Date: 2021-10-31
Author: Reece
***/
#pragma once
namespace Aurora::Locale::Encoding::UTF8
{
/**
 * Encodes one code point as UTF-8 and appends it to an output buffer.
 *
 * Besides the 1-4 byte forms, the legacy 5- and 6-byte forms are emitted for
 * values up to 0x7FFFFFFF. No further validation is performed: surrogates and
 * values above U+10FFFF are encoded as-is.
 *
 * @param cp      code point to encode
 * @param result  write cursor; advanced past the bytes written on success
 * @param counter number of bytes already accounted for in the buffer;
 *                increased by the encoded length on success
 * @param max     capacity limit that `counter` must not exceed
 * @return true when the sequence was written; false when `cp` is 0x80000000 or
 *         greater, or when the sequence does not fit. On failure neither
 *         `result` nor `counter` is modified.
 */
static bool WriteCp(AuUInt32 cp, char *&result, AuUInt32 &counter, AuUInt32 max)
{
    AuUInt32 length   = 0; // total bytes in the encoded sequence
    AuUInt32 leadMark = 0; // length marker OR'd into the first byte

    if (cp < 0x80)
    {
        length   = 1;
        leadMark = 0x00;
    }
    else if (cp < 0x800)
    {
        length   = 2;
        leadMark = 0xC0;
    }
    else if (cp < 0x10000)
    {
        length   = 3;
        leadMark = 0xE0;
    }
    else if (cp < 0x200000)
    {
        length   = 4;
        leadMark = 0xF0;
    }
    else if (cp < 0x4000000)
    {
        length   = 5;
        leadMark = 0xF8;
    }
    else if (cp < 0x80000000)
    {
        length   = 6;
        leadMark = 0xFC;
    }
    else
    {
        return false;
    }

    // Compare against the remaining space rather than `counter + length`, which
    // would wrap around for counters close to the 32-bit limit.
    if ((counter > max) || ((max - counter) < length))
    {
        return false;
    }

    counter += length;

    // The lead byte carries the topmost payload bits; every following byte
    // carries the next six bits behind a 10xxxxxx marker.
    AuUInt32 shift = 6 * (length - 1);
    *(result++) = static_cast<char>(static_cast<AuUInt8>((cp >> shift) | leadMark));
    while (shift != 0)
    {
        shift -= 6;
        *(result++) = static_cast<char>(static_cast<AuUInt8>(((cp >> shift) & 0x3F) | 0x80));
    }

    return true;
}
/**
 * Decodes consecutive UTF-8 sequences from [it, end) into code points.
 *
 * Sequences of up to six bytes are accepted. Only the lead-byte length marker
 * and the 10xxxxxx continuation markers are checked.
 *
 * @param cp     output cursor. When non-null, each decoded code point is
 *               stored through it; when null, nothing is stored. In both cases
 *               the cursor is incremented once per decoded code point.
 * @param cpEnd  end of the output range; only consulted when `cp` was non-null
 *               on entry
 * @param it     input cursor; advanced past every fully decoded sequence
 * @param end    end of the input range
 * @return false when `it` is null, or when a malformed or truncated sequence
 *         is hit (both cursors then stay at the last successfully decoded
 *         position); true when decoding stopped because the input was consumed
 *         or the output range was filled.
 */
static bool ReadUtf8ByteString(AuUInt32 *&cp, AuUInt32 *cpEnd, const char *&it, const char *end)
{
    if (it == nullptr)
    {
        return false;
    }

    // Latched once up front: a null output cursor selects the no-store mode.
    const bool bStore = (cp != nullptr);

    for (;;)
    {
        if (it >= end)
        {
            break; // nothing left to consume
        }

        if (bStore && (cp >= cpEnd))
        {
            break; // output range is full
        }

        const AuUInt8 lead = static_cast<AuUInt8>(*it);
        AuUInt32 decoded = 0;

        if (lead < 0x80U)
        {
            // Single-byte sequence: the byte is the code point.
            decoded = lead;
            it += 1;
        }
        else
        {
            // A lead byte must begin with at least two set bits; 10xxxxxx is a
            // stray continuation byte.
            if ((lead & 0xC0U) != 0xC0U)
            {
                return false;
            }

            // The number of leading one bits is the sequence length.
            AuUInt32 seqLen = 0;
            AuUInt8 probe = lead;
            while ((probe & 0x80U) != 0)
            {
                probe = static_cast<AuUInt8>(probe << 1);
                ++seqLen;
            }

            if (seqLen > 6)
            {
                return false;
            }

            if (AuUInt(end - it) < AuUInt(seqLen))
            {
                return false; // sequence runs past the end of the input
            }

            // Strip the length marker from the lead byte, then append six
            // payload bits per continuation byte.
            decoded = lead & (AuUInt8(0xFFU) >> (seqLen + 1));
            for (AuUInt32 idx = 1; idx != seqLen; ++idx)
            {
                const AuUInt8 tail = static_cast<AuUInt8>(it[idx]);
                if ((tail & 0xC0U) != 0x80U)
                {
                    return false;
                }
                decoded = (decoded << 6) | (tail & 0x3FU);
            }

            it += seqLen;
        }

        if (bStore)
        {
            *cp = decoded;
        }
        ++cp;
    }

    return true;
}
/**
 * Returns how many bytes the UTF-8 encoding of a code point occupies.
 *
 * @param cp code point to measure
 * @return 1 to 6 for values below 0x80000000, using the same thresholds as
 *         WriteCp; 0 for values that cannot be encoded in six bytes.
 */
static AuUInt32 CountU8Overhead(AuUInt32 cp)
{
    if (cp < 0x80)
    {
        return 1;
    }
    else if (cp < 0x800)
    {
        return 2;
    }
    else if (cp < 0x10000)
    {
        return 3;
    }
    else if (cp < 0x200000)
    {
        return 4;
    }
    else if (cp < 0x4000000) // 26 payload bits; the previous 0x400000 bound misreported part of this range as 6 bytes
    {
        return 5; // disabled/reserved range (was legal)
    }
    else if (cp < 0x80000000)
    {
        return 6; // disabled/reserved range
    }
    else
    {
        // Some libraries accept 7-byte sequences; this one stops at 6 and
        // reports anything larger as not encodable.
        return 0;
    }
}
/**
 * Reports the byte length of the UTF-8 sequence that starts at the first byte
 * of a memory view.
 *
 * Only the lead byte is inspected; continuation bytes are not validated.
 *
 * @param utf8 view whose first byte is treated as a UTF-8 lead byte
 * @return the sequence length (1 to 6) implied by the lead byte, or 0 when the
 *         view is empty, the first byte is not a valid lead byte (a
 *         continuation byte, 0xFE or 0xFF), or the view is shorter than the
 *         sequence.
 */
static AuUInt32 IterateUTF8(const Memory::MemoryViewRead &utf8)
{
    // An empty view has no lead byte to look at.
    if (utf8.length == 0)
    {
        return 0;
    }

    const char *pItr = utf8.Begin<char>();
    if (!pItr)
    {
        return 0;
    }

    // Work on an unsigned copy so the masks below are independent of the
    // signedness of `char`.
    const unsigned int lead = static_cast<unsigned char>(*pItr);

    AuUInt32 nby = 0;
    if ((lead & 0x80) == 0x00)
    {
        nby = 1; // 0xxxxxxx
    }
    else if ((lead & 0xE0) == 0xC0)
    {
        nby = 2; // 110xxxxx
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        nby = 3; // 1110xxxx
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        nby = 4; // 11110xxx
    }
    else if ((lead & 0xFC) == 0xF8)
    {
        nby = 5; // 111110xx
    }
    else if ((lead & 0xFE) == 0xFC)
    {
        nby = 6; // 1111110x
    }
    else
    {
        // 10xxxxxx continuation bytes, 0xFE and 0xFF cannot start a sequence.
        return 0;
    }

    if (nby > utf8.length)
    {
        return 0;
    }

    return nby;
}
}