221 lines
5.7 KiB
C++
221 lines
5.7 KiB
C++
/***
|
|
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
|
|
|
File: AuUTF8.hpp
|
|
Date: 2021-10-31
|
|
Author: Reece
|
|
***/
|
|
#pragma once
|
|
|
|
namespace Aurora::Locale::Encoding::UTF8
|
|
{
|
|
/**
 * Encodes a single code point as a UTF-8 style byte sequence (1 to 6 bytes,
 * i.e. including the legacy 5 and 6 byte forms) and appends it at `result`.
 *
 * @param cp      value to encode; anything >= 0x80000000 is rejected
 * @param result  output cursor; advanced past the bytes that were written
 * @param counter running byte total; increased by the sequence length on success
 * @param max     upper limit for `counter`; the write is refused if it would be exceeded
 * @return true when the sequence was written; false when `cp` is out of range
 *         or `counter` plus the sequence length would exceed `max`
 *         (in which case `result` and `counter` are left untouched)
 */
static bool WriteCp(AuUInt32 cp, char *&result, AuUInt32 &counter, AuUInt32 max)
{
    // Step 1: pick the sequence length and the marker bits of the lead byte
    AuUInt32 uLength {};
    AuUInt32 uLeadMark {};

    if (cp < 0x80)
    {
        uLength   = 1;
        uLeadMark = 0x00;
    }
    else if (cp < 0x800)
    {
        uLength   = 2;
        uLeadMark = 0xc0;
    }
    else if (cp < 0x10000)
    {
        uLength   = 3;
        uLeadMark = 0xe0;
    }
    else if (cp < 0x200000)
    {
        uLength   = 4;
        uLeadMark = 0xf0;
    }
    else if (cp < 0x4000000)
    {
        uLength   = 5;
        uLeadMark = 0xf8;
    }
    else if (cp < 0x80000000)
    {
        uLength   = 6;
        uLeadMark = 0xfc;
    }
    else
    {
        return false;
    }

    // Step 2: make sure the caller's budget can take the whole sequence
    if (counter + uLength > max)
    {
        return false;
    }

    counter += uLength;

    // Step 3: lead byte carries the topmost payload bits plus the length marker...
    AuUInt32 uShift = 6 * (uLength - 1);
    *(result++) = static_cast<AuUInt8>((cp >> uShift) | uLeadMark);

    // ...followed by one 10xxxxxx continuation byte per remaining 6-bit group
    while (uShift != 0)
    {
        uShift -= 6;
        *(result++) = static_cast<AuUInt8>(((cp >> uShift) & 0x3f) | 0x80);
    }

    return true;
}
|
|
|
|
/**
 * Decodes UTF-8 style sequences (1 to 6 bytes) from [it, end) into 32-bit values.
 *
 * When `cp` is non-null on entry, each decoded value is stored through `cp`
 * and decoding stops once `cp` reaches `cpEnd`. When `cp` is null on entry,
 * nothing is stored and `cpEnd` is ignored, but `cp` is still incremented once
 * per decoded value.
 *
 * @param cp    output cursor (or null, see above); advanced once per decoded value
 * @param cpEnd end of the output range; only consulted when `cp` was non-null
 * @param it    input cursor; advanced past every fully decoded sequence
 * @param end   end of the input range
 * @return false if `it` is null, a lead byte is not a valid lead (continuation
 *         byte or more than 6 leading one-bits), the sequence is truncated by
 *         `end`, or a continuation byte is malformed; `it` then still points at
 *         the start of the offending sequence. true otherwise, including when
 *         decoding stopped because the output range is full.
 */
static bool ReadUtf8ByteString(AuUInt32 *&cp, AuUInt32 *cpEnd, const char *&it, const char *end)
{
    if (it == nullptr)
    {
        return false;
    }

    // Captured once: the mode must not flip after cp has been advanced
    const bool bStore = (cp != nullptr);

    for (;;)
    {
        if (!(it < end))
        {
            break; // input exhausted
        }

        if (bStore && !(cp < cpEnd))
        {
            break; // output range full
        }

        const AuUInt8 uLead = static_cast<AuUInt8>(*it);
        AuUInt32 uDecoded = 0;

        if (uLead <= 0x7FU)
        {
            // Single byte sequence
            uDecoded = uLead;
            ++it;
        }
        else
        {
            // A lead byte must start with at least two one-bits
            if ((uLead & 0xC0U) != 0xC0U)
            {
                return false;
            }

            // The number of leading one-bits is the sequence length
            AuUInt32 uSeqLen = 0;
            for (AuUInt32 uBit = 0x80U; (uLead & uBit) != 0; uBit >>= 1)
            {
                ++uSeqLen;
            }

            if (uSeqLen > 6)
            {
                return false;
            }

            // The whole sequence has to be available
            if (AuUInt(end - it) < AuUInt(uSeqLen))
            {
                return false;
            }

            // Payload bits of the lead byte sit below the length marker and its terminating zero
            uDecoded = uLead & (0xFFU >> (uSeqLen + 1));

            for (AuUInt32 i = 1; i < uSeqLen; ++i)
            {
                const AuUInt8 uTrail = static_cast<AuUInt8>(it[i]);

                // Continuation bytes must look like 10xxxxxx
                if ((uTrail & 0xC0U) != 0x80U)
                {
                    return false;
                }

                uDecoded = (uDecoded << 6) | (uTrail & 0x3FU);
            }

            it += uSeqLen;
        }

        if (bStore)
        {
            *cp = uDecoded;
        }

        // Advanced in both modes
        ++cp;
    }

    return true;
}
|
|
|
|
/**
 * Returns how many bytes are needed to encode `cp`, using the same size
 * classes as WriteCp (1 to 6 bytes, i.e. including the legacy 5 and 6 byte forms).
 *
 * @param cp value to measure
 * @return the sequence length in bytes, or 0 when `cp` is >= 0x80000000 and
 *         therefore cannot be represented in at most 6 bytes
 */
static AuUInt32 CountU8Overhead(AuUInt32 cp)
{
    if (cp < 0x80)
        return 1;
    else if (cp < 0x800)
        return 2;
    else if (cp < 0x10000)
        return 3;
    else if (cp < 0x200000)
        return 4;
    else if (cp < 0x4000000) // 5 bytes carry 26 payload bits; this bound must match WriteCp
        return 5; // disabled/reserved range (was legal)
    else if (cp < 0x80000000)
        return 6; // disabled/reserved range
    else
        return 0; // Some libraries accept 7 byte sequences; this code deliberately stops at 6, so anything larger is unencodable
}
|
|
|
|
/**
 * Reports the length of the UTF-8 style sequence that begins at the first byte
 * of `utf8`, judged from that lead byte alone (continuation bytes are not
 * inspected).
 *
 * @param utf8 view whose first byte is treated as a lead byte
 * @return the sequence length (1 to 6), or 0 when the view is empty, the first
 *         byte is not a valid lead byte (a 10xxxxxx continuation byte, 0xFE or
 *         0xFF), or the sequence length exceeds `utf8.length`
 */
static AuUInt32 IterateUTF8(const Memory::MemoryViewRead &utf8)
{
    const char *pItr = utf8.Begin<char>();

    // Nothing to look at; dereferencing pItr would read out of bounds
    if (!pItr || utf8.length == 0)
    {
        return 0;
    }

    // Work on the unsigned byte value so the masks below don't depend on the signedness of char
    const unsigned int uLead = static_cast<unsigned char>(*pItr);
    AuUInt32 nby = 0;

    if ((uLead & 0x80) == 0x00)
    {
        nby = 1; // 0xxxxxxx
    }
    else if ((uLead & 0xE0) == 0xC0)
    {
        nby = 2; // 110xxxxx
    }
    else if ((uLead & 0xF0) == 0xE0)
    {
        nby = 3; // 1110xxxx
    }
    else if ((uLead & 0xF8) == 0xF0)
    {
        nby = 4; // 11110xxx
    }
    else if ((uLead & 0xFC) == 0xF8)
    {
        nby = 5; // 111110xx (legacy form)
    }
    else if ((uLead & 0xFE) == 0xFC)
    {
        nby = 6; // 1111110x (legacy form)
    }
    else
    {
        // Continuation byte (10xxxxxx), 0xFE or 0xFF: never a valid lead byte
        return 0;
    }

    // The view must be able to hold the entire sequence
    if (nby > utf8.length)
    {
        return 0;
    }

    return nby;
}
|
|
} |