263 lines
6.5 KiB
C++
263 lines
6.5 KiB
C++
/***
|
|
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
|
|
|
File: AuUTF16.hpp
|
|
Date: 2021-10-31
|
|
Author: Reece
|
|
***/
|
|
#pragma once
|
|
|
|
#include "AuUTF8.hpp"
|
|
|
|
namespace Aurora::Locale::Encoding::UTF16
|
|
{
|
|
static void SwapU16(void *base, AuUInt32 count)
|
|
{
|
|
count *= 2;
|
|
for (AuUInt32 i = 0; i < count; i += 2)
|
|
{
|
|
AuWriteU16BE(base, i, AuReadU16LE(base, i));
|
|
}
|
|
}
|
|
|
|
// Inclusive bounds of the UTF-16 high (leading) surrogate range
static constexpr AuUInt16 kHighSurrogateStart = 0xD800;
static constexpr AuUInt16 kHighSurrogateEnd   = 0xDBFF;
|
|
static constexpr bool IsHighSurrogate(const AuUInt16 i)
|
|
{
|
|
return (i & AuUInt16(0xfffffc00)) == kHighSurrogateStart;
|
|
}
|
|
|
|
// Inclusive bounds of the UTF-16 low (trailing) surrogate range
static constexpr AuUInt16 kLowSurrogateStart = 0xDC00;
static constexpr AuUInt16 kLowSurrogateEnd   = 0xDFFF;
|
|
static constexpr bool IsLowSurrogate(const AuUInt16 i)
|
|
{
|
|
return (i & AuUInt16(0xfffffc00)) == kLowSurrogateStart;
|
|
}
|
|
|
|
static int GetLenUC2CodePointLE(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
// Check for at least one U16 word
|
|
if (len < 2)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
// Neeto, we found a codepoint in range
|
|
if (!IsHighSurrogate(AuReadU16LE(in, 0)))
|
|
{
|
|
return 2;
|
|
}
|
|
|
|
// Check we have enough stream overhead to consume 4 bytes
|
|
if (len < 4)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
return IsLowSurrogate(AuReadU16LE(in, 2)) ? 4 : 0;
|
|
}
|
|
|
|
static int GetLenUC2CodePointBE(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
// Check for at least one U16 word
|
|
if (len < 2)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
// Neeto, we found a codepoint in range
|
|
if (!IsHighSurrogate(AuReadU16BE(in, 0)))
|
|
{
|
|
return 2;
|
|
}
|
|
|
|
// Check we have enough stream overhead to consume 4 bytes
|
|
if (len < 4)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
return IsLowSurrogate(AuReadU16BE(in, 2)) ? 4 : 0;
|
|
}
|
|
|
|
// Decodes the code point at the start of `in`, reading 16-bit words as little-endian.
//
// codepoint: receives the decoded value; left untouched when 0 is returned
// in:        start of the encoded data
// len:       number of bytes available at `in`
//
// Returns the number of bytes consumed: 2 for a single word, 4 for a
// surrogate pair, or 0 when too few bytes are available or a high surrogate
// is not followed by a low surrogate.
static int GetLenUC2CodePoint(AuUInt32 &codepoint, const AuUInt8 *in, AuUInt32 len)
{
    // At least one whole 16-bit word is required before anything can be read
    if (len < 2)
    {
        return 0;
    }

    const auto lead = AuReadU16LE(in, 0);

    if (IsHighSurrogate(lead))
    {
        // A surrogate pair occupies four bytes in total
        if (len < 4)
        {
            return 0;
        }

        const auto trail = AuReadU16LE(in, 2);
        if (!IsLowSurrogate(trail))
        {
            return 0;
        }

        // Each surrogate carries ten payload bits; the combined value is offset by 0x10000
        const AuUInt32 upperBits = AuUInt32(lead & 0x3FFU) << 10;
        const AuUInt32 lowerBits = AuUInt32(trail & 0x3FFU);
        codepoint = (upperBits | lowerBits) + 0x10000;
        return 4;
    }

    // Not the start of a pair: the word itself is the code point
    codepoint = lead;
    return 2;
}
|
|
|
|
// Converts little-endian UTF-16 in `utf16` to UTF-8.
//
// utf16: encoded input; `length` is treated as a byte count
// utf8:  destination buffer. When this view evaluates to false nothing is
//        written and only the required UTF-8 size is accumulated.
//
// Returns {input bytes consumed, UTF-8 bytes written or required}. Decoding
// stops at the first position where GetLenUC2CodePoint returns 0, or when
// UTF8::WriteCp reports failure.
static AuStreamReadWrittenPair_t CPToUTF8(const AuMemoryViewRead &utf16, const AuMemoryViewWrite &utf8)
{
    AuUInt32 cp {};
    AuUInt nextOffset {};
    AuUInt totalOffset {};
    AuUInt32 writeOffset {};

    const AuUInt8 *pItr = utf16.Begin<AuUInt8>();
    auto pcUtf8Itr = utf8.Begin<char>();
    auto length = utf16.length;

    if (utf8)
    {
        while ((nextOffset = GetLenUC2CodePoint(cp, pItr, length)) != 0)
        {
            // Emit first and only then consume the input. A code point that
            // could not be written must not be counted as read, otherwise a
            // caller resuming from the returned offset would silently skip it.
            // NOTE(review): assumes WriteCp leaves writeOffset untouched on failure - confirm
            if (!UTF8::WriteCp(cp, pcUtf8Itr, writeOffset, utf8.length))
            {
                break;
            }

            length -= nextOffset;
            pItr += nextOffset;
            totalOffset += nextOffset;
        }
    }
    else
    {
        // Measuring pass: no destination, just add up the UTF-8 size of every code point
        while ((nextOffset = GetLenUC2CodePoint(cp, pItr, length)) != 0)
        {
            length -= nextOffset;
            pItr += nextOffset;
            totalOffset += nextOffset;

            writeOffset += UTF8::CountU8Overhead(cp);
        }
    }

    return {totalOffset, writeOffset};
}
|
|
|
|
// Converts UTF-8 in `utf8` to UTF-16 code units stored in native byte order.
//
// utf8:  encoded input
// utf16: destination buffer. When this view evaluates to false nothing is
//        written and only the required UTF-16 size is computed.
//
// Returns {UTF-8 bytes read, UTF-16 bytes written or required}, or an empty
// pair when the input decodes to nothing or the scratch allocation fails.
//
// NOTE(review): when the destination is too small the first member still
// reports the UTF-8 bytes read by the decoding pass, not only the bytes that
// correspond to the code points actually written.
static AuStreamReadWrittenPair_t UTF8ToCp(const AuMemoryViewRead &utf8, const AuMemoryViewWrite &utf16)
{
    // First pass without a destination, used to size the UTF-32 scratch buffer
    auto pair = ReadUTF8IntoUTF32ByteString(utf8, {});

    // Both counts must be non-zero. This has to be a logical test: a bitwise
    // AND of two unrelated counts (for example 1 byte read and 4 written) is
    // zero even though both are valid.
    if (!(pair.first && pair.second))
    {
        return {};
    }

    auto codepoints = pair.second / sizeof(AuUInt32);

    auto utf32Array = AuMakeSharedArray<AuUInt32>(codepoints);
    if (!utf32Array)
    {
        return {};
    }

    // Second pass decodes into the scratch buffer
    auto pair2 = ReadUTF8IntoUTF32ByteString(utf8, {utf32Array.get(), codepoints});

    if (!(pair2.first && pair2.second))
    {
        return {};
    }

    auto pCodepoints = utf32Array.get();

    if (!utf16)
    {
        // Measuring mode: two bytes per BMP code point, four per surrogate pair
        AuUInt cpOffset {};

        for (AuUInt i = 0; i < codepoints; i++)
        {
            cpOffset += (pCodepoints[i] <= 0xFFFFU) ? 2 : 4;
        }

        return {pair2.first, cpOffset};
    }

    auto out = utf16.Begin<AuUInt16>();
    auto end = utf16.End<AuUInt16>();

    AuUInt cpOffset {};
    for (AuUInt i = 0; i < codepoints; i++)
    {
        auto c = pCodepoints[i];

        if (c <= 0xFFFFU)
        {
            // `out` and `end` are AuUInt16 pointers, so the remaining space is
            // counted in code units: one unit is enough here
            if ((end - out) < 1)
            {
                break;
            }

            *(out++) = AuUInt16(c);
            cpOffset += 2;
        }
        else
        {
            c -= 0x10000U;
            auto high = AuUInt16(0xD800U | ((c >> 10) & 0x3FFU));
            auto low = AuUInt16(0xDC00U | (c & 0x3FFU));

            // A surrogate pair needs two code units; never emit half a pair
            if ((end - out) < 2)
            {
                break;
            }

            *(out++) = high;
            *(out++) = low;
            cpOffset += 4;
        }
    }

    return {pair2.first, cpOffset};
}
|
|
|
|
static int Count16(const void *base, AuUInt32 length, bool bytes = false, bool le = true)
|
|
{
|
|
AuUInt32 i {}, cps {};
|
|
|
|
for (; i < length; )
|
|
{
|
|
auto next = le ?
|
|
GetLenUC2CodePointLE(((const AuUInt8 *)base) + i, length - i) :
|
|
GetLenUC2CodePointBE(((const AuUInt8 *)base) + i, length - i);
|
|
if (next == 0)
|
|
{
|
|
return bytes ? i : cps;
|
|
}
|
|
|
|
if (i + next > length)
|
|
{
|
|
return bytes ? i : cps;
|
|
}
|
|
|
|
i += next;
|
|
cps++;
|
|
}
|
|
|
|
return bytes ? i : cps;
|
|
}
|
|
} |