AuroraRuntime/Source/Locale/Encoding/Encoding.hpp
2021-09-06 14:08:37 +01:00

191 lines
6.3 KiB
C++

/***
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: Encoding.hpp
Date: 2021-8-18
Author: Reece
***/
#pragma once
namespace Aurora::Locale::Encoding
{
/// Free-function decoder; only declared here, the definition lives elsewhere in the project.
/// Takes a raw buffer (`binary`, `binaryLength` bytes), an output string `out`, and an optional
/// code page that defaults to ECodePage::eUnsupported. Returns an AuStreamReadWrittenPair_t.
/// NOTE(review): presumably the result is {bytes consumed, bytes written} and eUnsupported
/// requests auto-detection - confirm against the definition.
AuStreamReadWrittenPair_t DecodeUTF8(void *binary, AuUInt32 binaryLength, AuString &out, ECodePage page = ECodePage::eUnsupported);
/// Stateful decoder that turns a chunked, optionally BOM-prefixed byte stream into UTF-8.
///
/// The first call to EncodeUTF8 resolves the code page (BOM, then the constructor supplied
/// default, then GetInternalCodePage()) and initializes the adapter. Every call trims the input
/// so that only whole code units / code points are handed to the adapter; the caller is expected
/// to re-submit the unconsumed tail together with the next chunk.
///
/// @tparam optimized  selects mutable (`void *`) instead of const (`const void *`) input pointers.
template <bool optimized = false>
struct TextStreamDecoderImpl
{
    /// True once the first chunk has been inspected for a byte order mark.
    bool readHeader {};
    /// Code page in effect for the stream; eUnsupported until the first EncodeUTF8 call resolves it.
    ECodePage page = ECodePage::eUnsupported;
    /// Fallback code page used when the first chunk carries no byte order mark.
    ECodePage defaultPage = ECodePage::eUnsupported;
    EncoderAdapter state;

    /// @param page  default code page; note that it is stored in `defaultPage`, not in `page`.
    TextStreamDecoderImpl(ECodePage page = ECodePage::eSysUnk) : defaultPage(page) {}

    using TypeIn_t = std::conditional_t<optimized, void *, const void *>;
    using TypeCast_t = std::conditional_t<optimized, AuUInt8 *, const AuUInt8 *>;

    /// Length in bytes of the Shift-JIS character starting at `in`.
    /// @return 1 or 2, or 0 when `len` is zero or the buffer ends inside a double-byte character.
    static int GetLenSJISCodePoint(const AuUInt8 *in, AuUInt32 len)
    {
        if (len == 0) return 0;

        const auto lead = in[0];

        // Shift-JIS double-byte lead bytes are 0x81..0x9F and 0xE0..0xFC. Everything else,
        // including the half-width katakana range 0xA1..0xDF, is a single byte.
        const bool isDoubleByteLead = ((lead >= 0x81) && (lead <= 0x9F)) ||
                                      ((lead >= 0xE0) && (lead <= 0xFC));
        if (!isDoubleByteLead) return 1;

        return (len < 2) ? 0 : 2;
    }

    /// Number of leading bytes of `in` that form whole Shift-JIS characters.
    static int GetLenSJISString(const AuUInt8 *in, AuUInt32 len)
    {
        AuUInt32 i = 0;
        while (i < len)
        {
            const auto next = GetLenSJISCodePoint(in + i, len - i);
            if (next == 0) break; // trailing character is incomplete; stop in front of it
            i += next;
        }
        return static_cast<int>(i);
    }

    /// Length in bytes of the GBK character starting at `in`.
    /// @return 1 or 2, or 0 when `len` is zero or the buffer ends inside a double-byte character.
    static int GetLenGBKCodePoint(const AuUInt8 *in, AuUInt32 len)
    {
        if (len == 0) return 0;

        const auto lead = in[0];

        // GBK double-byte lead bytes are 0x81..0xFE; 0x80 and 0xFF never start a pair.
        const bool isDoubleByteLead = (lead >= 0x81) && (lead <= 0xFE);
        if (!isDoubleByteLead) return 1;

        return (len < 2) ? 0 : 2;
    }

    /// Number of leading bytes of `in` that form whole GBK characters.
    static int GetLenGBKString(const AuUInt8 *in, AuUInt32 len)
    {
        AuUInt32 i = 0;
        while (i < len)
        {
            const auto next = GetLenGBKCodePoint(in + i, len - i);
            if (next == 0) break; // trailing character is incomplete; stop in front of it
            i += next;
        }
        return static_cast<int>(i);
    }

    /// Converts one chunk of the stream into UTF-8, writing into a caller supplied buffer.
    ///
    /// @param binary        input bytes in the stream's code page
    /// @param binaryLength  number of bytes available at `binary`
    /// @param utf8          destination buffer
    /// @param utfLen        capacity of `utf8` in bytes
    /// @return the adapter's result pair with the BOM length added to `first`;
    ///         an empty pair when no code page could be resolved.
    AuStreamReadWrittenPair_t EncodeUTF8(TypeIn_t binary, AuUInt32 binaryLength, void *utf8, AuUInt32 utfLen)
    {
        int offset = 0;

        if (!std::exchange(readHeader, true))
        {
            // NOTE(review): the adapter is only initialized here, i.e. when `page` was still
            // unresolved on the first call. Confirm nobody pre-assigns `page` externally.
            if (page == ECodePage::eUnsupported)
            {
                auto header = DecodeBOM(binary, binaryLength);
                if (header)
                {
                    page = header->first;
                    offset = header->second; // skip the byte order mark itself
                }
                else if (defaultPage != ECodePage::eUnsupported)
                {
                    page = defaultPage;
                }
                else
                {
                    page = GetInternalCodePage();
                }

                state.Init(page, true);
            }
        }

        if (page == ECodePage::eUnsupported)
        {
            return {};
        }

        binaryLength = binaryLength - offset;

        // Never hand the adapter a partial character: trim to the last complete unit.
        if (page == ECodePage::eGBK)
        {
            binaryLength = GetLenGBKString(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength);
        }
        else if (page == ECodePage::eSJIS)
        {
            binaryLength = GetLenSJISString(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength);
        }
        else if ((page == ECodePage::eUTF16) || (page == ECodePage::eUTF16BE))
        {
            binaryLength &= ~AuUInt32(1);
        }
        else if ((page == ECodePage::eUTF32) || (page == ECodePage::eUTF32BE))
        {
            // The original compared eUTF32 against itself, so big-endian UTF-32 was never aligned.
            binaryLength &= ~AuUInt32(3);
        }

        auto real = state.CPToUTF8(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength, utf8, utfLen);
        return AuMakePair(real.first + offset, real.second);
    }

    /// Converts one chunk of the stream into UTF-8, replacing the contents of `out`.
    ///
    /// @param binary        input bytes in the stream's code page
    /// @param binaryLength  number of bytes available at `binary`
    /// @param out           receives exactly the bytes written; left empty when nothing was written
    /// @return the result of the buffer overload, or an empty pair when allocation fails or
    ///         nothing was written.
    AuStreamReadWrittenPair_t EncodeUTF8(TypeIn_t binary, AuUInt32 binaryLength, AuString &out)
    {
        // Worst-case growth of the scratch buffer relative to the input length. A 2-byte UTF-16
        // unit needs at most 3 UTF-8 bytes, so doubling is enough there; other pages get 4x.
        int worstCaseGrowth = 4;
        if ((page == ECodePage::eUTF16) || (page == ECodePage::eUTF16BE))
        {
            worstCaseGrowth = 2;
        }

        if (!AuTryResize(out, binaryLength * worstCaseGrowth)) return {};

        auto result = EncodeUTF8(binary, binaryLength, out.data(), AuUInt32(out.size()));
        if (result.second == 0)
        {
            // Do not hand the oversized scratch buffer back to the caller.
            // NOTE(review): this path also drops `result.first` (e.g. a consumed BOM) - confirm intended.
            out.clear();
            return {};
        }

        if (!AuTryResize(out, result.second)) return {};
        out.shrink_to_fit();
        return result;
    }
};
/// Stateful encoder that converts UTF-8 input into the code page chosen at construction.
struct TextStreamEncoder
{
    /// Target code page of the encoded output.
    ECodePage page;
    EncoderAdapter state;

    /// @param page  target code page; the adapter is initialized for it immediately.
    TextStreamEncoder(ECodePage page = ECodePage::eUTF32) : page(page)
    {
        state.Init(page, false);
    }

    /// Encodes `length` bytes of UTF-8 from `utf8In`, replacing the contents of `out`.
    /// `out` is sized to four times the input, then cut back to the bytes actually written.
    /// @return the adapter's result pair, or an empty pair when the code page is unsupported.
    AuStreamReadWrittenPair_t DecodeUTF8(const void *utf8In, AuUInt32 length, AuString &out)
    {
        if (page != ECodePage::eUnsupported)
        {
            out.resize(length * 4);

            auto result = state.UTF8ToCp(utf8In, length, out.data(), AuUInt32(out.size()));

            out.resize(result.second);
            out.shrink_to_fit();
            return result;
        }

        return {};
    }

    /// Encodes `utf8Length` bytes of UTF-8 from `utf8In` into the caller supplied buffer
    /// `binaryOut` of capacity `binaryLength`.
    /// @return the adapter's result pair, or an empty pair when the code page is unsupported.
    AuStreamReadWrittenPair_t DecodeUTF8(const void *utf8In, AuUInt32 utf8Length, void *binaryOut, AuUInt32 binaryLength)
    {
        if (page != ECodePage::eUnsupported)
        {
            return state.UTF8ToCp(utf8In, utf8Length, binaryOut, binaryLength);
        }

        return {};
    }
};
/// 'TextStreamProcessor', a stateful wrapper around DecodeUTF8
/// Using this you can handle a stateful, optionally bom prefixed, stream
/// Initialization (ie: setting a default codepage) is optional
using TextStreamProcessor = typename TextStreamDecoderImpl<false>;
using TextStreamProcessorInternal = typename TextStreamDecoderImpl<true>;
}