// 242 lines, 7.8 KiB, C++
/***
|
|
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
|
|
|
File: Encoding.hpp
|
|
Date: 2021-8-18
|
|
Author: Reece
|
|
***/
|
|
#pragma once
|
|
|
|
namespace Aurora::Locale::Encoding
|
|
{
|
|
/// Declaration only; the definition is not visible in this header.
/// NOTE(review): judging by the signature, this converts `binaryLength` bytes at `binary` into `out`,
/// with `page` (default ECodePage::eUnsupported) selecting the code page - confirm against the definition.
AuStreamReadWrittenPair_t DecodeUTF8(void *binary, AuUInt32 binaryLength, AuString &out, ECodePage page = ECodePage::eUnsupported);
|
|
|
|
template <bool optimized = false>
|
|
struct TextStreamDecoderImpl
|
|
{
|
|
bool readHeader {};
|
|
ECodePage page = ECodePage::eUnsupported;
|
|
ECodePage defaultPage = ECodePage::eUnsupported;
|
|
EncoderAdapter state;
|
|
|
|
TextStreamDecoderImpl(ECodePage page = ECodePage::eSysUnk) : defaultPage(page) {}
|
|
|
|
using TypeIn_t = std::conditional_t<optimized, void *, const void *>;
|
|
using TypeCast_t = std::conditional_t<optimized, AuUInt8 *, const AuUInt8 *>;
|
|
|
|
|
|
static int GetLenSJISCodePoint(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
if (len == 0) return 0;
|
|
auto b = in[0];
|
|
if (b >= 0x80)
|
|
{
|
|
if (b <= 0xDF)
|
|
{
|
|
if (len < 2) return 0;
|
|
else return 2;
|
|
}
|
|
else if (b <= 0xEF)
|
|
{
|
|
if (len < 3) return 0;
|
|
else return 3;
|
|
}
|
|
else
|
|
{
|
|
if (len < 4) return 0;
|
|
else return 4;
|
|
}
|
|
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int GetLenSJISString(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
AuUInt32 i;
|
|
for (i = 0; i < len; )
|
|
{
|
|
auto next = GetLenSJISCodePoint(in + i, len - i);
|
|
if (next == 0) return i;
|
|
i += next;
|
|
}
|
|
return i;
|
|
}
|
|
|
|
static int GetLenGBKCodePoint(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
if (len == 0) return 0;
|
|
auto b = in[0];
|
|
if (b >= 0x80)
|
|
{
|
|
if (len < 2) return 0;
|
|
else return 2;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int GetLenGBKString(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
AuUInt32 i;
|
|
for (i = 0; i < len; )
|
|
{
|
|
auto next = GetLenGBKCodePoint(in + i, len - i);
|
|
if (next == 0) return i;
|
|
i += next;
|
|
}
|
|
return i;
|
|
}
|
|
|
|
#define AU_HIGH_SURROGATE_START 0xd800
|
|
#define AU_HIGH_SURROGATE_END 0xdbff
|
|
#define AU_LOW_SURROGATE_START 0xdc00
|
|
#define AU_LOW_SURROGATE_END 0xdfff
|
|
#define AU_IS_HIGH_SURROGATE(wch) (((wch) >= AU_HIGH_SURROGATE_START) && ((wch) <= AU_HIGH_SURROGATE_END))
|
|
#define AU_IS_LOW_SURROGATE(wch) (((wch) >= AU_LOW_SURROGATE_START) && ((wch) <= AU_LOW_SURROGATE_END))
|
|
|
|
static int GetLenUC2CodePoint(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
if (len < 2) return 0;
|
|
auto a = *reinterpret_cast<const AuUInt16 *>(in);
|
|
if (!AU_IS_HIGH_SURROGATE(a)) return 2;
|
|
|
|
if (len < 4) return 0;
|
|
auto b = *reinterpret_cast<const AuUInt16 *>(in + 2);
|
|
return AU_IS_LOW_SURROGATE(b) ? 4 : 0;
|
|
}
|
|
|
|
static int GetLenUC2String(const AuUInt8 *in, AuUInt32 len)
|
|
{
|
|
AuUInt32 i;
|
|
for (i = 0; i < len; )
|
|
{
|
|
auto next = GetLenUC2CodePoint(in + i, len - i);
|
|
if (next == 0) return i;
|
|
i += next;
|
|
}
|
|
return i;
|
|
}
|
|
|
|
AuStreamReadWrittenPair_t EncodeUTF8(TypeIn_t binary, AuUInt32 binaryLength, void *utf8, AuUInt32 utfLen)
|
|
{
|
|
int offset = 0;
|
|
|
|
if (!binary) return {};
|
|
if (!binaryLength) return {};
|
|
|
|
if (!std::exchange(readHeader, true))
|
|
{
|
|
if (page == ECodePage::eUnsupported)
|
|
{
|
|
auto header = DecodeBOM(binary, binaryLength);
|
|
if (header)
|
|
{
|
|
page = header->first;
|
|
offset = header->second;
|
|
}
|
|
else
|
|
{
|
|
if ((defaultPage != ECodePage::eUnsupported))
|
|
{
|
|
page = defaultPage;
|
|
}
|
|
else
|
|
{
|
|
page = GetInternalCodePage();
|
|
}
|
|
}
|
|
state.Init(page, true);
|
|
}
|
|
}
|
|
|
|
if (page == ECodePage::eUnsupported)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
binaryLength = binaryLength - offset;
|
|
|
|
if (page == ECodePage::eGBK)
|
|
{
|
|
binaryLength = GetLenGBKString(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength);
|
|
}
|
|
else if (page == ECodePage::eSJIS)
|
|
{
|
|
binaryLength = GetLenSJISString(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength);
|
|
}
|
|
else if ((page == ECodePage::eUTF16) || (page == ECodePage::eUTF16BE))
|
|
{
|
|
binaryLength = GetLenUC2String(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength);
|
|
}
|
|
else if ((page == ECodePage::eUTF32) || (page == ECodePage::eUTF32BE))
|
|
{
|
|
binaryLength &= ~3;
|
|
}
|
|
|
|
auto real = state.CPToUTF8(reinterpret_cast<TypeCast_t>(binary) + offset, binaryLength, utf8, utfLen);
|
|
return AuMakePair(real.first + offset, real.second);
|
|
}
|
|
|
|
AuStreamReadWrittenPair_t EncodeUTF8(TypeIn_t binary, AuUInt32 binaryLength, AuString &out)
|
|
{
|
|
auto preemptive = EncodeUTF8(binary, binaryLength, nullptr, 0);
|
|
if (!AuTryResize(out, preemptive.second)) return {};
|
|
auto main = EncodeUTF8(binary, preemptive.second, out.data(), out.size());
|
|
if (main.second == 0) return {};
|
|
if (!AuTryResize(out, main.second)) return {};
|
|
out.shrink_to_fit();
|
|
return main;
|
|
}
|
|
};
|
|
|
|
struct TextStreamEncoder
|
|
{
|
|
ECodePage page;
|
|
EncoderAdapter state;
|
|
|
|
TextStreamEncoder(ECodePage page = ECodePage::eUTF32) : page(page)
|
|
{
|
|
state.Init(page, false);
|
|
}
|
|
|
|
AuStreamReadWrittenPair_t DecodeUTF8(const void *utf8In, AuUInt32 length, AuString &out)
|
|
{
|
|
if (page == ECodePage::eUnsupported)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
if (!utf8In)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
if (!length)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
auto preemptive = state.UTF8ToCp(utf8In, length, nullptr, 0);
|
|
auto written = state.UTF8ToCp(utf8In, preemptive.second, out.data(), AuUInt32(out.size()));
|
|
out.resize(written.second);
|
|
out.shrink_to_fit();
|
|
return written;
|
|
}
|
|
|
|
AuStreamReadWrittenPair_t DecodeUTF8(const void *utf8In, AuUInt32 utf8Length, void *binaryOut, AuUInt32 binaryLength)
|
|
{
|
|
if (page == ECodePage::eUnsupported)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
return state.UTF8ToCp(utf8In, utf8Length, binaryOut, binaryLength);
|
|
}
|
|
};
|
|
|
|
/// 'TextStreamProcessor', a stateful wrapper around DecodeUTF8
|
|
/// Using this you can handle a stateful, optionally bom prefixed, stream
|
|
/// Initialization (ie: setting a default codepage) is optional
|
|
using TextStreamProcessor = typename TextStreamDecoderImpl<false>;
|
|
using TextStreamProcessorInternal = typename TextStreamDecoderImpl<true>;
|
|
} |