AuroraRuntime/Source/Locale/Encoding/UTFn/AuUTF16.hpp

/***
    Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: AuUTF16.hpp
    Date: 2021-10-31
    Author: Reece
***/
#pragma once

#include "AuUTF8.hpp"

namespace Aurora::Locale::Encoding::UTF16
{
    static void SwapU16(void *base, AuUInt32 count)
    {
        count *= 2;
        for (AuUInt32 i = 0; i < count; i += 2)
        {
            AuWriteU16BE(base, i, AuReadU16LE(base, i));
        }
    }

    static const AuUInt16 kHighSurrogateStart = 0xd800;
    static const AuUInt16 kHighSurrogateEnd = 0xdbff;
    static constexpr bool IsHighSurrogate(const AuUInt16 i)
    {
        return (i & AuUInt16(0xfffffc00)) == kHighSurrogateStart;
    }

    static const AuUInt16 kLowSurrogateStart  = 0xdc00;
    static const AuUInt16 kLowSurrogateEnd = 0xdfff;
    static constexpr bool IsLowSurrogate(const AuUInt16 i)
    {
        return (i & AuUInt16(0xfffffc00)) == kLowSurrogateStart;
    }

    // Measures the UTF-16 code point at the start of a byte stream whose words
    // are read with AuReadU16LE.
    //
    // in  - first byte of the stream
    // len - number of readable bytes at `in`
    //
    // Returns:
    //   2 - the first word is not a high surrogate (an unpaired low surrogate is
    //       not rejected here and also yields 2)
    //   4 - a high surrogate immediately followed by a low surrogate
    //   0 - fewer than 2 bytes available, or a high surrogate that is truncated
    //       or not followed by a low surrogate
    static int GetLenUC2CodePointLE(const AuUInt8 *in, AuUInt32 len)
    {
        if (len >= 2)
        {
            const bool bLeadIsHigh = IsHighSurrogate(AuReadU16LE(in, 0));
            if (!bLeadIsHigh)
            {
                // Single-word code point
                return 2;
            }

            // A surrogate pair needs a second word, and it must be a low surrogate
            if ((len >= 4) && IsLowSurrogate(AuReadU16LE(in, 2)))
            {
                return 4;
            }
        }

        return 0;
    }

    // Measures the UTF-16 code point at the start of a byte stream whose words
    // are read with AuReadU16BE.
    //
    // in  - first byte of the stream
    // len - number of readable bytes at `in`
    //
    // Returns:
    //   2 - the first word is not a high surrogate (an unpaired low surrogate is
    //       not rejected here and also yields 2)
    //   4 - a high surrogate immediately followed by a low surrogate
    //   0 - fewer than 2 bytes available, or a high surrogate that is truncated
    //       or not followed by a low surrogate
    static int GetLenUC2CodePointBE(const AuUInt8 *in, AuUInt32 len)
    {
        if (len >= 2)
        {
            const bool bLeadIsHigh = IsHighSurrogate(AuReadU16BE(in, 0));
            if (!bLeadIsHigh)
            {
                // Single-word code point
                return 2;
            }

            // A surrogate pair needs a second word, and it must be a low surrogate
            if ((len >= 4) && IsLowSurrogate(AuReadU16BE(in, 2)))
            {
                return 4;
            }
        }

        return 0;
    }

    // Decodes the UTF-16 code point at the start of a byte stream whose words
    // are read with AuReadU16LE.
    //
    // codepoint - receives the decoded value; left untouched when 0 is returned
    // in        - first byte of the stream
    // len       - number of readable bytes at `in`
    //
    // Returns the number of bytes consumed: 2 for a single word (including an
    // unpaired low surrogate, which is passed through as-is), 4 for a surrogate
    // pair, or 0 when the stream is too short or a high surrogate is not followed
    // by a low surrogate.
    static int GetLenUC2CodePoint(AuUInt32 &codepoint, const AuUInt8 *in, AuUInt32 len)
    {
        if (len < 2)
        {
            return 0;
        }

        const auto lead = AuReadU16LE(in, 0);
        if (IsHighSurrogate(lead))
        {
            // The trailing word must both exist and be a low surrogate
            if (len < 4)
            {
                return 0;
            }

            const auto trail = AuReadU16LE(in, 2);
            if (!IsLowSurrogate(trail))
            {
                return 0;
            }

            // Each surrogate carries 10 payload bits; the pair encodes (cp - 0x10000)
            const AuUInt32 upperBits = AuUInt32(lead & 0x3FFU) << 10;
            const AuUInt32 lowerBits = AuUInt32(trail & 0x3FFU);
            codepoint = (upperBits | lowerBits) + 0x10000;
            return 4;
        }

        codepoint = lead;
        return 2;
    }

    // Converts UTF-16 (words read little-endian via GetLenUC2CodePoint) to UTF-8.
    //
    // utf16 - input bytes
    // utf8  - output buffer; when the view evaluates to false nothing is written
    //         and the function only measures the required output size
    //
    // Returns {input bytes consumed, output bytes written-or-required}.
    // Conversion stops at the first code point GetLenUC2CodePoint cannot decode,
    // or, in write mode, at the first code point UTF8::WriteCp refuses.
    static AuStreamReadWrittenPair_t CPToUTF8(const AuMemoryViewRead &utf16, const AuMemoryViewWrite &utf8)
    {
        AuUInt32 cp {};
        AuUInt totalOffset {};
        AuUInt32 writeOffset {};

        const AuUInt8 *pItr = utf16.Begin<AuUInt8>();
        auto pcUtf8Itr = utf8.Begin<char>();
        auto length = utf16.length;

        const bool bHasOutput = static_cast<bool>(utf8);

        for (;;)
        {
            const AuUInt nextOffset = GetLenUC2CodePoint(cp, pItr, length);
            if (!nextOffset)
            {
                break;
            }

            if (bHasOutput)
            {
                // Emit before committing the read: previously the input cursor was
                // advanced first, so a failed write still counted its code point as
                // consumed and callers resuming from the returned offset skipped it.
                // NOTE(review): presumes WriteCp returns false without writing when
                // the code point does not fit - confirm against AuUTF8.hpp.
                if (!UTF8::WriteCp(cp, pcUtf8Itr, writeOffset, utf8.length))
                {
                    break;
                }
            }
            else
            {
                // Size query only
                writeOffset += UTF8::CountU8Overhead(cp);
            }

            length -= nextOffset;
            pItr += nextOffset;
            totalOffset += nextOffset;
        }

        return {totalOffset, writeOffset};
    }

    // Converts UTF-8 to UTF-16 by way of a temporary UTF-32 array.
    //
    // utf8  - input bytes
    // utf16 - output buffer; when the view evaluates to false nothing is written
    //         and the function only measures the required output size
    //
    // Returns {pair2.first of the UTF-8 decode pass, UTF-16 bytes written-or-required},
    // or {} when either decode pass reports nothing read/written or the temporary
    // array cannot be allocated.
    //
    // Output words are stored through an AuUInt16 pointer, i.e. in host byte order.
    //
    // NOTE(review): when the output buffer is too small the loop stops early but
    // the first member of the result is still the full pair2.first value rather
    // than the amount of input matching the words actually written.
    static AuStreamReadWrittenPair_t UTF8ToCp(const AuMemoryViewRead &utf8, const AuMemoryViewWrite &utf16)
    {
        // Pass 1: decode with no destination to learn how much UTF-32 storage is needed
        auto pair = ReadUTF8IntoUTF32ByteString(utf8, {});

        // Both counts must be non-zero. A bitwise AND of the two values was used
        // here before, which rejected valid input whenever the counts shared no
        // set bits (e.g. 1 byte read, 4 bytes written).
        if (!pair.first || !pair.second)
        {
            return {};
        }

        auto codepoints = pair.second / sizeof(AuUInt32);

        auto utf32Array = AuMakeSharedArray<AuUInt32>(codepoints);
        if (!utf32Array)
        {
            return {};
        }

        // Pass 2: decode into the temporary array
        // NOTE(review): the view is built from (pointer, codepoints); confirm whether
        // AuMemoryViewWrite treats that second argument as elements or as bytes.
        auto pair2 = ReadUTF8IntoUTF32ByteString(utf8, {utf32Array.get(), codepoints});

        if (!pair2.first || !pair2.second)
        {
            return {};
        }

        // Never walk past what the second pass actually produced
        const auto decoded = pair2.second / sizeof(AuUInt32);
        if (decoded < codepoints)
        {
            codepoints = decoded;
        }

        const auto pCodepoints = utf32Array.get();
        AuUInt cpOffset {};

        if (!utf16)
        {
            // Size query: 2 bytes per BMP code point, 4 per surrogate pair
            for (AuUInt i = 0; i < codepoints; i++)
            {
                cpOffset += (pCodepoints[i] <= 0xFFFFU) ? 2 : 4;
            }

            return {pair2.first, cpOffset};
        }

        auto out = utf16.Begin<AuUInt16>();
        const auto end = utf16.End<AuUInt16>();

        for (AuUInt i = 0; i < codepoints; i++)
        {
            auto c = pCodepoints[i];

            if (c <= 0xFFFFU)
            {
                // `out` advances in 16-bit words, so one free word is enough. The
                // previous check demanded two, which made a buffer sized by the
                // size query above too small for its final code point.
                if ((out + 1) > end)
                {
                    break;
                }

                *(out++) = AuUInt16(c);
                cpOffset += 2;
            }
            else
            {
                // A surrogate pair occupies two words (previously four were demanded)
                if ((out + 2) > end)
                {
                    break;
                }

                c -= 0x10000U;
                const auto high = AuUInt16(0xD800U | ((c >> 10) & 0x3FFU));
                const auto low  = AuUInt16(0xDC00U | (c & 0x3FFU));

                *(out++) = high;
                *(out++) = low;
                cpOffset += 4;
            }
        }

        return {pair2.first, cpOffset};
    }

    static int Count16(const void *base, AuUInt32 length, bool bytes = false, bool le = true)
    {
        AuUInt32 i {}, cps {};

        for (; i < length; )
        {
            auto next = le ?
                        GetLenUC2CodePointLE(((const AuUInt8 *)base) + i, length - i) :
                        GetLenUC2CodePointBE(((const AuUInt8 *)base) + i, length - i);
            if (next == 0)
            {
                return bytes ? i : cps;
            }

            if (i + next > length)
            {
                return bytes ? i : cps;
            }

            i += next;
            cps++;
        }

        return bytes ? i : cps;
    }
}