AuROXTL/Include/auROXTL/auStringUtils.hpp

/***
    Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: auStringUtils.hpp
    Date: 2022-2-1
    File: AuroraUtils.hpp
    File: auROXTLUtils.hpp
    Date: 2021-6-9
    Author: Reece
    Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
             AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)

    Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
                AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
    Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
                AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
                AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
                AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
                AuCodepointsContains,
                AuCodepointsReplaceAll, AuCodepointsSplitString (views),
                AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
                AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase,
                AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView, AuCodepointsReverseIterateSubStrSuffixView

    For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
***/
#pragma once

// offset in bytes
/* using CodepointByteOffset_t = decltype(AuROString::npos); */

// offset in codepoints
/* using CodepointOffset_t = AuUInt; */

// Upper bound, in bytes, on a single encoded codepoint. The decoders in this file that derive the
// sequence length by counting the leading one-bits of the lead byte reject anything longer than this.
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8)
    static const AuUInt8 kAuCodepointUTF8MaxBytes = 4;
#elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8)
    static const AuUInt8 kAuCodepointUTF8MaxBytes = 7;
#else
// default:
    static const AuUInt8 kAuCodepointUTF8MaxBytes = 6;
#endif

// none of these are defined by default
// Asking for AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW implicitly switches on
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL as well, unless the user already defined it.
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
#if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
#define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL
#endif
#endif

// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences
//                                                                                    (We have use cases in the ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting)
//                                                                                    (Enable this if you're boring)
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL           | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl)
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8            | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase is dumbed down to a memcmp, assuming the entire remaining block of memory is to be compared;
//                                                                                           ...ForEach will break early with false (usually implies a user break early condition);
//                                                                                           ...Transform will return an empty container

// Returns true when `subpattern` occurs anywhere inside `value` (plain byte-wise search).
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{
    const auto uMatchOffset = value.find(subpattern);
    return AuROString::npos != uMatchOffset;
}

// Returns true when the last ending.size() bytes of `value` are identical to `ending`.
// An `ending` longer than `value` can never match.
static auline constexpr bool AuEndsWith(AuROString const &value, AuROString const &ending)
{
    const bool bFits = !(ending.size() > value.size());

    // Compare back to front; the length guard keeps the walk over `value` in bounds.
    return bFits &&
           std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}

// Returns true when `value` begins with the bytes of `starting`.
static auline constexpr bool AuStartsWith(AuROString const &value, AuROString const &starting)
{
#if defined(AU_STRING_IS_TINYUTF_EXPERIMENT)
    const bool bHasPrefix = value.starts_with(starting);
#else
    // A reverse search that may start no later than index 0 can only ever report index 0.
    const bool bHasPrefix = (0 == value.rfind(starting, 0));
#endif
    return bHasPrefix;
}

// Runs `op` over every single-byte (ASCII) codepoint of a UTF-8 string; multi-byte sequences are
// copied to the output verbatim.
//
//  op - callable invoked with the ASCII char; its result is stored at the same index of the output
//  in - UTF-8 input
//
// Returns a string that is always in.length() bytes long. Processing stops at the first stray
// continuation byte, 0xFF lead byte or truncated sequence; output bytes from that index onwards are
// never written and keep whatever ret.resize() filled them with.
template <class T>
static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
{
    AuString ret;
    auto uLength = in.length();

    ret.resize(uLength);

    const char *pItr = in.data();
    const char *pEnd = pItr + uLength;

    AuUInt32 uCounter {};
    while (pItr != pEnd)
    {
        AuUInt32     nby {};
        auto         ch = *pItr;
        unsigned int result = (ch & 0xF0);

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        if (nby == 1)
        {
            ret[uCounter] = op(in[uCounter]);
        }
        else
        {
            AuMemcpy(&ret[uCounter], &in[uCounter], nby);
        }

        uCounter += nby;
        pItr += nby;
    }

    return ret;
}

// Counts the codepoints of a UTF-8 string by hopping from lead byte to lead byte.
// Continuation bytes are not validated. Counting stops (returning what was counted so far) at the
// first stray continuation byte, 0xFF lead byte or sequence that would run past the end of `in`.
static auline constexpr CodepointOffset_t AuCodepointsCount(const AuROString &in)
{
    CodepointOffset_t uCounter {};
    auto uLength = in.length();

    const char *pItr = in.data();
    const char *pEnd = pItr + uLength;

    while (pItr != pEnd)
    {
        AuUInt32     nby {};
        auto         ch = *pItr;
        unsigned int result = (ch & 0xF0);

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return uCounter;
}

// Reports how many bytes the sequence starting at in[0] claims to occupy, judged from the lead byte
// alone. Neither the presence nor the validity of the following bytes is checked.
// Returns 0 for an empty view, a stray continuation byte (10xxxxxx) or a 0xFF lead byte.
static auline constexpr CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{
    if (in.length())
    {
        auto         ch = in[0];
        unsigned int result = (ch & 0xF0);

        if ((ch & 0x80) == 0)
        {
            return 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            return 2;
        }
        else if (result == 0xE0)
        {
            return 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return 0;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return 7;
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return 6;
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return 5;
            }
            else
            {
                // 11110xxx
                return 4;
            }
        }
    }

    return 0;
}

// Returns true only for the ASCII letters 'A'-'Z' and 'a'-'z'.
//
// The range test relies on unsigned wrap-around: folding the case bit (0x20) in and subtracting 'a'
// yields 0..25 for letters, while anything below 'a' wraps to a huge value. The arithmetic therefore
// has to be carried out in `unsigned int`; an `unsigned char` operand would be promoted to `int`,
// the difference would go negative, and every byte below 'a' would pass the `< 26` test.
static auline constexpr bool AuIsAlpha(char c)
{
    return ((static_cast<unsigned int>(static_cast<unsigned char>(c)) | 0x20u) - static_cast<unsigned int>('a')) < 26u;
}

// Converts an ASCII upper-case letter to lower case; every other byte is returned unchanged.
// Only 'A'-'Z' may have the case bit (0x20) set: applying it to anything else would rewrite control
// characters and '@' into unrelated printable characters.
static auline constexpr char AuToLower(char c)
{
    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
}

// Converts an ASCII lower-case letter to upper case; every other byte is returned unchanged.
// Only 'a'-'z' may have the case bit (0x20) cleared: clearing it on anything else would turn spaces,
// digits and punctuation into control characters.
static auline constexpr char AuToUpper(char c)
{
    return (c >= 'a' && c <= 'z') ? static_cast<char>(c & ~0x20) : c;
}

// Lower-cases the ASCII letters of a UTF-8 string; multi-byte sequences pass through untouched.
static auline AuString AuCodepointsToLower(const AuROString &in)
{
    // AuToLower is an overloaded name: bind the char -> char flavour explicitly.
    char (*const pfnLowerAscii)(char) = &AuToLower;
    return AuCodepointsTransformASCIIOp(pfnLowerAscii, in);
}

// Upper-cases the ASCII letters of a UTF-8 string; multi-byte sequences pass through untouched.
static auline AuString AuCodepointsToUpper(const AuROString &in)
{
    // AuToUpper is an overloaded name: bind the char -> char flavour explicitly.
    char (*const pfnUpperAscii)(char) = &AuToUpper;
    return AuCodepointsTransformASCIIOp(pfnUpperAscii, in);
}

// String overload of AuToLower: forwards to the UTF-8 aware AuCodepointsToLower.
static auline AuString AuToLower(const AuROString &in)
{
    AuString lowered = AuCodepointsToLower(in);
    return lowered;
}

// String overload of AuToUpper: forwards to the UTF-8 aware AuCodepointsToUpper.
static auline AuString AuToUpper(const AuROString &in)
{
    AuString uppered = AuCodepointsToUpper(in);
    return uppered;
}

// Decodes the first codepoint of `in`.
// Returns an empty optional when `in` is empty, does not start with a lead byte, claims more than
// kAuCodepointUTF8MaxBytes bytes, is truncated, or contains a byte that is not a continuation byte
// where one is required. Bytes after the first sequence are ignored.
static constexpr AuOptional<AuUInt32> AuCodepointsDecodeOne(const AuROString &in)
{
    if (in.empty())
    {
        return {};
    }

    const char *const pFirst = in.data();
    const char *const pLimit = pFirst + in.length();

    AuUInt32 uCodepoint = *pFirst;

    if (uCodepoint <= 0x7FU)
    {
        // single byte (ASCII)
        return uCodepoint;
    }

    // everything else has to open with 11xxxxxx
    if ((*pFirst & 0xC0U) != 0xC0U)
    {
        return {};
    }

    // the amount of leading one-bits in the lead byte is the length of the sequence
    AuUInt32 uSeqLength {};
    AuUInt8  uShifted = *pFirst;
    while ((uShifted & 0x80U) != 0)
    {
        uShifted <<= 1;
        ++uSeqLength;
    }

    if (uSeqLength > kAuCodepointUTF8MaxBytes)
    {
    #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
        AU_THROW_CONST_STRING("Illegal UTF8");
    #endif
        return {};
    }

    if (AuUInt(pLimit - pFirst) < AuUInt(uSeqLength))
    {
        return {};
    }

    // payload bits of the lead byte: whatever is left below the length marker
    uCodepoint = *pFirst & (AuUInt8(0xFFU) >> (uSeqLength + 1));

    for (AuUInt32 uIndex = 1; uIndex < uSeqLength; ++uIndex)
    {
        const char chTrail = pFirst[uIndex];

        if ((chTrail & 0xC0U) != 0x80U)
        {
            return {};
        }

        uCodepoint = (uCodepoint << 6) | (chTrail & 0x3FU);
    }

    return uCodepoint;
}

// Decodes every codepoint of a UTF-8 string.
// Any malformed, truncated or over-long (more than kAuCodepointUTF8MaxBytes bytes) sequence discards
// what was decoded so far and yields an empty list.
static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
{
    AuList<AuUInt32> ret;

    if (in.empty())
    {
        return ret;
    }

    const auto uByteLength = in.length();

    // one entry per byte is the worst case (pure ASCII)
    ret.reserve(uByteLength);

    const char *pCursor      = in.data();
    const char *const pLimit = pCursor + uByteLength;

    while (pCursor < pLimit)
    {
        AuUInt32 uCodepoint = *pCursor;

        if (uCodepoint > 0x7FU)
        {
            // everything that is not ASCII has to open with 11xxxxxx
            if ((*pCursor & 0xC0U) != 0xC0U)
            {
                return {};
            }

            // the amount of leading one-bits in the lead byte is the length of the sequence
            AuUInt32 uSeqLength {};
            AuUInt8  uShifted = *pCursor;
            while ((uShifted & 0x80U) != 0)
            {
                uShifted <<= 1;
                ++uSeqLength;
            }

            if (uSeqLength > kAuCodepointUTF8MaxBytes)
            {
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return {};
            }

            if (AuUInt(pLimit - pCursor) < AuUInt(uSeqLength))
            {
                return {};
            }

            // payload bits of the lead byte: whatever is left below the length marker
            uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLength + 1));

            for (AuUInt32 uIndex = 1; uIndex < uSeqLength; ++uIndex)
            {
                const char chTrail = pCursor[uIndex];

                if ((chTrail & 0xC0U) != 0x80U)
                {
                    return {};
                }

                uCodepoint = (uCodepoint << 6) | (chTrail & 0x3FU);
            }

            pCursor += uSeqLength;
        }
        else
        {
            ++pCursor;
        }

        ret.push_back(uCodepoint);
    }

    return ret;
}

// Appends the UTF-8 encoding of `uCodepoint` to `out`.
// Sequences of 1 to 6 bytes are produced, which covers every value below 0x80000000.
// Values of 0x80000000 and above append nothing.
static void AuCodepointsEncodeInto(AuUInt32 uCodepoint, AuString &out)
{
    AuUInt32 uByteCount {};
    AuUInt32 uLeadMarker {};

    // pick the sequence length and the matching length marker of the lead byte
    if (uCodepoint < 0x80)
    {
        uByteCount  = 1;
        uLeadMarker = 0x00;
    }
    else if (uCodepoint < 0x800)
    {
        uByteCount  = 2;
        uLeadMarker = 0xc0;
    }
    else if (uCodepoint < 0x10000)
    {
        uByteCount  = 3;
        uLeadMarker = 0xe0;
    }
    else if (uCodepoint < 0x200000)
    {
        uByteCount  = 4;
        uLeadMarker = 0xf0;
    }
    else if (uCodepoint < 0x4000000)
    {
        uByteCount  = 5;
        uLeadMarker = 0xf8;
    }
    else if (uCodepoint < 0x80000000)
    {
        uByteCount  = 6;
        uLeadMarker = 0xfc;
    }
    else
    {
        return;
    }

    const auto uBase = out.size();
    out.resize(uBase + uByteCount);

    // lead byte: the top payload bits underneath the length marker
    AuUInt32 uShift = 6 * (uByteCount - 1);
    out[uBase] = static_cast<AuUInt8>((uCodepoint >> uShift) | uLeadMarker);

    // continuation bytes: six payload bits each, most significant group first
    for (AuUInt32 uIndex = 1; uIndex < uByteCount; ++uIndex)
    {
        uShift -= 6;
        out[uBase + uIndex] = static_cast<AuUInt8>(((uCodepoint >> uShift) & 0x3f) | 0x80);
    }
}

// Decodes `in` codepoint by codepoint, maps each one through `op` and re-encodes the result.
//
//  op - callable invoked with the decoded AuUInt32; its result is encoded into the output
//  in - UTF-8 input
//
// Any malformed, truncated or over-long (more than kAuCodepointUTF8MaxBytes bytes) sequence discards
// what was produced so far and yields an empty string.
template <class T>
static AuString AuCodepointsTransform(T op, const AuROString &in)
{
    AuString ret;

    if (in.empty())
    {
        return ret;
    }

    const auto uByteLength = in.length();

    ret.reserve(uByteLength);

    const char *pCursor      = in.data();
    const char *const pLimit = pCursor + uByteLength;

    while (pCursor < pLimit)
    {
        AuUInt32 uCodepoint = *pCursor;

        if (uCodepoint > 0x7FU)
        {
            // everything that is not ASCII has to open with 11xxxxxx
            if ((*pCursor & 0xC0U) != 0xC0U)
            {
                return {};
            }

            // the amount of leading one-bits in the lead byte is the length of the sequence
            AuUInt32 uSeqLength {};
            AuUInt8  uShifted = *pCursor;
            while ((uShifted & 0x80U) != 0)
            {
                uShifted <<= 1;
                ++uSeqLength;
            }

            if (uSeqLength > kAuCodepointUTF8MaxBytes)
            {
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return {};
            }

            if (AuUInt(pLimit - pCursor) < AuUInt(uSeqLength))
            {
                return {};
            }

            // payload bits of the lead byte: whatever is left below the length marker
            uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLength + 1));

            for (AuUInt32 uIndex = 1; uIndex < uSeqLength; ++uIndex)
            {
                const char chTrail = pCursor[uIndex];

                if ((chTrail & 0xC0U) != 0x80U)
                {
                    return {};
                }

                uCodepoint = (uCodepoint << 6) | (chTrail & 0x3FU);
            }

            pCursor += uSeqLength;
        }
        else
        {
            ++pCursor;
        }

        uCodepoint = op(uCodepoint);

        AuCodepointsEncodeInto(uCodepoint, ret);
    }

    return ret;
}

template <class T>
static bool AuCodepointsForEach(T op, const AuROString &in)
{
    if (in.empty())
    {
        return true;
    }

    auto uLength = in.length();

    const char *pItr = in.data();
    const char *pEnd = pItr + uLength;

    while (pItr < pEnd)
    {
        AuUInt32 c {};

        if ((c = *pItr) <= 0x7FU)
        {
            ++pItr;
        }
        else
        {
            AuUInt32 nby {};

            if ((*pItr & 0xC0U) != 0xC0U)
            {
                return false;
            }

            for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
            {
            }

            if (nby > kAuCodepointUTF8MaxBytes)
            {
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                return false;
            }

            if (AuUInt(pEnd - pItr) < AuUInt(nby))
            {
                return false;
            }

            c = *pItr & (AuUInt8(0xFFU) >> (nby + 1));

            for (AuUInt32 i = 1; i < nby; ++i)
            {
                if ((pItr[i] & 0xC0U) != 0x80U)
                {
                    return {};
                }

                c = (c << 6) | (pItr[i] & 0x3FU);
            }

            pItr += nby;
        }

        if constexpr (AuIsSame_v<AuResultOf_t<T, AuUInt32>, bool>)
        {
            if (!op(c))
            {
                return false;
            }
        }
        else
        {
            op(c);
        }
    }

    return true;
}

static bool AuCodepointsIsEqualIgnoreCase(const AuROString &inA,
                                          const AuROString &inB)
{
    if (inA.size() !=
        inB.size())
    {
        return false;
    }

    if (inA.empty())
    {
        return true;
    }

    const char *pItr  = inA.data();
    const char *pItr2 = inB.data();
    const char *pEnd  = pItr + inA.length();

    while (pItr < pEnd)
    {
        AuUInt32 c {};

        if ((c = *pItr) <= 0x7FU)
        {
            if (AuToLower(c) != AuToLower(*pItr2))
            {
                return false;
            }
            ++pItr2;
            ++pItr;
        }
        else
        {
            AuUInt32 nby {};

            if ((*pItr & 0xC0U) != 0xC0U)
            {
                return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
            }

            for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
            {
            }

            if (nby > kAuCodepointUTF8MaxBytes)
            {
                return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
            }

            if (AuUInt(pEnd - pItr) < AuUInt(nby))
            {
                return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
            }

            if (AuMemcmp(pItr, pItr2, nby) != 0)
            {
                return false;
            }

            pItr  += nby;
            pItr2 += nby;
        }
    }

    return true;
}

// Translates a codepoint index into the byte offset at which that codepoint starts.
// Returns AuROString::npos when the index is not reached: `in` holds too few codepoints (an index
// equal to the codepoint count included), or the walk stopped at a stray continuation byte, 0xFF
// lead byte or truncated sequence first.
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
                                                                        CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};
    auto uLength = in.length();

    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + uLength;

    while (pItr != pEnd)
    {
        AuUInt32     nby {};
        auto         ch = *pItr;
        unsigned int result = (ch & 0xF0);

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                nby = 7;
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}

// Returns the byte offset just past the end of the codepoint at index `uCodepointIndex`, i.e. the
// number of bytes taken up by the first uCodepointIndex + 1 codepoints of `in`.
// Returns AuROString::npos when that codepoint is not reached: `in` holds too few codepoints, or the
// walk stopped at a stray continuation byte, 0xFF lead byte or truncated sequence first.
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
                                                                        CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};
    auto uLength = in.length();

    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + uLength;

    while (pItr != pEnd)
    {
        AuUInt32     nby {};
        auto         ch = *pItr;
        unsigned int result = (ch & 0xF0);

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        // step over the codepoint first: the result is the offset of its end, not of its start
        pItr += nby;

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        uCounter++;
    }

    return AuROString::npos;
}

// Searches `in` for `find`, only considering matches that begin on a codepoint boundary.
//
//  in             - UTF-8 haystack
//  find           - byte sequence to look for
//  uStartPosition - matches beginning before this byte offset are ignored; the walk itself still
//                   begins at offset 0 so that only genuine codepoint boundaries are tested
//
// Returns the byte offset of the first match, or AuROString::npos when there is none or the walk
// stopped early at a stray continuation byte, 0xFF lead byte or truncated sequence.
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
                                                        const AuROString &find,
                                                        CodepointByteOffset_t uStartPosition = {})
{
    auto uLength       = in.length();
    auto uFindLength   = find.length();

    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + uLength;

    while (pItr != pEnd)
    {
        AuUInt32              nby {};
        auto                  ch = *pItr;
        unsigned int          result = (ch & 0xF0);
        CodepointByteOffset_t uByteOffset(pItr - pStart);

        if (uByteOffset >= uStartPosition)
        {
            // compare the needle against an equally long (or shorter, near the end) window
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uByteOffset;
            }
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}

// Searches `in` for `find`, beginning the codepoint walk directly at byte offset `uStartPosition`.
// The walk does not start at offset 0, so nothing here verifies that `uStartPosition` is a codepoint
// boundary; NOTE(review): callers are presumably expected to guarantee that - confirm.
//
// Returns the byte offset (relative to the start of `in`) of the first match, or AuROString::npos
// when there is none, when `uStartPosition` lies beyond the end of `in`, or when the walk stopped
// early at a stray continuation byte, 0xFF lead byte or truncated sequence.
static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
                                                                        const AuROString &find,
                                                                        CodepointByteOffset_t uStartPosition)
{
    auto uLength       = in.length();
    auto uFindLength   = find.length();

    // A start beyond the end would make the `pItr != pEnd` loop run off the buffer.
    if (uStartPosition > uLength)
    {
        return AuROString::npos;
    }

    const char *pStart = in.data();
    const char *pItr   = pStart + uStartPosition;
    const char *pEnd   = pStart + uLength;

    while (pItr != pEnd)
    {
        AuUInt32              nby {};
        auto                  ch = *pItr;
        unsigned int          result = (ch & 0xF0);
        CodepointByteOffset_t uByteOffset(pItr - pStart);

        {
            // compare the needle against an equally long (or shorter, near the end) window
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uByteOffset;
            }
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}

// Returns the byte offset at which the codepoint immediately before codepoint index
// `uStartPosition` starts (i.e. the start of codepoint uStartPosition - 1).
// Returns AuROString::npos when `uStartPosition` is 0, when `in` holds fewer than `uStartPosition`
// codepoints, or when the walk stopped early at a stray continuation byte, 0xFF lead byte or
// truncated sequence.
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(const AuROString &in,
                                                                                         CodepointOffset_t uStartPosition = {})
{
    AuUInt uCounter    = 0;
    auto uLength       = in.length();

    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + uLength;

    if (uStartPosition == 0)
    {
        return AuROString::npos;
    }

    while (pItr != pEnd)
    {
        AuUInt32     nby {};
        auto         ch = *pItr;
        unsigned int result = (ch & 0xF0);

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // Longest prefix first: every lead byte in 0xF8..0xFF also has bit 0x08 set, so testing
            // 0x08 before the others would classify all of them as 5 byte sequences.
            if ((ch & 0x0f) == 0x0f)
            {
                // 11111111: Not even logical
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // 11111110: Illegal UTF8
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // 1111110x: Special UTF8
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // 111110xx: Historic UTF8
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                // 11110xxx
                nby = 4;
            }
        }
        else
        {
            // 10xxxxxx: continuation byte where a lead byte was expected
            break;
        }

        // Truncated sequence: compare distances instead of forming a pointer past the buffer.
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        uCounter++;

        // pItr still points at the start of the codepoint that was just counted
        if (uCounter == uStartPosition)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        pItr += nby;
    }

    return AuROString::npos;
}

/**
 * Finds the byte offset at which the codepoint ending immediately before `uStartPosition` begins.
 *
 * @param in             UTF-8 view to inspect
 * @param uStartPosition byte offset one past the codepoint of interest (e.g. in.Size() for the last one)
 * @return byte offset of that codepoint's first byte, or AuROString::npos when
 *         - uStartPosition is zero (nothing precedes it),
 *         - uStartPosition lies beyond the end of the view, or
 *         - walking backwards over continuation bytes does not end on a lead byte (11xxxxxx).
 *
 * Note: only the byte classes are inspected; the length announced by the lead byte is not
 *       checked against the number of continuation bytes that were skipped.
 */
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
                                                                                             CodepointByteOffset_t uStartPosition = {})
{
    // Validate before doing any pointer arithmetic: forming (data() - 1) or reading past the
    // view is undefined behaviour and is rejected outright during constant evaluation.
    if (uStartPosition == 0 ||
        uStartPosition > in.Size())
    {
        return AuROString::npos;
    }

    const char * pStart = in.data();
    const char * pItr   = pStart + (uStartPosition - 1);

    // Single byte (ASCII) codepoint: the previous byte is the whole codepoint.
    if ((*pItr & 0x80) == 0)
    {
        return uStartPosition - 1;
    }

    // Step backwards over continuation bytes (10xxxxxx), never moving before the first byte.
    while (pItr != pStart &&
           (*pItr & 0xC0U) == 0x80U)
    {
        pItr--;
    }

    // We must have landed on a multi-byte lead (11xxxxxx); anything else is a malformed tail.
    if ((*pItr & 0xC0U) != 0xC0U)
    {
        return AuROString::npos;
    }

    return CodepointByteOffset_t(pItr - pStart);
}

/**
 * Translates a byte offset into a codepoint offset.
 *
 * @param in            UTF-8 view
 * @param uBytePosition byte offset into `in`
 * @return the result of AuCodepointsCount over the first `uBytePosition` bytes of `in`
 */
static constexpr CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                                   CodepointByteOffset_t uBytePosition)
{
    const auto prefixView = in.substr(0, uBytePosition);
    return AuCodepointsCount(prefixView);
}

/**
 * Searches `in` for `find`, only testing positions that start a codepoint, and reports the
 * match position as a codepoint index.
 *
 * @param in             UTF-8 view to search
 * @param find           byte sequence to look for
 * @param uStartPosition codepoint index at which matching may begin; earlier codepoints are skipped
 * @return number of codepoints preceding the first match, or AuROString::npos when
 *         - no match is found,
 *         - a byte that cannot start a sequence is encountered (continuation byte or 0xFF), or
 *         - a sequence announces more bytes than remain in the view.
 *
 * Lead byte lengths follow the historic UTF-8 table (up to 6 bytes, plus the 7 byte 0xFE form);
 * the AURORA_UTF8_* macros can turn the non-modern forms into exceptions.
 */
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         const AuROString &find,
                                                         CodepointOffset_t uStartPosition = {})
{
    AuUInt uCounter {};
    auto uLength       = in.length();
    auto uFindLength   = find.length();

    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + uLength;

    while (pItr != pEnd)
    {
        AuUInt32     nby {};
        auto         ch = *pItr;
        unsigned int result = (ch & 0xF0);

        // Only attempt a match once the requested number of codepoints has been skipped.
        if (uCounter >= uStartPosition)
        {
            // Compare against at most find.length() bytes of what is left.
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uCounter;
            }
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if (result == 0xE0)
        {
            nby = 3;
        }
        else if (result == 0xF0)
        {
            // The low nibble selects the 4..7 byte forms. Test the most specific mask first:
            // checking bit 0x08 first would also swallow 0xFC..0xFF and mis-size them as 5 bytes.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
            #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
            #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
            #endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Stray continuation byte: cannot resynchronise, give up.
            break;
        }

        // Truncated sequence at the end of the view.
        if (pItr + nby > pEnd)
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}

static bool AuCodepointsStartsWithEqualIgnoreCase(const AuROString &inA,
                                                  const AuROString &inB)
{
    if (inA.size() < inB.size())
    {
        return false;
    }

    if (inA.empty() ||
        inB.empty())
    {
        return true;
    }

    const char *pItr  = inA.data();
    const char *pItr2 = inB.data();
    const char *pEnd  = pItr + inB.length();

    while (pItr < pEnd)
    {
        AuUInt32 c {};

        if ((c = *pItr) <= 0x7FU)
        {
            if (AuToLower(c) != AuToLower(*pItr2))
            {
                return false;
            }
            ++pItr2;
            ++pItr;
        }
        else
        {
            AuUInt32 nby {};

            if ((*pItr & 0xC0U) != 0xC0U)
            {
                return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
            }

            for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
            {
            }

            if (nby > kAuCodepointUTF8MaxBytes)
            {
                return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
            }

            if (AuUInt(pEnd - pItr) < AuUInt(nby))
            {
                return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
            }

            if (AuMemcmp(pItr, pItr2, nby) != 0)
            {
                return false;
            }

            pItr  += nby;
            pItr2 += nby;
        }
    }

    return true;
}

static bool AuCodepointsEndsWithEqualIgnoreCase(const AuROString &inA,
                                                const AuROString &inB)
{
    if (inA.size() < inB.size())
    {
        return false;
    }

    if (inA.empty() ||
        inB.empty())
    {
        return true;
    }

    AuUInt uOffset  { inA.Size() };
    AuUInt uOffset2 { inB.Size() };

    while (uOffset && uOffset2)
    {
        AuUInt32 c {};

        auto uLastValid = AuCodepointsFindPreviousValidByteOffsetFromByteOffset(inB, uOffset2);
        if (uLastValid == AuROString::npos)
        {
            return false;
        }

        auto nby      = uOffset2 - uLastValid;

        if (AuSInt(uOffset) - AuSInt(nby) < 0)
        {
            return false;
        }

        auto pItr  = inA.data() + uOffset  - nby;
        auto pItr2 = inB.data() + uOffset2 - nby;

        if ((c = *pItr) <= 0x7FU)
        {
            if (AuToLower(c) != AuToLower(*pItr2))
            {
                return false;
            }

            uOffset  -= 1;
            uOffset2 -= 1;
        }
        else
        {
            if (AuMemcmp(pItr, pItr2, nby) != 0)
            {
                return false;
            }

            uOffset  -= nby;
            uOffset2 -= nby;
        }
    }

    return true;
}

/**
 * Measures the final codepoint of `string`.
 *
 * @return the number of bytes between the start of the last codepoint and the end of the view,
 *         or AuROString::npos when AuCodepointsFindPreviousValidByteOffsetFromByteOffset
 *         cannot locate it.
 */
static constexpr AuUInt AuCodepointsReverseIterate(const AuROString &string)
{
    const auto uLeadOffset = AuCodepointsFindPreviousValidByteOffsetFromByteOffset(string, string.Size());

    if (uLeadOffset != AuROString::npos)
    {
        return string.Size() - uLeadOffset;
    }

    return AuROString::npos;
}

/**
 * Drops the final codepoint from `string`.
 *
 * @return `string.RemoveSuffix(n)` where n is the byte length reported by
 *         AuCodepointsReverseIterate, or an empty view when that lookup yields AuROString::npos.
 */
static constexpr AuROString AuCodepointsReverseIterateSubStrPrefixView(const AuROString &string)
{
    const auto uTailBytes = AuCodepointsReverseIterate(string);

    if (uTailBytes != AuROString::npos)
    {
        return string.RemoveSuffix(uTailBytes);
    }

    return AuROString {};
}

/**
 * Isolates the final codepoint of `string`.
 *
 * @return `string.RemovePrefix(offset)` where offset is the byte position at which the last
 *         codepoint starts, or an empty view when that position cannot be located.
 */
static constexpr AuROString AuCodepointsReverseIterateSubStrSuffixView(const AuROString &string)
{
    const auto uLeadOffset = AuCodepointsFindPreviousValidByteOffsetFromByteOffset(string, string.Size());

    if (uLeadOffset != AuROString::npos)
    {
        return string.RemovePrefix(uLeadOffset);
    }

    return AuROString {};
}

/**
 * Reports whether `subpattern` can be found in `value`.
 *
 * @param uStartPosition codepoint index forwarded to AuCodepointsFindCodepointOffset
 * @return true when that search returns anything other than AuROString::npos
 */
static bool AuCodepointsContains(const AuROString &value, const AuROString &subpattern, CodepointOffset_t uStartPosition = {})
{
    const auto uFoundAt = AuCodepointsFindCodepointOffset(value, subpattern, uStartPosition);
    return uFoundAt != AuROString::npos;
}

/**
 * Replaces, in place, every occurrence of `from` in `str` with `to`.
 * Matches are located with AuCodepointsFindByteOffsetUnsafe; scanning resumes after the
 * inserted text so a replacement is never rescanned.
 *
 * @param str  string to modify
 * @param from pattern to remove; an empty pattern leaves `str` untouched
 * @param to   text written in place of each match
 * @return `str`
 */
static AuString &AuCodepointsReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty pattern is a zero-width match that never consumes input, so the loop below
    // could not make progress on it (presumably the search reports a match at the cursor).
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        uStartPosition += to.length(); // continue after the text we just wrote
    }
    return str;
}

/**
 * Splits `str` on every occurrence of `delim`, returning views into `str`.
 * Delimiters are located with AuCodepointsFindByteOffsetUnsafe.
 *
 * @param str          text to split; the returned views reference its storage
 * @param delim        separator; when empty there is nothing to split on and `str` is
 *                     returned as a single token
 * @param bIgnoreEmpty true: zero-length tokens are dropped.
 *                     false: zero-length tokens are kept, including a trailing one when `str`
 *                     ends with `delim`.
 * @return the tokens in order of appearance
 */
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
                                                  const AuROString &delim,
                                                  bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    // A zero-width separator would never advance the cursor below.
    if (delim.empty())
    {
        if (!bIgnoreEmpty || !str.empty())
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    tokens.reserve(str.size() / 16);
    do
    {
        uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // no further separator: the remainder is the final token
            uPos = str.length();
        }
        auto token = str.substr(uPrev, uPos - uPrev);
        // keep the token unless it is empty AND the caller asked for empties to be dropped
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(token);
        }
        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());

    // The loop stopped on a separator that ends the input: account for the empty token after it.
    if (!bIgnoreEmpty && uPos < str.length())
    {
        tokens.push_back(AuROString {});
    }
    return tokens;
}

/**
 * Replaces, in place, every occurrence of `from` in `str` with `to`.
 * Scanning resumes after the inserted text so a replacement is never rescanned.
 *
 * @param str  string to modify
 * @param from pattern to remove; an empty pattern leaves `str` untouched
 * @param to   text written in place of each match
 * @return `str`
 */
static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty needle matches at every position without consuming anything, which would make
    // the loop below insert `to` forever.
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = str.find(from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        uStartPosition += to.length(); // continue after the text we just wrote
    }
    return str;
}

// i told myself not to copy this, required a split function twice, now here we are :D
/**
 * Splits `str` on every occurrence of `delim`, returning views into `str`.
 *
 * @param str          text to split; the returned views reference its storage
 * @param delim        separator; when empty there is nothing to split on and `str` is
 *                     returned as a single token
 * @param bIgnoreEmpty true: zero-length tokens are dropped.
 *                     false: zero-length tokens are kept, including a trailing one when `str`
 *                     ends with `delim`.
 * @return the tokens in order of appearance
 */
static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    // A zero-width separator matches at the cursor and would never advance it.
    if (delim.empty())
    {
        if (!bIgnoreEmpty || !str.empty())
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    tokens.reserve(str.size() / 16);
    do
    {
        uPos = str.find(delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // no further separator: the remainder is the final token
            uPos = str.length();
        }
        auto token = str.substr(uPrev, uPos - uPrev);
        // keep the token unless it is empty AND the caller asked for empties to be dropped
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(token);
        }
        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());

    // The loop stopped on a separator that ends the input: account for the empty token after it.
    if (!bIgnoreEmpty && uPos < str.length())
    {
        tokens.push_back(AuROString {});
    }
    return tokens;
}

/**
 * Splits `str` on every occurrence of `delim`, returning owning strings rather than views.
 *
 * @param str          text to split
 * @param delim        separator; when empty there is nothing to split on and `str` is
 *                     returned as a single token
 * @param bIgnoreEmpty true: zero-length tokens are dropped.
 *                     false: zero-length tokens are kept, including a trailing one when `str`
 *                     ends with `delim`.
 * @return copies of the tokens in order of appearance
 */
static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuString> tokens;

    // A zero-width separator matches at the cursor and would never advance it.
    if (delim.empty())
    {
        if (!bIgnoreEmpty || !str.empty())
        {
            tokens.push_back(AuString(str));
        }
        return tokens;
    }

    AuUInt prev = 0, pos = 0;
    tokens.reserve(str.size() / 16);
    do
    {
        pos = str.find(delim, prev);
        if (pos == AuROString::npos)
        {
            // no further separator: the remainder is the final token
            pos = str.length();
        }
        auto token = str.substr(prev, pos - prev);
        // keep the token unless it is empty AND the caller asked for empties to be dropped
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(AuString(token));
        }
        prev = pos + delim.length();
    }
    while (pos < str.length() && prev < str.length());

    // The loop stopped on a separator that ends the input: account for the empty token after it.
    if (!bIgnoreEmpty && pos < str.length())
    {
        tokens.push_back(AuString {});
    }
    return tokens;
}

// Fallback stringifier used by AuToString when fmt is not available; may be predefined by the
// including project to substitute its own implementation.
#ifndef AURORA_RUNTIME_TO_STRING
    #define AURORA_RUNTIME_TO_STRING std::to_string
#endif

/**
 * Converts `obj` to its textual representation.
 * Uses fmt::format("{}", obj) when _AUHAS_FMT is defined, otherwise AURORA_RUNTIME_TO_STRING.
 */
template <class T>
static auline AuString AuToString(const T &obj)
{
#if !defined(_AUHAS_FMT)
    // TODO: to_chars (locale independent)
    return AURORA_RUNTIME_TO_STRING(obj);
#else
    // locale independent and better optimized!
    return AuString(fmt::format("{}", obj));
#endif
}