AuROXTL/Include/auROXTL/auStringUtils.hpp

1653 lines
42 KiB
C++

/***
Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: auStringUtils.hpp
Date: 2022-2-1
File: AuroraUtils.hpp
File: auROXTLUtils.hpp
Date: 2021-6-9
Author: Reece
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
AuCodepointsContains,
AuCodepointsReplaceAll, AuCodepointsSplitString (views),
AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase,
AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView, AuCodepointsReverseIterateSubStrSuffixView
For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
***/
#pragma once
// offset in bytes
/* using CodepointByteOffset_t = decltype(AuROString::npos); */
// offset in codepoints
/* using CodepointOffset_t = AuUInt; */
// kAuCodepointUTF8MaxBytes: the longest UTF-8 sequence (lead byte included) that the decoders in
// this header which compare against this constant will accept. Picked at compile time:
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 4;
#elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 7;
#else
// default:
static const AuUInt8 kAuCodepointUTF8MaxBytes = 6;
#endif
// none of these are defined by default
// (asking for the "modern UTF-8 + throw" mode also switches on the throw for 7-8 byte sequences)
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
#if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
#define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL
#endif
#endif
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences
// (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting)
// (Enable this if you're boring)
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl)
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase gets dumbed down to a memcmp assuming the entire block of memory is to be compared;
// ...ForEach will break early with false (usually implies a user break early condition);
// ...Transform will return an empty container
// True when `subpattern` occurs somewhere inside `value` (plain byte-wise search via find()).
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{
    const auto uFoundAt = value.find(subpattern);
    return !(uFoundAt == AuROString::npos);
}
// True when the last ending.size() bytes of `value` are exactly `ending`.
// A suffix longer than the value can never match.
static auline constexpr bool AuEndsWith(AuROString const &value, AuROString const &ending)
{
    // Walk both views back-to-front; std::equal stops once `ending` is exhausted.
    return (ending.size() <= value.size()) &&
           std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
// True when `value` begins with `starting`.
static auline constexpr bool AuStartsWith(AuROString const &value, AuROString const &starting)
{
#if !defined(AU_STRING_IS_TINYUTF_EXPERIMENT)
    // An rfind() anchored at index 0 can only ever report a match at the very front.
    const auto uMatchIndex = value.rfind(starting, 0);
    return uMatchIndex == 0;
#else
    return value.starts_with(starting);
#endif
}
// Applies `op` to every single-byte (ASCII) codepoint of `in`; every multi-byte sequence is
// copied through untouched. Only the lead byte decides how long a sequence is; continuation
// bytes are not validated.
//
// The result is pre-sized to in.length(). If the scan stops early (stray continuation byte,
// 0xFF, or a sequence that would run past the end of the view) the remaining bytes are left
// exactly as resize() produced them.
//
// op:  callable invoked as op(char); its result is stored into the output string
// in:  UTF-8 view to transform
template <class T>
static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
{
    AuString ret;
    const auto uLength = in.length();
    ret.resize(uLength);

    const char *const pStart = in.data();
    const char *const pEnd   = pStart + uLength;
    const char *pItr         = pStart;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        // Derive the index from the cursor so it cannot wrap on inputs larger than 32 bits.
        const auto uOffset = AuUInt(pItr - pStart);
        if (nby == 1)
        {
            ret[uOffset] = op(*pItr);
        }
        else
        {
            AuMemcpy(&ret[uOffset], pItr, nby);
        }

        pItr += nby;
    }

    return ret;
}
// Counts the codepoints in `in` by hopping from lead byte to lead byte.
// Continuation bytes are not validated; only the lead byte decides the sequence length.
// Counting stops early, returning the tally so far, at a stray continuation byte, a 0xFF
// byte, or a sequence that would extend past the end of the view.
static auline constexpr CodepointOffset_t AuCodepointsCount(const AuROString &in)
{
    CodepointOffset_t uCounter {};
    const char *pItr       = in.data();
    const char *const pEnd = pItr + in.length();

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence: do not count it
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return uCounter;
}
// Returns the byte length that the first byte of `in` announces for its UTF-8 sequence.
// Returns 0 when `in` is empty, when the first byte is a continuation byte, or when it is 0xFF.
// Only the lead byte is inspected; the function does not check that the announced bytes exist.
static auline constexpr CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{
    if (!in.length())
    {
        return 0;
    }

    const auto ch = in[0];

    if ((ch & 0x80) == 0)
    {
        return 1;
    }

    if ((ch & 0xE0) == 0xC0)
    {
        return 2;
    }

    if ((ch & 0xF0) == 0xE0)
    {
        return 3;
    }

    if ((ch & 0xF0) != 0xF0)
    {
        // Continuation byte: not the start of a sequence
        return 0;
    }

    // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing that
    // bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
    if ((ch & 0x0f) == 0x0f)
    {
        // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
        AU_THROW_CONST_STRING("Illegal UTF8");
#endif
        return 0;
    }

    if ((ch & 0x0e) == 0x0e)
    {
        // Illegal UTF8 (0xFE)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
        AU_THROW_CONST_STRING("Illegal UTF8");
#endif
        return 7;
    }

    if ((ch & 0x0c) == 0x0c)
    {
        // Special UTF8 (0xFC..0xFD)
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
        AU_THROW_CONST_STRING("Illegal UTF8");
#endif
        return 6;
    }

    if ((ch & 0x08) == 0x08)
    {
        // Historic UTF8 (0xF8..0xFB)
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
        AU_THROW_CONST_STRING("Illegal UTF8");
#endif
        return 5;
    }

    return 4;
}
// True only for the ASCII letters 'A'..'Z' and 'a'..'z'.
// The arithmetic is done in unsigned int on purpose: folding the case bit and subtracting 'a'
// must wrap around for anything below 'a', otherwise the signed result is negative and the
// "< 26" comparison would accept digits, spaces, control characters, '@' and '`'.
static auline constexpr bool AuIsAlpha(char c)
{
    const unsigned int uFolded = static_cast<unsigned int>(static_cast<unsigned char>(c)) | 0x20u;
    return (uFolded - static_cast<unsigned int>('a')) < 26u;
}
// ASCII-only lower-casing: 'A'..'Z' map to 'a'..'z', every other byte is returned unchanged.
// The range is tested explicitly so that punctuation, digits, control characters and bytes of
// multi-byte UTF-8 sequences can never have the case bit forced onto them.
static auline constexpr char AuToLower(char c)
{
    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
}
// ASCII-only upper-casing: 'a'..'z' map to 'A'..'Z', every other byte is returned unchanged.
// The range is tested explicitly so that spaces, digits and punctuation can never have the
// case bit cleared (which would turn ' ' into NUL and '0'..'9' into control characters).
static auline constexpr char AuToUpper(char c)
{
    return (c >= 'a' && c <= 'z') ? static_cast<char>(c & ~0x20) : c;
}
// Lower-cases the ASCII codepoints of `in`; multi-byte sequences pass through unchanged.
static auline AuString AuCodepointsToLower(const AuROString &in)
{
    // Name the char overload explicitly: AuToLower is overloaded for string views as well.
    char (*const pfnLowerOne)(char) = &AuToLower;
    return AuCodepointsTransformASCIIOp(pfnLowerOne, in);
}
// Upper-cases the ASCII codepoints of `in`; multi-byte sequences pass through unchanged.
static auline AuString AuCodepointsToUpper(const AuROString &in)
{
    // Name the char overload explicitly: AuToUpper is overloaded for string views as well.
    char (*const pfnUpperOne)(char) = &AuToUpper;
    return AuCodepointsTransformASCIIOp(pfnUpperOne, in);
}
// String-view overload: forwards to AuCodepointsToLower.
static auline AuString AuToLower(const AuROString &in)
{
    AuString lowered = AuCodepointsToLower(in);
    return lowered;
}
// String-view overload: forwards to AuCodepointsToUpper.
static auline AuString AuToUpper(const AuROString &in)
{
    AuString uppered = AuCodepointsToUpper(in);
    return uppered;
}
// Decodes the first codepoint of `in`.
// Returns an empty optional when `in` is empty, starts with a continuation byte, announces more
// bytes than kAuCodepointUTF8MaxBytes, is truncated, or contains a bad continuation byte.
static constexpr AuOptional<AuUInt32> AuCodepointsDecodeOne(const AuROString &in)
{
    if (in.empty())
    {
        return {};
    }

    const char *const pLead = in.data();
    const char *const pEnd  = pLead + in.length();

    AuUInt32 uCodepoint {};
    uCodepoint = *pLead;
    if (uCodepoint <= 0x7FU)
    {
        // Plain ASCII
        return uCodepoint;
    }

    if ((*pLead & 0xC0U) != 0xC0U)
    {
        return {};
    }

    // The number of leading one bits in the lead byte is the sequence length
    AuUInt32 uSeqLen {};
    AuUInt8 uProbe = *pLead;
    while ((uProbe & 0x80U) != 0)
    {
        uProbe <<= 1;
        ++uSeqLen;
    }

    if (uSeqLen > kAuCodepointUTF8MaxBytes)
    {
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
        AU_THROW_CONST_STRING("Illegal UTF8");
#endif
        return {};
    }

    if (AuUInt(pEnd - pLead) < AuUInt(uSeqLen))
    {
        return {};
    }

    // Payload bits of the lead byte, then six bits per continuation byte
    uCodepoint = *pLead & (AuUInt8(0xFFU) >> (uSeqLen + 1));
    for (AuUInt32 uIndex = 1; uIndex < uSeqLen; ++uIndex)
    {
        if ((pLead[uIndex] & 0xC0U) != 0x80U)
        {
            return {};
        }

        uCodepoint = (uCodepoint << 6) | (pLead[uIndex] & 0x3FU);
    }

    return uCodepoint;
}
// Decodes every codepoint of `in` into a list.
// Any malformed input (stray continuation byte, over-long lead, truncation, bad continuation
// byte) discards what was decoded so far and yields an empty list.
static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
{
    AuList<AuUInt32> ret;

    if (in.empty())
    {
        return ret;
    }

    const auto uByteCount = in.length();
    ret.reserve(uByteCount);

    const char *pCursor    = in.data();
    const char *const pEnd = pCursor + uByteCount;

    while (pCursor < pEnd)
    {
        AuUInt32 uCodepoint {};
        uCodepoint = *pCursor;

        if (uCodepoint > 0x7FU)
        {
            if ((*pCursor & 0xC0U) != 0xC0U)
            {
                return {};
            }

            // The number of leading one bits in the lead byte is the sequence length
            AuUInt32 uSeqLen {};
            AuUInt8 uProbe = *pCursor;
            while ((uProbe & 0x80U) != 0)
            {
                uProbe <<= 1;
                ++uSeqLen;
            }

            if (uSeqLen > kAuCodepointUTF8MaxBytes)
            {
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                return {};
            }

            if (AuUInt(pEnd - pCursor) < AuUInt(uSeqLen))
            {
                return {};
            }

            uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLen + 1));
            for (AuUInt32 uIndex = 1; uIndex < uSeqLen; ++uIndex)
            {
                if ((pCursor[uIndex] & 0xC0U) != 0x80U)
                {
                    return {};
                }

                uCodepoint = (uCodepoint << 6) | (pCursor[uIndex] & 0x3FU);
            }

            pCursor += uSeqLen;
        }
        else
        {
            // Plain ASCII
            ++pCursor;
        }

        ret.push_back(uCodepoint);
    }

    return ret;
}
// Appends the UTF-8 encoding of `uCodepoint` to `out`, using the 1..6 byte forms.
// Values of 0x80000000 and above do not fit any of those forms and append nothing.
static void AuCodepointsEncodeInto(AuUInt32 uCodepoint, AuString &out)
{
    AuUInt32 uByteCount {};
    AuUInt8 uLeadMarker {};

    if (uCodepoint < 0x80)
    {
        uByteCount  = 1;
        uLeadMarker = 0x00;
    }
    else if (uCodepoint < 0x800)
    {
        uByteCount  = 2;
        uLeadMarker = 0xc0;
    }
    else if (uCodepoint < 0x10000)
    {
        uByteCount  = 3;
        uLeadMarker = 0xe0;
    }
    else if (uCodepoint < 0x200000)
    {
        uByteCount  = 4;
        uLeadMarker = 0xf0;
    }
    else if (uCodepoint < 0x4000000)
    {
        uByteCount  = 5;
        uLeadMarker = 0xf8;
    }
    else if (uCodepoint < 0x80000000)
    {
        uByteCount  = 6;
        uLeadMarker = 0xfc;
    }
    else
    {
        return;
    }

    const auto uBase = out.size();
    out.resize(uBase + uByteCount);

    // Fill the continuation bytes from the back, peeling six bits off per byte...
    AuUInt32 uRemaining = uCodepoint;
    for (AuUInt32 uIndex = uByteCount - 1; uIndex > 0; --uIndex)
    {
        out[uBase + uIndex] = static_cast<AuUInt8>((uRemaining & 0x3f) | 0x80);
        uRemaining >>= 6;
    }

    // ...and whatever is left goes into the lead byte underneath its length marker.
    out[uBase] = static_cast<AuUInt8>(uRemaining | uLeadMarker);
}
// Decodes `in` codepoint by codepoint, maps each one through `op`, and re-encodes the results
// into a new string. Malformed input yields an empty string.
//
// op:  callable invoked with the decoded AuUInt32; its result is encoded into the output
// in:  UTF-8 view to transform
template <class T>
static AuString AuCodepointsTransform(T op, const AuROString &in)
{
    AuString ret;

    if (in.empty())
    {
        return ret;
    }

    const auto uByteCount = in.length();
    ret.reserve(uByteCount);

    const char *pCursor    = in.data();
    const char *const pEnd = pCursor + uByteCount;

    while (pCursor < pEnd)
    {
        AuUInt32 uCodepoint {};
        uCodepoint = *pCursor;

        if (uCodepoint > 0x7FU)
        {
            if ((*pCursor & 0xC0U) != 0xC0U)
            {
                return {};
            }

            // The number of leading one bits in the lead byte is the sequence length
            AuUInt32 uSeqLen {};
            AuUInt8 uProbe = *pCursor;
            while ((uProbe & 0x80U) != 0)
            {
                uProbe <<= 1;
                ++uSeqLen;
            }

            if (uSeqLen > kAuCodepointUTF8MaxBytes)
            {
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                return {};
            }

            if (AuUInt(pEnd - pCursor) < AuUInt(uSeqLen))
            {
                return {};
            }

            uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLen + 1));
            for (AuUInt32 uIndex = 1; uIndex < uSeqLen; ++uIndex)
            {
                if ((pCursor[uIndex] & 0xC0U) != 0x80U)
                {
                    return {};
                }

                uCodepoint = (uCodepoint << 6) | (pCursor[uIndex] & 0x3FU);
            }

            pCursor += uSeqLen;
        }
        else
        {
            // Plain ASCII
            ++pCursor;
        }

        const AuUInt32 uMapped = op(uCodepoint);
        AuCodepointsEncodeInto(uMapped, ret);
    }

    return ret;
}
template <class T>
static bool AuCodepointsForEach(T op, const AuROString &in)
{
if (in.empty())
{
return true;
}
auto uLength = in.length();
const char *pItr = in.data();
const char *pEnd = pItr + uLength;
while (pItr < pEnd)
{
AuUInt32 c {};
if ((c = *pItr) <= 0x7FU)
{
++pItr;
}
else
{
AuUInt32 nby {};
if ((*pItr & 0xC0U) != 0xC0U)
{
return false;
}
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
{
}
if (nby > kAuCodepointUTF8MaxBytes)
{
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return false;
}
if (AuUInt(pEnd - pItr) < AuUInt(nby))
{
return false;
}
c = *pItr & (AuUInt8(0xFFU) >> (nby + 1));
for (AuUInt32 i = 1; i < nby; ++i)
{
if ((pItr[i] & 0xC0U) != 0x80U)
{
return {};
}
c = (c << 6) | (pItr[i] & 0x3FU);
}
pItr += nby;
}
if constexpr (AuIsSame_v<AuResultOf_t<T, AuUInt32>, bool>)
{
if (!op(c))
{
return false;
}
}
else
{
op(c);
}
}
return true;
}
static bool AuCodepointsIsEqualIgnoreCase(const AuROString &inA,
const AuROString &inB)
{
if (inA.size() !=
inB.size())
{
return false;
}
if (inA.empty())
{
return true;
}
const char *pItr = inA.data();
const char *pItr2 = inB.data();
const char *pEnd = pItr + inA.length();
while (pItr < pEnd)
{
AuUInt32 c {};
if ((c = *pItr) <= 0x7FU)
{
if (AuToLower(c) != AuToLower(*pItr2))
{
return false;
}
++pItr2;
++pItr;
}
else
{
AuUInt32 nby {};
if ((*pItr & 0xC0U) != 0xC0U)
{
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
{
}
if (nby > kAuCodepointUTF8MaxBytes)
{
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
if (AuUInt(pEnd - pItr) < AuUInt(nby))
{
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
if (AuMemcmp(pItr, pItr2, nby) != 0)
{
return false;
}
pItr += nby;
pItr2 += nby;
}
}
return true;
}
// Returns the byte offset at which codepoint number `uCodepointIndex` (zero based) starts.
// Returns AuROString::npos when the view holds no codepoint with that index, or when the scan
// is cut short by a stray continuation byte, a 0xFF byte, or a truncated sequence.
// Only lead bytes are inspected; continuation bytes are not validated.
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
                                                                        CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};
    const char *const pStart = in.data();
    const char *const pEnd   = pStart + in.length();
    const char *pItr         = pStart;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                nby = 7;
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
// Returns the byte offset just past codepoint number `uCodepointIndex` (zero based), i.e. the
// number of bytes occupied by the first uCodepointIndex + 1 codepoints.
// Returns AuROString::npos when the view holds no codepoint with that index, or when the scan
// is cut short by a stray continuation byte, a 0xFF byte, or a truncated sequence.
// Only lead bytes are inspected; continuation bytes are not validated.
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
                                                                        CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};
    const char *const pStart = in.data();
    const char *const pEnd   = pStart + in.length();
    const char *pItr         = pStart;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF).
                // Throws like the sibling scanners so throw-enabled builds keep rejecting this byte.
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        // Step over the codepoint first: the answer is the offset of its end, not its start
        pItr += nby;

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        uCounter++;
    }

    return AuROString::npos;
}
// Searches `in` for `find`, testing candidates only at codepoint boundaries, and returns the
// byte offset of the first match located at or after byte offset `uStartPosition`.
// The scan always starts at the beginning of `in` so that `uStartPosition` need not itself be
// a codepoint boundary. Returns AuROString::npos when nothing matches or when the scan is cut
// short by a stray continuation byte, a 0xFF byte, or a truncated sequence.
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
                                                        const AuROString &find,
                                                        CodepointByteOffset_t uStartPosition = {})
{
    const auto uFindLength   = find.length();
    const char *const pStart = in.data();
    const char *const pEnd   = pStart + in.length();
    const char *pItr         = pStart;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;
        const CodepointByteOffset_t uByteOffset(pItr - pStart);

        if (uByteOffset >= uStartPosition)
        {
            // Compare `find` against an equally long window starting at this codepoint
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uByteOffset;
            }
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Like AuCodepointsFindByteOffset, but begins scanning directly at byte `uStartPosition`
// instead of walking there from the start of `in`. "Unsafe" because the caller must pass an
// offset that sits on a codepoint boundary; the function cannot verify that.
// Returns the byte offset of the first match, or AuROString::npos when nothing matches, when
// `uStartPosition` lies beyond the end of `in`, or when the scan is cut short by a stray
// continuation byte, a 0xFF byte, or a truncated sequence.
static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
                                                                        const AuROString &find,
                                                                        CodepointByteOffset_t uStartPosition)
{
    const auto uLength     = in.length();
    const auto uFindLength = find.length();

    if (uStartPosition > uLength)
    {
        // Starting past the end would make `pItr != pEnd` walk off the buffer
        return AuROString::npos;
    }

    const char *const pStart = in.data();
    const char *const pEnd   = pStart + uLength;
    const char *pItr         = pStart + uStartPosition;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;
        const CodepointByteOffset_t uByteOffset(pItr - pStart);

        {
            // Compare `find` against an equally long window starting at this codepoint
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uByteOffset;
            }
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Given a position expressed in codepoints, returns the byte offset at which the codepoint
// immediately before it (codepoint number uStartPosition - 1) starts.
// Returns AuROString::npos when `uStartPosition` is 0, when the view holds fewer than
// `uStartPosition` codepoints, or when the scan is cut short by a stray continuation byte,
// a 0xFF byte, or a truncated sequence.
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(const AuROString &in,
                                                                                         CodepointOffset_t uStartPosition = {})
{
    if (uStartPosition == 0)
    {
        // Nothing precedes the first codepoint
        return AuROString::npos;
    }

    AuUInt uCounter = 0;
    const char *const pStart = in.data();
    const char *const pEnd   = pStart + in.length();
    const char *pItr         = pStart;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        // pItr still points at the start of the codepoint that was just counted
        uCounter++;
        if (uCounter == uStartPosition)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Given a byte position inside (or at the end of) `in`, steps backwards to the lead byte of the
// sequence that ends just before it and returns that byte offset.
// Returns AuROString::npos when `uStartPosition` is 0, when it lies beyond the end of `in`, or
// when walking back over continuation bytes does not arrive at a multi-byte lead byte.
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
                                                                                             CodepointByteOffset_t uStartPosition = {})
{
    // Validate before doing any pointer arithmetic: `data() + 0 - 1` would point in front of the
    // buffer, and a position past size() would read bytes that do not belong to the view.
    if (uStartPosition == 0 ||
        uStartPosition > in.size())
    {
        return AuROString::npos;
    }

    const char *const pStart = in.data();
    const char *pItr         = pStart + (uStartPosition - 1);

    if ((*pItr & 0x80) == 0)
    {
        // The previous byte is plain ASCII: it is its own codepoint
        return uStartPosition - 1;
    }

    // Skip back over continuation bytes (10xxxxxx) without leaving the buffer
    while (pItr != pStart &&
           (*pItr & 0xC0U) == 0x80U)
    {
        pItr--;
    }

    if ((*pItr & 0xC0U) != 0xC0U)
    {
        // Ran out of buffer while still on a continuation byte
        return AuROString::npos;
    }

    return CodepointByteOffset_t(pItr - pStart);
}
// Converts a byte position into a codepoint position by counting the codepoints that lie in
// the first `uBytePosition` bytes of `in`.
static constexpr CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                                   CodepointByteOffset_t uBytePosition)
{
    const auto leadingBytes = in.substr(0, uBytePosition);
    return AuCodepointsCount(leadingBytes);
}
// Searches `in` for `find`, testing candidates only at codepoint boundaries, and returns the
// position of the first match as a codepoint index. Candidates before codepoint index
// `uStartPosition` are skipped.
// Returns AuROString::npos when nothing matches or when the scan is cut short by a stray
// continuation byte, a 0xFF byte, or a truncated sequence.
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         const AuROString &find,
                                                         CodepointOffset_t uStartPosition = {})
{
    AuUInt uCounter {};
    const auto uFindLength   = find.length();
    const char *const pStart = in.data();
    const char *const pEnd   = pStart + in.length();
    const char *pItr         = pStart;

    while (pItr != pEnd)
    {
        AuUInt32 nby {};
        const auto ch = *pItr;

        if (uCounter >= uStartPosition)
        {
            // Compare `find` against an equally long window starting at this codepoint
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uCounter;
            }
        }

        if ((ch & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // Longest prefix first: every 0xF8..0xFF lead byte also has bit 0x08 set, so testing
            // that bit first would make the 6-byte, 7-byte and 0xFF cases unreachable.
            if ((ch & 0x0f) == 0x0f)
            {
                // Not even logical (0xFF)
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
                break;
            }
            else if ((ch & 0x0e) == 0x0e)
            {
                // Illegal UTF8 (0xFE)
                nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x0c) == 0x0c)
            {
                // Special UTF8 (0xFC..0xFD)
                nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Historic UTF8 (0xF8..0xFB)
                nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
                AU_THROW_CONST_STRING("Illegal UTF8");
#endif
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte where a lead byte was expected
            break;
        }

        if (pItr + nby > pEnd)
        {
            // Truncated trailing sequence
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
static bool AuCodepointsStartsWithEqualIgnoreCase(const AuROString &inA,
const AuROString &inB)
{
if (inA.size() < inB.size())
{
return false;
}
if (inA.empty() ||
inB.empty())
{
return true;
}
const char *pItr = inA.data();
const char *pItr2 = inB.data();
const char *pEnd = pItr + inB.length();
while (pItr < pEnd)
{
AuUInt32 c {};
if ((c = *pItr) <= 0x7FU)
{
if (AuToLower(c) != AuToLower(*pItr2))
{
return false;
}
++pItr2;
++pItr;
}
else
{
AuUInt32 nby {};
if ((*pItr & 0xC0U) != 0xC0U)
{
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
{
}
if (nby > kAuCodepointUTF8MaxBytes)
{
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
if (AuUInt(pEnd - pItr) < AuUInt(nby))
{
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
if (AuMemcmp(pItr, pItr2, nby) != 0)
{
return false;
}
pItr += nby;
pItr2 += nby;
}
}
return true;
}
static bool AuCodepointsEndsWithEqualIgnoreCase(const AuROString &inA,
const AuROString &inB)
{
if (inA.size() < inB.size())
{
return false;
}
if (inA.empty() ||
inB.empty())
{
return true;
}
AuUInt uOffset { inA.Size() };
AuUInt uOffset2 { inB.Size() };
while (uOffset && uOffset2)
{
AuUInt32 c {};
auto uLastValid = AuCodepointsFindPreviousValidByteOffsetFromByteOffset(inB, uOffset2);
if (uLastValid == AuROString::npos)
{
return false;
}
auto nby = uOffset2 - uLastValid;
if (AuSInt(uOffset) - AuSInt(nby) < 0)
{
return false;
}
auto pItr = inA.data() + uOffset - nby;
auto pItr2 = inB.data() + uOffset2 - nby;
if ((c = *pItr) <= 0x7FU)
{
if (AuToLower(c) != AuToLower(*pItr2))
{
return false;
}
uOffset -= 1;
uOffset2 -= 1;
}
else
{
if (AuMemcmp(pItr, pItr2, nby) != 0)
{
return false;
}
uOffset -= nby;
uOffset2 -= nby;
}
}
return true;
}
// Returns the byte length of the last codepoint of `string`, or AuROString::npos when the
// start of that codepoint cannot be located.
static constexpr AuUInt AuCodepointsReverseIterate(const AuROString &string)
{
    const auto uLastStart = AuCodepointsFindPreviousValidByteOffsetFromByteOffset(string, string.Size());
    if (uLastStart != AuROString::npos)
    {
        return string.Size() - uLastStart;
    }

    return AuROString::npos;
}
// Returns `string` with its last codepoint removed (via RemoveSuffix), or an empty view when
// the last codepoint cannot be located.
static constexpr AuROString AuCodepointsReverseIterateSubStrPrefixView(const AuROString &string)
{
    const auto uLastLength = AuCodepointsReverseIterate(string);
    if (uLastLength != AuROString::npos)
    {
        return string.RemoveSuffix(uLastLength);
    }

    return AuROString {};
}
// Returns the view left after RemovePrefix() strips everything before the last codepoint of
// `string`, or an empty view when the last codepoint cannot be located.
static constexpr AuROString AuCodepointsReverseIterateSubStrSuffixView(const AuROString &string)
{
    const auto uLastStart = AuCodepointsFindPreviousValidByteOffsetFromByteOffset(string, string.Size());
    if (uLastStart != AuROString::npos)
    {
        return string.RemovePrefix(uLastStart);
    }

    return AuROString {};
}
// True when `subpattern` is found in `value` on a codepoint boundary, at or after codepoint
// index `uStartPosition`.
static bool AuCodepointsContains(const AuROString &value, const AuROString &subpattern, CodepointOffset_t uStartPosition = {})
{
    const auto uFoundAt = AuCodepointsFindCodepointOffset(value, subpattern, uStartPosition);
    return !(uFoundAt == AuROString::npos);
}
// Replaces, in place, every occurrence of `from` in `str` with `to`. Occurrences are searched
// for on codepoint boundaries. Text inserted by a replacement is not searched again.
// An empty `from` leaves `str` untouched.
// Returns `str` to allow chaining.
static AuString &AuCodepointsReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    if (from.empty())
    {
        // An empty pattern matches at every position without consuming input: the loop below
        // would keep inserting `to` forever.
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        // Resume right after the inserted text
        uStartPosition += to.length();
    }

    return str;
}
// Splits `str` at every occurrence of `delim` (searched for on codepoint boundaries) and
// returns views into `str`; the views are only valid while the memory behind `str` is.
//
// bIgnoreEmpty: when true (the default) empty tokens are dropped; when false they are kept.
//               The loop ends once the position after a delimiter reaches the end of `str`,
//               so a trailing delimiter does not produce a final empty token either way.
// An empty `delim` never splits: the whole of `str` is returned as a single token.
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
                                                  const AuROString &delim,
                                                  bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    if (delim.empty())
    {
        // An empty delimiter matches everywhere without consuming input; the loop below
        // would never advance.
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    tokens.reserve(str.size() / 16);

    do
    {
        uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // No further delimiter: the remainder is the last token
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // Keep the token unless it is empty AND empties are to be ignored
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(token);
        }

        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());

    return tokens;
}
// Replaces, in place, every occurrence of `from` in `str` with `to` (plain byte-wise search).
// Text inserted by a replacement is not searched again.
// An empty `from` leaves `str` untouched.
// Returns `str` to allow chaining.
static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    if (from.empty())
    {
        // find() reports an empty pattern at every position without consuming input: the loop
        // below would keep inserting `to` forever.
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = str.find(from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        // Resume right after the inserted text
        uStartPosition += to.length();
    }

    return str;
}
// i told myself not to copy this, required a split function twice, now here we are :D
//
// Splits `str` at every occurrence of `delim` (plain byte-wise search) and returns views into
// `str`; the views are only valid while the memory behind `str` is.
//
// bIgnoreEmpty: when true (the default) empty tokens are dropped; when false they are kept.
//               The loop ends once the position after a delimiter reaches the end of `str`,
//               so a trailing delimiter does not produce a final empty token either way.
// An empty `delim` never splits: the whole of `str` is returned as a single token.
static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    if (delim.empty())
    {
        // find() reports an empty delimiter at every position without consuming input; the
        // loop below would never advance.
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    tokens.reserve(str.size() / 16);

    do
    {
        uPos = str.find(delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // No further delimiter: the remainder is the last token
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // Keep the token unless it is empty AND empties are to be ignored
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(token);
        }

        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());

    return tokens;
}
// Same splitting rules as AuSplitString, but every token is copied into its own AuString so
// the result does not depend on the lifetime of `str`.
//
// bIgnoreEmpty: when true (the default) empty tokens are dropped; when false they are kept.
//               The loop ends once the position after a delimiter reaches the end of `str`,
//               so a trailing delimiter does not produce a final empty token either way.
// An empty `delim` never splits: the whole of `str` is returned as a single token.
static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuString> tokens;

    if (delim.empty())
    {
        // find() reports an empty delimiter at every position without consuming input; the
        // loop below would never advance.
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(str));
        }
        return tokens;
    }

    AuUInt prev = 0, pos = 0;
    tokens.reserve(str.size() / 16);

    do
    {
        pos = str.find(delim, prev);
        if (pos == AuROString::npos)
        {
            // No further delimiter: the remainder is the last token
            pos = str.length();
        }

        auto token = str.substr(prev, pos - prev);
        // Keep the token unless it is empty AND empties are to be ignored
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(AuString(token));
        }

        prev = pos + delim.length();
    }
    while (pos < str.length() && prev < str.length());

    return tokens;
}
// Fallback number-to-string routine used when fmt is unavailable; may be predefined by the
// including project to substitute its own.
#if !defined(AURORA_RUNTIME_TO_STRING)
#define AURORA_RUNTIME_TO_STRING std::to_string
#endif
// Formats `obj` as a string: through fmt when _AUHAS_FMT is defined, otherwise through
// AURORA_RUNTIME_TO_STRING.
template <class T>
static auline AuString AuToString(const T &obj)
{
#if !defined(_AUHAS_FMT)
    // TODO: to_chars (locale independent)
    return AURORA_RUNTIME_TO_STRING(obj);
#else
    // locale independent and better optimized!
    return AuString(fmt::format("{}", obj));
#endif
}