1204 lines
28 KiB
C++
1204 lines
28 KiB
C++
/***
|
|
Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
|
|
|
File: auStringUtils.hpp
|
|
Date: 2022-2-1
|
|
File: AuroraUtils.hpp
|
|
File: auROXTLUtils.hpp
|
|
Date: 2021-6-9
|
|
Author: Reece
|
|
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
|
|
AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
|
|
|
|
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
|
|
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
|
|
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
|
|
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecode, AuCodepointsEncodeInto,
|
|
AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
|
|
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
|
|
AuCodepointsContains,
|
|
AuCodepointsReplaceAll, AuCodepointsSplitString (views),
|
|
AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
|
|
AuCodepointsIsEqualIgnoreCase
|
|
|
|
For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
|
|
***/
|
|
#pragma once
|
|
|
|
// offset in bytes
|
|
/* using CodepointByteOffset_t = decltype(AuROString::npos); */
|
|
|
|
// offset in codepoints
|
|
/* using CodepointOffset_t = AuUInt; */
|
|
|
|
#if defined(AURORA_I_SUCK_AND_WANT_MODERN_UTF8)
|
|
static const AuUInt8 kAuCodepointUTF8MaxBytes = 4;
|
|
#else
|
|
static const AuUInt8 kAuCodepointUTF8MaxBytes = 6;
|
|
#endif
|
|
|
|
// Byte-wise substring test: true when `subpattern` occurs anywhere inside `value`.
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{
    const auto uWhere = value.find(subpattern);
    return AuROString::npos != uWhere;
}
|
|
|
|
// True when the trailing bytes of `value` are exactly `ending`.
// A suffix longer than the value can never match; otherwise both views are
// walked back-to-front over the length of `ending`.
static auline constexpr bool AuEndsWith(AuROString const &value, AuROString const &ending)
{
    return (ending.size() <= value.size()) &&
           std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
|
|
|
|
// True when the leading bytes of `value` are exactly `starting`.
static auline constexpr bool AuStartsWith(AuROString const &value, AuROString const &starting)
{
#if !defined(AU_STRING_IS_TINYUTF_EXPERIMENT)
    // rfind anchored at position 0 can only ever report a match at offset 0,
    // so this is a prefix comparison rather than a full search.
    const auto uWhere = value.rfind(starting, 0);
    return 0 == uWhere;
#else
    return value.starts_with(starting);
#endif
}
|
|
|
|
template <class T>
|
|
static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
|
|
{
|
|
AuString ret;
|
|
auto uLength = in.length();
|
|
|
|
ret.resize(uLength);
|
|
|
|
const char *pItr = in.data();
|
|
const char *pEnd = pItr + uLength;
|
|
|
|
AuUInt32 uCounter {};
|
|
while (pItr != pEnd)
|
|
{
|
|
AuUInt32 nby {};
|
|
auto ch = *pItr;
|
|
unsigned int result = (ch & 0xF0);
|
|
|
|
if ((ch & 0x80) == 0)
|
|
{
|
|
nby = 1;
|
|
}
|
|
else if ((ch & 0xE0) == 0xC0)
|
|
{
|
|
nby = 2;
|
|
}
|
|
else if (result == 0xE0)
|
|
{
|
|
nby = 3;
|
|
}
|
|
else if (result == 0xF0)
|
|
{
|
|
if ((ch & 0x08) == 0x08)
|
|
{
|
|
// Special/Historic UTF8
|
|
nby = 5;
|
|
}
|
|
else if ((ch & 0x0c) == 0x0c)
|
|
{
|
|
// Special/Historic UTF8
|
|
nby = 6;
|
|
}
|
|
else
|
|
{
|
|
nby = 4;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
|
|
if (pItr + nby > pEnd)
|
|
{
|
|
break;
|
|
}
|
|
|
|
if (nby == 1)
|
|
{
|
|
ret[uCounter] = op(in[uCounter]);
|
|
}
|
|
else
|
|
{
|
|
AuMemcpy(&ret[uCounter], &in[uCounter], nby);
|
|
}
|
|
|
|
uCounter += nby;
|
|
pItr += nby;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
 * Counts the codepoints in `in` by hopping from lead byte to lead byte.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes are not validated.
 * Counting stops (returning the count so far) at the first stray continuation byte or at a
 * sequence that would run past the end of the input.
 */
static auline CodepointOffset_t AuCodepointsCount(const AuROString &in)
{
    CodepointOffset_t uCounter {};

    const char *pItr = in.data();
    const char *pEnd = pItr + in.length();

    while (pItr != pEnd)
    {
        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return uCounter;
}
|
|
|
|
/**
 * Reports how many bytes the lead byte at the front of `in` announces for its codepoint.
 *
 * @return 1..6 according to the lead byte, or 0 when `in` is empty or begins with a
 *         continuation byte. The result is not checked against the length of `in`.
 */
static auline CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{
    if (!in.length())
    {
        return 0;
    }

    const AuUInt8 uLead = static_cast<AuUInt8>(in[0]);

    if ((uLead & 0x80) == 0)
    {
        return 1;
    }

    if ((uLead & 0xE0) == 0xC0)
    {
        return 2;
    }

    if ((uLead & 0xF0) == 0xE0)
    {
        return 3;
    }

    if ((uLead & 0xF0) == 0xF0)
    {
        // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
        // because every 0xFC+ lead byte also carries the 0x08 bit.
        if ((uLead & 0x0C) == 0x0C)
        {
            return 6;
        }

        if ((uLead & 0x08) == 0x08)
        {
            return 5;
        }

        return 4;
    }

    // continuation byte: not the start of a codepoint
    return 0;
}
|
|
|
|
/**
 * ASCII-only alphabetic test: true for 'A'..'Z' and 'a'..'z', false for everything else
 * (including NUL and all bytes >= 0x80).
 */
static auline bool AuIsAlpha(char c)
{
    // Fold the case bit, then rely on *unsigned* wrap-around so that anything below 'a'
    // becomes a huge value instead of a negative one. Doing this arithmetic in (signed) int
    // would accept every byte whose folded value is below 'a', e.g. digits, ' ' and '@'.
    const unsigned int uFolded = static_cast<unsigned int>(static_cast<unsigned char>(c)) | 0x20u;
    return (uFolded - static_cast<unsigned int>('a')) < 26u;
}
|
|
|
|
/**
 * ASCII-only lower-casing of a single byte: 'A'..'Z' map to 'a'..'z', every other byte
 * (digits, punctuation, control characters, UTF-8 lead/continuation bytes) is returned unchanged.
 */
static auline char AuToLower(char c)
{
    // Explicit range test: setting bit 0x20 is only a case conversion for upper-case letters;
    // applied to other bytes it would turn e.g. '@' into '`' or '\n' into '*'.
    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
}
|
|
|
|
/**
 * ASCII-only upper-casing of a single byte: 'a'..'z' map to 'A'..'Z', every other byte
 * (digits, punctuation, control characters, UTF-8 lead/continuation bytes) is returned unchanged.
 */
static auline char AuToUpper(char c)
{
    // Explicit range test: clearing bit 0x20 is only a case conversion for lower-case letters;
    // applied to other bytes it would turn e.g. ' ' into NUL or '1' into a control character.
    return (c >= 'a' && c <= 'z') ? static_cast<char>(c & ~0x20) : c;
}
|
|
|
|
// Lower-cases the single-byte (ASCII) codepoints of a UTF-8 string; multi-byte
// sequences are passed through by AuCodepointsTransformASCIIOp.
static auline AuString AuCodepointsToLower(const AuROString &in)
{
    // The typed pointer pins the per-character overload of AuToLower.
    char (*pfnLower)(char) = &AuToLower;
    return AuCodepointsTransformASCIIOp(pfnLower, in);
}
|
|
|
|
// Upper-cases the single-byte (ASCII) codepoints of a UTF-8 string; multi-byte
// sequences are passed through by AuCodepointsTransformASCIIOp.
static auline AuString AuCodepointsToUpper(const AuROString &in)
{
    // The typed pointer pins the per-character overload of AuToUpper.
    char (*pfnUpper)(char) = &AuToUpper;
    return AuCodepointsTransformASCIIOp(pfnUpper, in);
}
|
|
|
|
// String overload of AuToLower: forwards to the UTF-8 aware AuCodepointsToLower.
static auline AuString AuToLower(const AuROString &in)
{
    AuString lowered = AuCodepointsToLower(in);
    return lowered;
}
|
|
|
|
// String overload of AuToUpper: forwards to the UTF-8 aware AuCodepointsToUpper.
static auline AuString AuToUpper(const AuROString &in)
{
    AuString raised = AuCodepointsToUpper(in);
    return raised;
}
|
|
|
|
/**
 * Decodes a UTF-8 byte string into a list of codepoint values.
 *
 * @return one entry per codepoint, or an EMPTY list when the input is empty or when any
 *         sequence is malformed: a continuation byte in lead position, a lead byte announcing
 *         more than kAuCodepointUTF8MaxBytes bytes, a sequence cut short by the end of the
 *         input, or a trailing byte that is not of the form 10xxxxxx.
 */
static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
{
    AuList<AuUInt32> decoded;

    if (in.empty())
    {
        return decoded;
    }

    const auto uByteCount = in.length();

    // worst case: every byte is its own codepoint
    decoded.reserve(uByteCount);

    const char *pCursor = in.data();
    const char *const pLimit = pCursor + uByteCount;

    while (pCursor < pLimit)
    {
        AuUInt32 uCodepoint = *pCursor;

        // single byte: the value is the codepoint
        if (uCodepoint <= 0x7FU)
        {
            decoded.push_back(uCodepoint);
            ++pCursor;
            continue;
        }

        // a lead byte must start with 11......
        if ((*pCursor & 0xC0U) != 0xC0U)
        {
            return {};
        }

        // the number of leading one bits is the sequence length
        AuUInt32 uSeqLength {};
        AuUInt8 uBits = *pCursor;
        while ((uBits & 0x80U) != 0)
        {
            uBits <<= 1;
            ++uSeqLength;
        }

        if (uSeqLength > kAuCodepointUTF8MaxBytes)
        {
            return {};
        }

        if (AuUInt(pLimit - pCursor) < AuUInt(uSeqLength))
        {
            return {};
        }

        // payload bits of the lead byte: everything after the length prefix and its 0 terminator
        uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLength + 1));

        for (AuUInt32 uIndex = 1; uIndex < uSeqLength; ++uIndex)
        {
            const auto chTrail = pCursor[uIndex];

            if ((chTrail & 0xC0U) != 0x80U)
            {
                return {};
            }

            uCodepoint = (uCodepoint << 6) | (chTrail & 0x3FU);
        }

        pCursor += uSeqLength;
        decoded.push_back(uCodepoint);
    }

    return decoded;
}
|
|
|
|
/**
 * Appends the UTF-8 encoding of `uCodepoint` to `out`.
 *
 * Values below 2^31 are encoded in 1 to 6 bytes (the 5 and 6 byte forms being the historic
 * extension); values of 2^31 and above append nothing.
 */
static void AuCodepointsEncodeInto(AuUInt32 uCodepoint, AuString &out)
{
    AuUInt32 uByteCount {};
    AuUInt32 uLeadPrefix {};

    // Pick the shortest form able to hold the value, together with its lead-byte prefix.
    if (uCodepoint < 0x80)
    {
        uByteCount  = 1;
        uLeadPrefix = 0x00;
    }
    else if (uCodepoint < 0x800)
    {
        uByteCount  = 2;
        uLeadPrefix = 0xc0;
    }
    else if (uCodepoint < 0x10000)
    {
        uByteCount  = 3;
        uLeadPrefix = 0xe0;
    }
    else if (uCodepoint < 0x200000)
    {
        uByteCount  = 4;
        uLeadPrefix = 0xf0;
    }
    else if (uCodepoint < 0x4000000)
    {
        uByteCount  = 5;
        uLeadPrefix = 0xf8;
    }
    else if (uCodepoint < 0x80000000)
    {
        uByteCount  = 6;
        uLeadPrefix = 0xfc;
    }
    else
    {
        return;
    }

    const auto uBase = out.size();
    out.resize(uBase + uByteCount);

    // Fill the continuation bytes back to front, six payload bits apiece...
    for (AuUInt32 uIndex = uByteCount - 1; uIndex > 0; --uIndex)
    {
        out[uBase + uIndex] = static_cast<AuUInt8>((uCodepoint & 0x3f) | 0x80);
        uCodepoint >>= 6;
    }

    // ...whatever remains belongs in the lead byte, below its length prefix.
    out[uBase] = static_cast<AuUInt8>(uCodepoint | uLeadPrefix);
}
|
|
|
|
template <class T>
|
|
static AuString AuCodepointsTransform(T op, const AuROString &in)
|
|
{
|
|
AuString ret;
|
|
|
|
if (in.empty())
|
|
{
|
|
return ret;
|
|
}
|
|
|
|
auto uLength = in.length();
|
|
|
|
ret.reserve(uLength);
|
|
|
|
const char *pItr = in.data();
|
|
const char *pEnd = pItr + uLength;
|
|
|
|
while (pItr < pEnd)
|
|
{
|
|
AuUInt32 c {};
|
|
|
|
if ((c = *pItr) <= 0x7FU)
|
|
{
|
|
++pItr;
|
|
}
|
|
else
|
|
{
|
|
AuUInt32 nby {};
|
|
|
|
if ((*pItr & 0xC0U) != 0xC0U)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
|
|
{
|
|
}
|
|
|
|
if (nby > kAuCodepointUTF8MaxBytes)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
if (AuUInt(pEnd - pItr) < AuUInt(nby))
|
|
{
|
|
return {};
|
|
}
|
|
|
|
c = *pItr & (AuUInt8(0xFFU) >> (nby + 1));
|
|
|
|
for (AuUInt32 i = 1; i < nby; ++i)
|
|
{
|
|
if ((pItr[i] & 0xC0U) != 0x80U)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
c = (c << 6) | (pItr[i] & 0x3FU);
|
|
}
|
|
|
|
pItr += nby;
|
|
}
|
|
|
|
c = op(c);
|
|
|
|
AuCodepointsEncodeInto(c, ret);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
template <class T>
|
|
static bool AuCodepointsForEach(T op, const AuROString &in)
|
|
{
|
|
if (in.empty())
|
|
{
|
|
return true;
|
|
}
|
|
|
|
auto uLength = in.length();
|
|
|
|
const char *pItr = in.data();
|
|
const char *pEnd = pItr + uLength;
|
|
|
|
while (pItr < pEnd)
|
|
{
|
|
AuUInt32 c {};
|
|
|
|
if ((c = *pItr) <= 0x7FU)
|
|
{
|
|
++pItr;
|
|
}
|
|
else
|
|
{
|
|
AuUInt32 nby {};
|
|
|
|
if ((*pItr & 0xC0U) != 0xC0U)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
|
|
{
|
|
}
|
|
|
|
if (nby > kAuCodepointUTF8MaxBytes)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (AuUInt(pEnd - pItr) < AuUInt(nby))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
c = *pItr & (AuUInt8(0xFFU) >> (nby + 1));
|
|
|
|
for (AuUInt32 i = 1; i < nby; ++i)
|
|
{
|
|
if ((pItr[i] & 0xC0U) != 0x80U)
|
|
{
|
|
return {};
|
|
}
|
|
|
|
c = (c << 6) | (pItr[i] & 0x3FU);
|
|
}
|
|
|
|
pItr += nby;
|
|
}
|
|
|
|
if constexpr (AuIsSame_v<AuResultOf_t<T, AuUInt32>, bool>)
|
|
{
|
|
if (!op(c))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
op(c);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool AuCodepointsIsEqualIgnoreCase(const AuROString &inA,
|
|
const AuROString &inB)
|
|
{
|
|
if (inA.size() !=
|
|
inB.size())
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (inA.empty())
|
|
{
|
|
return true;
|
|
}
|
|
|
|
const char *pItr = inA.data();
|
|
const char *pItr2 = inB.data();
|
|
const char *pEnd = pItr + inA.length();
|
|
|
|
while (pItr < pEnd)
|
|
{
|
|
AuUInt32 c {};
|
|
|
|
if ((c = *pItr) <= 0x7FU)
|
|
{
|
|
if (AuToLower(c) != AuToLower(*pItr2))
|
|
{
|
|
return false;
|
|
}
|
|
++pItr2;
|
|
++pItr;
|
|
}
|
|
else
|
|
{
|
|
AuUInt32 nby {};
|
|
|
|
if ((*pItr & 0xC0U) != 0xC0U)
|
|
{
|
|
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
|
|
}
|
|
|
|
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
|
|
{
|
|
}
|
|
|
|
if (nby > kAuCodepointUTF8MaxBytes)
|
|
{
|
|
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
|
|
}
|
|
|
|
if (AuUInt(pEnd - pItr) < AuUInt(nby))
|
|
{
|
|
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
|
|
}
|
|
|
|
if (AuMemcmp(pItr, pItr2, nby) != 0)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
pItr += nby;
|
|
pItr2 += nby;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
 * Translates a codepoint index into the byte offset at which that codepoint starts.
 *
 * @param in               UTF-8 text; only lead bytes are inspected
 * @param uCodepointIndex  zero-based codepoint index
 * @return the byte offset of the first byte of that codepoint, or AuROString::npos when the
 *         index is not reached before the end of the input, a stray continuation byte, or a
 *         truncated sequence.
 */
static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
                                                              CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};

    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
|
|
|
|
/**
 * Returns the byte offset just past the end of the codepoint at `uCodepointIndex`, i.e. the
 * number of bytes occupied by the first (uCodepointIndex + 1) codepoints of `in`.
 *
 * @return that byte count, or AuROString::npos when the index is not reached before the end
 *         of the input, a stray continuation byte, or a truncated sequence.
 */
static auline CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
                                                              CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};

    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        // step over the codepoint first: the caller wants the offset of its END
        pItr += nby;

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        uCounter++;
    }

    return AuROString::npos;
}
|
|
|
|
/**
 * Searches `in` for `find`, testing for a match only at codepoint boundaries.
 *
 * The walk always begins at byte 0 so that `uStartPosition` does not need to sit on a
 * codepoint boundary; boundaries before `uStartPosition` are simply skipped.
 *
 * @param uStartPosition  minimum byte offset at which a match may begin
 * @return the byte offset of the first match, or AuROString::npos when there is none or the
 *         walk hits a stray continuation byte / truncated sequence first.
 */
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
                                                        const AuROString &find,
                                                        CodepointByteOffset_t uStartPosition = {})
{
    const auto uFindLength = find.length();

    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};
        CodepointByteOffset_t uByteOffset(pItr - pStart);

        if (uByteOffset >= uStartPosition)
        {
            // compare the next uFindLength bytes (or fewer, near the end) against the needle
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uByteOffset;
            }
        }

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
|
|
|
|
/**
 * Searches `in` for `find`, testing for a match only at codepoint boundaries, with the walk
 * starting directly at byte `uStartPosition`.
 *
 * "Unsafe" because nothing is verified about the start: the caller must pass an offset that
 * is no greater than in.length() and that sits on a codepoint boundary.
 *
 * @return the byte offset of the first match, or AuROString::npos when there is none or the
 *         walk hits a stray continuation byte / truncated sequence first.
 */
static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
                                                                        const AuROString &find,
                                                                        CodepointByteOffset_t uStartPosition)
{
    const auto uFindLength = find.length();

    const char *pStart = in.data();
    const char *pItr = pStart + uStartPosition;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};
        CodepointByteOffset_t uByteOffset(pItr - pStart);

        {
            // compare the next uFindLength bytes (or fewer, near the end) against the needle
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uByteOffset;
            }
        }

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
|
|
|
|
/**
 * Returns the byte offset at which the codepoint immediately BEFORE codepoint index
 * `uStartPosition` begins (that is, the start of codepoint uStartPosition - 1).
 *
 * @return that byte offset, or AuROString::npos when uStartPosition is 0 or when the walk
 *         ends (end of input, stray continuation byte, truncated sequence) before
 *         uStartPosition codepoints have been seen.
 */
static CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(const AuROString &in,
                                                                               CodepointOffset_t uStartPosition = {})
{
    if (uStartPosition == 0)
    {
        // nothing precedes the first codepoint
        return AuROString::npos;
    }

    AuUInt uCounter = 0;

    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        uCounter++;

        // pItr still points at the start of the codepoint that was just counted
        if (uCounter == uStartPosition)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        pItr += nby;
    }

    return AuROString::npos;
}
|
|
|
|
/**
 * Starting from the byte just before `uStartPosition`, steps backwards over continuation
 * bytes to find where the preceding codepoint begins.
 *
 * @param uStartPosition  byte offset in [0, in.length()]
 * @return the byte offset of the preceding codepoint's first byte, or AuROString::npos when
 *         uStartPosition is 0, lies beyond the end of `in`, or the backwards scan does not
 *         end on a multi-byte lead byte (i.e. the preceding bytes are malformed).
 */
static CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
                                                                                   CodepointByteOffset_t uStartPosition = {})
{
    // Validate before doing any pointer arithmetic: with a zero offset `- 1` would form a
    // pointer in front of the buffer, and an offset past the end would be read out of bounds.
    if (uStartPosition == 0 || uStartPosition > in.length())
    {
        return AuROString::npos;
    }

    const char *pStart = in.data();
    const char *pItr = pStart + (uStartPosition - 1);

    if ((*pItr & 0x80) == 0)
    {
        // plain single-byte codepoint right before the offset
        return uStartPosition - 1;
    }

    // skip backwards over 10xxxxxx continuation bytes
    while (pItr != pStart && (*pItr & 0xC0U) == 0x80U)
    {
        pItr--;
    }

    // what we landed on must be a multi-byte lead byte (11xxxxxx)
    if ((*pItr & 0xC0U) != 0xC0U)
    {
        return AuROString::npos;
    }

    return CodepointByteOffset_t(pItr - pStart);
}
|
|
|
|
// Converts a byte position into a codepoint index by counting the codepoints contained in
// the first `uBytePosition` bytes of `in`.
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         CodepointByteOffset_t uBytePosition)
{
    const auto head = in.substr(0, uBytePosition);
    return AuCodepointsCount(head);
}
|
|
|
|
/**
 * Searches `in` for `find`, testing only at codepoint boundaries, and reports the position
 * of the first match as a codepoint index.
 *
 * @param uStartPosition  minimum codepoint index at which a match may begin
 * @return the codepoint index of the first match, or AuROString::npos when there is none or
 *         the walk hits a stray continuation byte / truncated sequence first.
 */
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         const AuROString &find,
                                                         CodepointOffset_t uStartPosition = {})
{
    AuUInt uCounter {};
    const auto uFindLength = find.length();

    const char *pItr = in.data();
    const char *pEnd = pItr + in.length();

    while (pItr != pEnd)
    {
        const AuUInt8 uLead = static_cast<AuUInt8>(*pItr);
        AuUInt32 nby {};

        if (uCounter >= uStartPosition)
        {
            // compare the next uFindLength bytes (or fewer, near the end) against the needle
            AuROString suffixView(pItr, pEnd);
            if (suffixView.length() > uFindLength)
            {
                suffixView = { suffixView.data(), uFindLength };
            }

            if (suffixView == find)
            {
                return uCounter;
            }
        }

        if ((uLead & 0x80) == 0)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF0) == 0xF0)
        {
            // Special/Historic UTF8: the 6 byte prefix must be tested before the 5 byte one,
            // because every 0xFC+ lead byte also carries the 0x08 bit.
            if ((uLead & 0x0C) == 0x0C)
            {
                nby = 6;
            }
            else if ((uLead & 0x08) == 0x08)
            {
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // stray continuation byte
            break;
        }

        if (pItr + nby > pEnd)
        {
            // truncated trailing sequence
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
|
|
|
|
static bool AuCodepointsContains(const AuROString &value, const AuROString &subpattern, CodepointOffset_t uStartPosition = {})
|
|
{
|
|
return AuCodepointsFindCodepointOffset(value, subpattern, uStartPosition) != AuROString::npos;
|
|
}
|
|
|
|
/**
 * Replaces, in place, every occurrence of `from` that begins on a codepoint boundary of
 * `str` with `to`. Replacement text is never rescanned.
 *
 * @return `str` itself, to allow chaining. An empty `from` leaves `str` untouched.
 */
static AuString &AuCodepointsReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty pattern matches at every position without consuming any input, so the loop
    // below would never finish (and would grow `str` without bound when `to` is non-empty).
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);

        // resume right after the inserted text
        uStartPosition += to.length();
    }

    return str;
}
|
|
|
|
/**
 * Splits `str` on every occurrence of `delim` that begins on a codepoint boundary.
 * The returned views alias `str` and are only valid while its storage is.
 *
 * @param bIgnoreEmpty  true (default): empty tokens are dropped;
 *                      false: empty tokens (leading, trailing, between adjacent delimiters) are kept
 * @return the tokens, in order. An empty `delim` yields the whole input as a single token.
 */
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
                                                  const AuROString &delim,
                                                  bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    if (delim.empty())
    {
        // An empty delimiter matches everywhere without consuming input; splitting on it
        // would never make progress, so hand back the input as one token.
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    tokens.reserve(str.size() / 16);

    AuUInt uPrev {};
    for (;;)
    {
        AuUInt uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
        const bool bLastToken = (uPos == AuROString::npos);
        if (bLastToken)
        {
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);

        // keep the token unless it is empty AND the caller asked for empties to be dropped
        if (!token.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(token);
        }

        if (bLastToken)
        {
            break;
        }

        uPrev = uPos + delim.length();
    }

    return tokens;
}
|
|
|
|
/**
 * Replaces, in place, every (byte-wise) occurrence of `from` in `str` with `to`.
 * Replacement text is never rescanned.
 *
 * @return `str` itself, to allow chaining. An empty `from` leaves `str` untouched.
 */
static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty pattern is found at every position without consuming any input, so the loop
    // below would never finish (and would grow `str` without bound when `to` is non-empty).
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = str.find(from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);

        // resume right after the inserted text
        uStartPosition += to.length();
    }

    return str;
}
|
|
|
|
// i told myself not to copy this, required a split function twice, now here we are :D
|
|
static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
|
|
{
|
|
AuList<AuROString> tokens;
|
|
AuUInt uPrev {}, uPos {};
|
|
tokens.reserve(str.size() / 16);
|
|
do
|
|
{
|
|
uPos = str.find(delim, uPrev);
|
|
if (uPos == AuROString::npos)
|
|
{
|
|
uPos = str.length();
|
|
}
|
|
auto token = str.substr(uPrev, uPos - uPrev);
|
|
if ((!token.empty()) && bIgnoreEmpty)
|
|
{
|
|
tokens.push_back(token);
|
|
}
|
|
uPrev = uPos + delim.length();
|
|
}
|
|
while (uPos < str.length() && uPrev < str.length());
|
|
return tokens;
|
|
}
|
|
|
|
/**
 * Splits `str` on every (byte-wise) occurrence of `delim`, returning owning strings rather
 * than views.
 *
 * @param bIgnoreEmpty  true (default): empty tokens are dropped;
 *                      false: empty tokens (leading, trailing, between adjacent delimiters) are kept
 * @return the tokens, in order. An empty `delim` yields the whole input as a single token.
 */
static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuString> tokens;

    if (delim.empty())
    {
        // An empty delimiter is found everywhere without consuming input; splitting on it
        // would never make progress, so hand back the input as one token.
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(str));
        }
        return tokens;
    }

    tokens.reserve(str.size() / 16);

    AuUInt prev = 0;
    for (;;)
    {
        AuUInt pos = str.find(delim, prev);
        const bool bLastToken = (pos == AuROString::npos);
        if (bLastToken)
        {
            pos = str.length();
        }

        auto token = str.substr(prev, pos - prev);

        // keep the token unless it is empty AND the caller asked for empties to be dropped
        if (!token.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(token));
        }

        if (bLastToken)
        {
            break;
        }

        prev = pos + delim.length();
    }

    return tokens;
}
|
|
|
|
// Fallback number-to-string routine used by AuToString when fmt is unavailable.
// Define AURORA_RUNTIME_TO_STRING before including this header to substitute another one.
#ifndef AURORA_RUNTIME_TO_STRING
    #define AURORA_RUNTIME_TO_STRING std::to_string
#endif
|
|
|
|
template <class T>
|
|
static auline AuString AuToString(const T &obj)
|
|
{
|
|
#if defined(_AUHAS_FMT)
|
|
// locale independent and better optimized!
|
|
return AuString(fmt::format("{}", obj));
|
|
#else
|
|
// TODO: to_chars (locale independent)
|
|
return AURORA_RUNTIME_TO_STRING(obj);
|
|
#endif
|
|
} |