AuROXTL/Include/auROXTL/auStringUtils.hpp

1055 lines
25 KiB
C++

/***
Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: auStringUtils.hpp
Date: 2022-2-1
File: AuroraUtils.hpp
File: auROXTLUtils.hpp
Date: 2021-6-9
Author: Reece
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsToLower, AuCodepointsToUpper,
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecode, AuCodepointsEncodeInto,
AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
AuCodepointsContains,
AuCodepointsReplaceAll, AuCodepointsSplitString (views),
AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
***/
#pragma once
// offset in bytes
/* using CodepointByteOffset_t = decltype(AuROString::npos); */
// offset in codepoints
/* using CodepointOffset_t = AuUInt; */
// Returns true when `subpattern` occurs somewhere inside `value`.
// This is a plain byte-wise search; see AuCodepointsContains for the
// codepoint-aligned variant.
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{
    const auto uMatchIndex = value.find(subpattern);
    return !(uMatchIndex == AuROString::npos);
}
// Returns true when the last `ending.size()` bytes of `value` equal `ending`.
// An `ending` longer than `value` can never match.
static auline constexpr bool AuEndsWith(AuROString const &value, AuROString const &ending)
{
    const auto uSuffixLength = ending.size();
    if (value.size() < uSuffixLength)
    {
        return false;
    }

    // Walk both views backwards; only the suffix-sized tail of `value` is visited.
    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
// Returns true when `value` begins with `starting`.
static auline constexpr bool AuStartsWith(AuROString const &value, AuROString const &starting)
{
#if !defined(AU_STRING_IS_TINYUTF_EXPERIMENT)
    // A reverse search anchored at index 0 can only ever match at the very beginning.
    return 0 == value.rfind(starting, 0);
#else
    return value.starts_with(starting);
#endif
}
// Applies `op` to every single-byte (ASCII) codepoint of `in` and copies every
// multi-byte UTF-8 sequence through unchanged.
//
// op: callable taking a char and returning the replacement char.
// in: UTF-8 input; sequences of up to 6 bytes (historic UTF-8) are recognised.
//
// Processing stops at the first byte that is not a valid lead byte (a stray
// continuation byte, 0xFE or 0xFF) or at a sequence that would run past the end
// of the input. The returned string then contains only the bytes processed so far.
// Continuation bytes themselves are not validated.
template <class T>
static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
{
    AuString ret;
    const auto uLength = in.length();
    ret.resize(uLength);

    const char *pItr = in.data();
    const char *pEnd = pItr + uLength;
    AuUInt uCounter {};

    while (pItr != pEnd)
    {
        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        // compare sizes instead of forming a pointer that may lie beyond the buffer
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        if (nby == 1)
        {
            ret[uCounter] = op(*pItr);
        }
        else
        {
            AuMemcpy(&ret[uCounter], pItr, nby);
        }

        uCounter += nby;
        pItr += nby;
    }

    // Drop the zero-filled tail left behind when processing stopped early.
    ret.resize(uCounter);
    return ret;
}
// Counts the UTF-8 codepoints in `in` by walking lead bytes.
//
// Sequences of up to 6 bytes (historic UTF-8) are recognised. Counting stops at
// the first byte that is not a valid lead byte (a stray continuation byte, 0xFE
// or 0xFF) or at a sequence that would run past the end of the view; the number
// of codepoints seen up to that point is returned.
// Continuation bytes themselves are not validated.
static auline CodepointOffset_t AuCodepointsCount(const AuROString &in)
{
    CodepointOffset_t uCounter {};
    const char *pItr = in.data();
    const char *pEnd = pItr + in.length();

    while (pItr != pEnd)
    {
        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        // compare sizes instead of forming a pointer that may lie beyond the buffer
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return uCounter;
}
// Returns the encoded length, in bytes, announced by the first byte of `in`.
//
// Returns 1-4 for standard UTF-8 lead bytes and 5 or 6 for the historic long
// forms. Returns 0 when `in` is empty or when its first byte is not a valid
// lead byte (a continuation byte, 0xFE or 0xFF).
// Only the first byte is inspected: the result is not checked against the
// remaining length of `in`.
static auline CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{
    if (in.empty())
    {
        return 0;
    }

    const auto uLead = static_cast<unsigned char>(in[0]);

    if (uLead < 0x80)
    {
        return 1;
    }
    if ((uLead & 0xE0) == 0xC0)
    {
        return 2;
    }
    if ((uLead & 0xF0) == 0xE0)
    {
        return 3;
    }
    if ((uLead & 0xF8) == 0xF0)
    {
        return 4;
    }
    if ((uLead & 0xFC) == 0xF8)
    {
        // Special/Historic UTF8
        return 5;
    }
    if ((uLead & 0xFE) == 0xFC)
    {
        // Special/Historic UTF8
        return 6;
    }

    return 0;
}
// Returns true when `c` is an ASCII letter ('A'-'Z' or 'a'-'z').
// Bytes >= 0x80 (UTF-8 lead/continuation bytes) are never alphabetic.
static auline bool AuIsAlpha(char c)
{
    // Fold upper case onto lower case, then do a single range check.
    // The arithmetic must be unsigned: with signed int, any value below 'a'
    // (digits, space, '@', control characters) gives a negative difference
    // that would wrongly compare as < 26.
    const unsigned uFolded = static_cast<unsigned char>(c) | 0x20u;
    return (uFolded - static_cast<unsigned>('a')) < 26u;
}
// Lower-cases a single ASCII character; anything AuIsAlpha rejects is returned as-is.
static auline char AuToLower(char c)
{
    if (!AuIsAlpha(c))
    {
        return c;
    }

    // Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and leaves 'a'-'z' untouched.
    return c | 0x20;
}
// Upper-cases a single ASCII character; anything AuIsAlpha rejects is returned as-is.
static auline char AuToUpper(char c)
{
    if (!AuIsAlpha(c))
    {
        return c;
    }

    // Clearing bit 0x20 maps 'a'-'z' onto 'A'-'Z' and leaves 'A'-'Z' untouched.
    return c & ~0x20;
}
// Returns a copy of `in` with its ASCII letters lower-cased.
// The work is delegated to AuCodepointsTransformASCIIOp, which hands only
// single-byte codepoints to the per-character AuToLower.
static auline AuString AuCodepointsToLower(const AuROString &in)
{
    // Naming the function-pointer type selects the char overload of AuToLower.
    char (*const pfnLowerChar)(char) = &AuToLower;
    return AuCodepointsTransformASCIIOp(pfnLowerChar, in);
}
// Returns a copy of `in` with its ASCII letters upper-cased.
// The work is delegated to AuCodepointsTransformASCIIOp, which hands only
// single-byte codepoints to the per-character AuToUpper.
static auline AuString AuCodepointsToUpper(const AuROString &in)
{
    // Naming the function-pointer type selects the char overload of AuToUpper.
    char (*const pfnUpperChar)(char) = &AuToUpper;
    return AuCodepointsTransformASCIIOp(pfnUpperChar, in);
}
// String overload of AuToLower: forwards to AuCodepointsToLower.
static auline AuString AuToLower(const AuROString &in)
{
    AuString lowered = AuCodepointsToLower(in);
    return lowered;
}
// String overload of AuToUpper: forwards to AuCodepointsToUpper.
static auline AuString AuToUpper(const AuROString &in)
{
    AuString uppered = AuCodepointsToUpper(in);
    return uppered;
}
// Decodes the UTF-8 text `in` into a list of codepoint values.
//
// Sequences of up to 6 bytes are accepted. An EMPTY list is returned - and any
// codepoints decoded so far are discarded - when:
//  * a continuation byte appears where a lead byte is expected,
//  * a lead byte announces more than 6 bytes,
//  * a sequence runs past the end of the input, or
//  * a trailing byte is not of the form 10xxxxxx.
static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
{
    AuList<AuUInt32> ret;
    if (in.empty())
    {
        return ret;
    }

    const auto uLength = in.length();
    // One codepoint per byte is the upper bound.
    ret.reserve(uLength);

    const char *pCursor = in.data();
    const char *const pTail = pCursor + uLength;

    while (pCursor < pTail)
    {
        AuUInt32 uCodepoint = *pCursor;

        if (uCodepoint > 0x7FU)
        {
            // A lead byte must have its two top bits set.
            if ((*pCursor & 0xC0U) != 0xC0U)
            {
                return {};
            }

            // The sequence length equals the number of leading one-bits.
            AuUInt32 uSeqLength {};
            AuUInt8 uShifted = *pCursor;
            while ((uShifted & 0x80U) != 0)
            {
                uShifted <<= 1;
                ++uSeqLength;
            }

            if (uSeqLength > 6)
            {
                return {};
            }

            if (AuUInt(pTail - pCursor) < AuUInt(uSeqLength))
            {
                return {};
            }

            // Payload bits of the lead byte sit below the length marker and its 0 terminator.
            uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLength + 1));

            for (AuUInt32 uIndex = 1; uIndex < uSeqLength; ++uIndex)
            {
                const char chTrail = pCursor[uIndex];
                if ((chTrail & 0xC0U) != 0x80U)
                {
                    return {};
                }

                uCodepoint = (uCodepoint << 6) | (chTrail & 0x3FU);
            }

            pCursor += uSeqLength;
        }
        else
        {
            // plain ASCII
            ++pCursor;
        }

        ret.push_back(uCodepoint);
    }

    return ret;
}
// Appends the UTF-8 encoding of `uCodepoint` to `out`.
//
// Values below 0x80000000 are written using 1 to 6 bytes (the 5 and 6 byte
// forms are the historic long encodings). Values of 0x80000000 and above
// append nothing.
static void AuCodepointsEncodeInto(AuUInt32 uCodepoint, AuString &out)
{
    AuUInt32 uByteCount {};
    AuUInt32 uLeadMarker {};

    if (uCodepoint < 0x80)
    {
        uByteCount = 1;
        uLeadMarker = 0x00;
    }
    else if (uCodepoint < 0x800)
    {
        uByteCount = 2;
        uLeadMarker = 0xc0;
    }
    else if (uCodepoint < 0x10000)
    {
        uByteCount = 3;
        uLeadMarker = 0xe0;
    }
    else if (uCodepoint < 0x200000)
    {
        uByteCount = 4;
        uLeadMarker = 0xf0;
    }
    else if (uCodepoint < 0x4000000)
    {
        uByteCount = 5;
        uLeadMarker = 0xf8;
    }
    else if (uCodepoint < 0x80000000)
    {
        uByteCount = 6;
        uLeadMarker = 0xfc;
    }
    else
    {
        // not representable: leave `out` untouched
        return;
    }

    const auto uBase = out.size();
    out.resize(uBase + uByteCount);

    // Emit the continuation bytes back to front, six payload bits each;
    // whatever remains afterwards belongs to the lead byte.
    AuUInt32 uRemaining = uCodepoint;
    for (AuUInt32 uIndex = uByteCount - 1; uIndex > 0; --uIndex)
    {
        out[uBase + uIndex] = static_cast<AuUInt8>((uRemaining & 0x3f) | 0x80);
        uRemaining >>= 6;
    }

    out[uBase] = static_cast<AuUInt8>(uRemaining | uLeadMarker);
}
// Decodes `in` codepoint by codepoint, maps each value through `op`, and
// re-encodes the results into a new string via AuCodepointsEncodeInto.
//
// op: callable taking a codepoint value and returning the replacement value.
//
// An EMPTY string is returned - discarding everything produced so far - when the
// input is malformed: a continuation byte in lead position, a lead byte that
// announces more than 6 bytes, a sequence running past the end of the input, or
// a trailing byte that is not of the form 10xxxxxx.
template <class T>
static AuString AuCodepointsTransform(T op, const AuROString &in)
{
    AuString ret;
    if (in.empty())
    {
        return ret;
    }

    const auto uLength = in.length();
    ret.reserve(uLength);

    const char *pCursor = in.data();
    const char *const pTail = pCursor + uLength;

    while (pCursor < pTail)
    {
        AuUInt32 uCodepoint = *pCursor;

        if (uCodepoint > 0x7FU)
        {
            // A lead byte must have its two top bits set.
            if ((*pCursor & 0xC0U) != 0xC0U)
            {
                return {};
            }

            // The sequence length equals the number of leading one-bits.
            AuUInt32 uSeqLength {};
            AuUInt8 uShifted = *pCursor;
            while ((uShifted & 0x80U) != 0)
            {
                uShifted <<= 1;
                ++uSeqLength;
            }

            if (uSeqLength > 6)
            {
                return {};
            }

            if (AuUInt(pTail - pCursor) < AuUInt(uSeqLength))
            {
                return {};
            }

            // Payload bits of the lead byte sit below the length marker and its 0 terminator.
            uCodepoint = *pCursor & (AuUInt8(0xFFU) >> (uSeqLength + 1));

            for (AuUInt32 uIndex = 1; uIndex < uSeqLength; ++uIndex)
            {
                const char chTrail = pCursor[uIndex];
                if ((chTrail & 0xC0U) != 0x80U)
                {
                    return {};
                }

                uCodepoint = (uCodepoint << 6) | (chTrail & 0x3FU);
            }

            pCursor += uSeqLength;
        }
        else
        {
            // plain ASCII
            ++pCursor;
        }

        uCodepoint = op(uCodepoint);
        AuCodepointsEncodeInto(uCodepoint, ret);
    }

    return ret;
}
// Translates a codepoint index into the byte offset at which that codepoint starts.
//
// in:              UTF-8 text; sequences of up to 6 bytes (historic UTF-8) are recognised.
// uCodepointIndex: zero-based index of the codepoint to locate.
//
// Returns AuROString::npos when the index is not reached, either because the
// text holds fewer codepoints (an index equal to the codepoint count is also
// npos) or because scanning stopped early at an invalid lead byte (continuation
// byte, 0xFE, 0xFF) or at a truncated sequence.
static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
                                                              CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};
    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        // compare sizes instead of forming a pointer that may lie beyond the buffer
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
// Returns the byte offset just past the codepoint at `uCodepointIndex`, i.e. the
// number of bytes occupied by the first `uCodepointIndex + 1` codepoints of `in`.
//
// in:              UTF-8 text; sequences of up to 6 bytes (historic UTF-8) are recognised.
// uCodepointIndex: zero-based index of the last codepoint to include.
//
// Returns AuROString::npos when the index is not reached, either because the
// text holds fewer codepoints or because scanning stopped early at an invalid
// lead byte (continuation byte, 0xFE, 0xFF) or at a truncated sequence.
static auline CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
                                                              CodepointOffset_t uCodepointIndex)
{
    AuUInt uCounter {};
    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        // compare sizes instead of forming a pointer that may lie beyond the buffer
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        // Step over the codepoint first: the result is its END offset.
        pItr += nby;

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        uCounter++;
    }

    return AuROString::npos;
}
// Finds the first occurrence of `find` in `in` that begins on a codepoint
// boundary at or after byte offset `uStartPosition`, and returns its byte offset.
//
// The text is always walked from its beginning, codepoint by codepoint, so
// `uStartPosition` does not have to be a codepoint boundary itself. Sequences of
// up to 6 bytes (historic UTF-8) are recognised.
//
// Returns AuROString::npos when there is no such occurrence, or when scanning
// stopped early at an invalid lead byte (continuation byte, 0xFE, 0xFF) or at a
// truncated sequence before a match was found.
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
                                                        const AuROString &find,
                                                        CodepointByteOffset_t uStartPosition = {})
{
    const auto uFindLength = find.length();
    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const auto uRemaining = AuUInt(pEnd - pItr);

        // Fewer bytes left than the needle is long: no later position can match.
        if (uRemaining < uFindLength)
        {
            break;
        }

        CodepointByteOffset_t uByteOffset(pItr - pStart);
        if (uByteOffset >= uStartPosition)
        {
            if (AuROString(pItr, uFindLength) == find)
            {
                return uByteOffset;
            }
        }

        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        if (uRemaining < AuUInt(nby))
        {
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Finds the first occurrence of `find` in `in`, scanning codepoint by codepoint
// starting DIRECTLY at byte offset `uStartPosition`, and returns its byte offset.
//
// Unlike AuCodepointsFindByteOffset the text before `uStartPosition` is not
// walked, so the caller must pass an offset that lies on a codepoint boundary;
// otherwise scanning begins in the middle of a sequence. Sequences of up to
// 6 bytes (historic UTF-8) are recognised.
//
// Returns AuROString::npos when there is no occurrence, when `uStartPosition`
// lies beyond the end of `in`, or when scanning stopped early at an invalid lead
// byte (continuation byte, 0xFE, 0xFF) or at a truncated sequence.
static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
                                                                        const AuROString &find,
                                                                        CodepointByteOffset_t uStartPosition)
{
    const auto uLength = in.length();
    const auto uFindLength = find.length();

    // Never start reading outside of the view.
    if (uStartPosition > uLength)
    {
        return AuROString::npos;
    }

    const char *pStart = in.data();
    const char *pItr = pStart + uStartPosition;
    const char *pEnd = pStart + uLength;

    while (pItr != pEnd)
    {
        const auto uRemaining = AuUInt(pEnd - pItr);

        // Fewer bytes left than the needle is long: no later position can match.
        if (uRemaining < uFindLength)
        {
            break;
        }

        if (AuROString(pItr, uFindLength) == find)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        if (uRemaining < AuUInt(nby))
        {
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Returns the byte offset of the codepoint that precedes codepoint index
// `uStartPosition`, i.e. the start of codepoint number `uStartPosition - 1`.
//
// in:             UTF-8 text; sequences of up to 6 bytes (historic UTF-8) are recognised.
// uStartPosition: codepoint index whose predecessor is wanted.
//
// Returns AuROString::npos when `uStartPosition` is 0 (nothing precedes the
// first codepoint), when the text holds fewer than `uStartPosition` codepoints,
// or when scanning stopped early at an invalid lead byte (continuation byte,
// 0xFE, 0xFF) or at a truncated sequence.
static CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(const AuROString &in,
                                                                               CodepointOffset_t uStartPosition = {})
{
    if (uStartPosition == 0)
    {
        return AuROString::npos;
    }

    AuUInt uCounter = 0;
    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        // compare sizes instead of forming a pointer that may lie beyond the buffer
        if (AuUInt(pEnd - pItr) < AuUInt(nby))
        {
            break;
        }

        // The counter is bumped BEFORE the comparison while pItr still points at
        // the start of the current codepoint, yielding the predecessor's offset.
        uCounter++;
        if (uCounter == uStartPosition)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Walks backwards from byte offset `uStartPosition` and returns the byte offset
// at which the preceding codepoint starts.
//
// in:             UTF-8 text.
// uStartPosition: byte offset to step back from.
//
// If the byte directly before `uStartPosition` is ASCII, its offset is returned.
// Otherwise continuation bytes (10xxxxxx) are skipped backwards until a byte
// that is not a continuation byte - or the start of the text - is reached.
//
// Returns AuROString::npos when `uStartPosition` is 0, when it lies beyond the
// end of `in`, or when the byte the walk stops on is not a multi-byte lead byte.
static CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
                                                                                   CodepointByteOffset_t uStartPosition = {})
{
    // Validate first: forming `pStart - 1` or reading past the view is undefined.
    if (uStartPosition == 0 || uStartPosition > in.length())
    {
        return AuROString::npos;
    }

    const char *pStart = in.data();
    const char *pItr = pStart + (uStartPosition - 1);

    if ((*pItr & 0x80) == 0)
    {
        // single-byte codepoint
        return uStartPosition - 1;
    }

    while (pItr != pStart && (*pItr & 0xC0U) == 0x80U)
    {
        pItr--;
    }

    if ((*pItr & 0xC0U) != 0xC0U)
    {
        // ran into the start of the text or an ASCII byte without finding a lead byte
        return AuROString::npos;
    }

    return CodepointByteOffset_t(pItr - pStart);
}
// Translates a byte offset into a codepoint offset by counting the codepoints
// contained in the first `uBytePosition` bytes of `in`.
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         CodepointByteOffset_t uBytePosition)
{
    const auto prefix = in.substr(0, uBytePosition);
    return AuCodepointsCount(prefix);
}
// Finds the first occurrence of `find` in `in` that begins at codepoint index
// `uStartPosition` or later, and returns the CODEPOINT index of the match.
//
// Sequences of up to 6 bytes (historic UTF-8) are recognised.
//
// Returns AuROString::npos when there is no such occurrence, or when scanning
// stopped early at an invalid lead byte (continuation byte, 0xFE, 0xFF) or at a
// truncated sequence before a match was found.
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         const AuROString &find,
                                                         CodepointOffset_t uStartPosition = {})
{
    AuUInt uCounter {};
    const auto uFindLength = find.length();
    const char *pStart = in.data();
    const char *pItr = pStart;
    const char *pEnd = pStart + in.length();

    while (pItr != pEnd)
    {
        const auto uRemaining = AuUInt(pEnd - pItr);

        // Fewer bytes left than the needle is long: no later position can match.
        if (uRemaining < uFindLength)
        {
            break;
        }

        if (uCounter >= uStartPosition)
        {
            if (AuROString(pItr, uFindLength) == find)
            {
                return uCounter;
            }
        }

        const auto uLead = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if (uLead < 0x80)
        {
            nby = 1;
        }
        else if ((uLead & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((uLead & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((uLead & 0xF8) == 0xF0)
        {
            nby = 4;
        }
        else if ((uLead & 0xFC) == 0xF8)
        {
            // Special/Historic UTF8
            nby = 5;
        }
        else if ((uLead & 0xFE) == 0xFC)
        {
            // Special/Historic UTF8
            nby = 6;
        }
        else
        {
            // continuation byte in lead position, or 0xFE/0xFF
            break;
        }

        if (uRemaining < AuUInt(nby))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
// Returns true when `subpattern` occurs in `value` on a codepoint boundary at
// codepoint index `uStartPosition` or later (see AuCodepointsFindCodepointOffset).
static bool AuCodepointsContains(const AuROString &value, const AuROString &subpattern, CodepointOffset_t uStartPosition = {})
{
    const auto uMatchOffset = AuCodepointsFindCodepointOffset(value, subpattern, uStartPosition);
    return !(uMatchOffset == AuROString::npos);
}
// Replaces, in place, every codepoint-aligned occurrence of `from` in `str`
// with `to`, and returns `str`.
//
// The search resumes right after each inserted replacement, so text introduced
// by `to` is never re-scanned. An empty `from` leaves `str` unchanged.
static AuString &AuCodepointsReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty pattern matches without consuming input; with an empty `to`
    // the search position would never advance.
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        uStartPosition += to.length();
    }

    return str;
}
// Splits `str` at every codepoint-aligned occurrence of `delim` and returns the
// pieces as views into `str` (no copies are made; the views share the lifetime
// of the memory `str` refers to).
//
// bIgnoreEmpty: when true (default) empty pieces are dropped; when false they
//               are kept, including leading and trailing ones.
//
// An empty `delim` never splits: the whole of `str` is returned as a single piece.
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
                                                  const AuROString &delim,
                                                  bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    // An empty delimiter matches without consuming input and would loop forever.
    if (delim.empty())
    {
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    tokens.reserve(str.size() / 16);

    AuUInt uPrev {};
    for (;;)
    {
        AuUInt uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
        const bool bLastToken = (uPos == AuROString::npos);
        if (bLastToken)
        {
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // keep the piece unless it is empty AND empties are to be ignored
        if (!token.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(token);
        }

        if (bLastToken)
        {
            break;
        }

        uPrev = uPos + delim.length();
    }

    return tokens;
}
// Replaces, in place, every occurrence of `from` in `str` with `to` (byte-wise
// search), and returns `str`.
//
// The search resumes right after each inserted replacement, so text introduced
// by `to` is never re-scanned. An empty `from` leaves `str` unchanged.
static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty pattern matches at every position without consuming input,
    // which would make the loop below run (and grow `str`) without end.
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = str.find(from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        uStartPosition += to.length();
    }

    return str;
}
// i told myself not to copy this, required a split function twice, now here we are :D
// Splits `str` at every occurrence of `delim` (byte-wise search) and returns the
// pieces as views into `str` (no copies are made; the views share the lifetime
// of the memory `str` refers to).
//
// bIgnoreEmpty: when true (default) empty pieces are dropped; when false they
//               are kept, including leading and trailing ones.
//
// An empty `delim` never splits: the whole of `str` is returned as a single piece.
static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    // An empty delimiter matches without consuming input and would loop forever.
    if (delim.empty())
    {
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    tokens.reserve(str.size() / 16);

    AuUInt uPrev {};
    for (;;)
    {
        AuUInt uPos = str.find(delim, uPrev);
        const bool bLastToken = (uPos == AuROString::npos);
        if (bLastToken)
        {
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // keep the piece unless it is empty AND empties are to be ignored
        if (!token.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(token);
        }

        if (bLastToken)
        {
            break;
        }

        uPrev = uPos + delim.length();
    }

    return tokens;
}
// Splits `str` at every occurrence of `delim` (byte-wise search) and returns the
// pieces as owning AuString copies, unlike AuSplitString which returns views.
//
// bIgnoreEmpty: when true (default) empty pieces are dropped; when false they
//               are kept, including leading and trailing ones.
//
// An empty `delim` never splits: the whole of `str` is returned as a single piece.
static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{
    AuList<AuString> tokens;

    // An empty delimiter matches without consuming input and would loop forever.
    if (delim.empty())
    {
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(str));
        }
        return tokens;
    }

    tokens.reserve(str.size() / 16);

    AuUInt prev = 0;
    for (;;)
    {
        AuUInt pos = str.find(delim, prev);
        const bool bLastToken = (pos == AuROString::npos);
        if (bLastToken)
        {
            pos = str.length();
        }

        auto token = str.substr(prev, pos - prev);
        // keep the piece unless it is empty AND empties are to be ignored
        if (!token.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(token));
        }

        if (bLastToken)
        {
            break;
        }

        prev = pos + delim.length();
    }

    return tokens;
}
// Conversion routine AuToString falls back to when _AUHAS_FMT is not defined.
// Defaults to std::to_string; a definition made before this point takes precedence.
#if !defined(AURORA_RUNTIME_TO_STRING)
#define AURORA_RUNTIME_TO_STRING std::to_string
#endif
// Converts `obj` to its string representation.
// With _AUHAS_FMT defined the value is formatted via fmt::format("{}", obj);
// otherwise it is handed to AURORA_RUNTIME_TO_STRING (std::to_string unless overridden).
template <class T>
static auline AuString AuToString(const T &obj)
{
#if defined(_AUHAS_FMT)
// locale independent and better optimized!
return AuString(fmt::format("{}", obj));
#else
// TODO: to_chars (locale independent)
return AURORA_RUNTIME_TO_STRING(obj);
#endif
}