[+] AuCodepointsGetByteOffset

[+] AuCodepointsGetByteLength
[+] AuCodepointsFindByteOffset
[+] AuCodepointsFindByteOffsetUnsafe
[+] AuCodepointsFindCodepointOffset
[+] AuCodepointsContains
[+] AuCodepointsReplaceAll
[+] AuCodepointsSplitString
This commit is contained in:
Reece Wilson 2024-04-19 22:08:32 +01:00
parent b1d6eb0d80
commit 72853a54ab

View File

@ -7,9 +7,28 @@
File: auROXTLUtils.hpp File: auROXTLUtils.hpp
Date: 2021-6-9 Date: 2021-6-9
Author: Reece Author: Reece
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
AuCodepoints**** supports the original UTF-8 specification: sequences of up to 6 bytes, i.e. codepoints up to 2^31 - 1 (values above U+10FFFF cannot be represented in UTF-16)
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsToLower, AuCodepointsToUpper,
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecode, AuCodepointsEncodeInto,
AuCodepointsGetByteOffset(CodepointOffset_t), AuCodepointsGetByteLength(CodepointOffset_t),
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset,
AuCodepointsContains,
AuCodepointsReplaceAll, AuCodepointsSplitString (views)
For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
***/ ***/
#pragma once #pragma once
// Offset (or length) measured in bytes; uses the same type as AuROString::npos so that npos can be returned as the "not found" sentinel.
// NOTE(review): if AuROString::npos is declared const/constexpr, decltype yields a const-qualified type here - confirm that is intended.
using CodepointByteOffset_t = decltype(AuROString::npos);
// Offset measured in whole codepoints (not bytes).
using CodepointOffset_t = AuUInt;
static auline bool AuStringContains(const AuROString &value, const AuROString &subpattern) static auline bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{ {
return value.find(subpattern) != AuROString::npos; return value.find(subpattern) != AuROString::npos;
@ -64,10 +83,12 @@ static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8
nby = 5; nby = 5;
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8
nby = 6; nby = 6;
} }
else else
@ -101,9 +122,9 @@ static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
return ret; return ret;
} }
static auline AuUInt AuCodepointsCount(const AuROString &in) static auline CodepointOffset_t AuCodepointsCount(const AuROString &in)
{ {
AuUInt uCounter {}; CodepointOffset_t uCounter {};
auto uLength = in.length(); auto uLength = in.length();
const char *pItr = in.data(); const char *pItr = in.data();
@ -131,6 +152,7 @@ static auline AuUInt AuCodepointsCount(const AuROString &in)
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8
nby = 5; nby = 5;
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
@ -160,7 +182,7 @@ static auline AuUInt AuCodepointsCount(const AuROString &in)
return uCounter; return uCounter;
} }
static auline AuUInt AuCodepointsNextLength(const AuROString &in) static auline CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{ {
if (in.length()) if (in.length())
{ {
@ -183,6 +205,7 @@ static auline AuUInt AuCodepointsNextLength(const AuROString &in)
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8
return 5; return 5;
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
@ -200,14 +223,19 @@ static auline AuUInt AuCodepointsNextLength(const AuROString &in)
return 0; return 0;
} }
// Returns true when c is an ASCII letter ('A'-'Z' or 'a'-'z'); false for every other
// value, including NUL and bytes with the high bit set.
static auline bool AuIsAlpha(char c)
{
    // Setting bit 0x20 folds upper case onto lower case; rebasing onto 'a' then leaves
    // letters in [0, 26). The arithmetic must be unsigned: after integral promotion a
    // signed subtraction yields a negative value for anything below 'a' (digits, '@',
    // space, NUL, ...), and a negative value would pass the "< 26" test.
    const unsigned int uFolded = static_cast<unsigned int>(static_cast<unsigned char>(c)) | 0x20u;
    return (uFolded - static_cast<unsigned int>('a')) < 26u;
}
static auline char AuToLower(char c) static auline char AuToLower(char c)
{ {
return c ? c | 0x20 : 0; return AuIsAlpha(c) ? c | 0x20 : c;
} }
static auline char AuToUpper(char c) static auline char AuToUpper(char c)
{ {
return c & ~0x20; return AuIsAlpha(c) ? c & ~0x20 : c;
} }
static auline AuString AuCodepointsToLower(const AuROString &in) static auline AuString AuCodepointsToLower(const AuROString &in)
@ -424,6 +452,417 @@ static AuString AuCodepointsTransform(T op, const AuROString &in)
return ret; return ret;
} }
// Translates a codepoint index into the byte offset at which that codepoint starts.
//
// in              - UTF-8 text to walk from its first byte.
// uCodepointIndex - zero-based index of the codepoint to locate.
//
// Returns the byte offset of the first byte of the requested codepoint, or
// AuROString::npos when the index is at or past the number of decodable codepoints,
// when an invalid lead byte is met first, or when a sequence is truncated by the end
// of the input.
static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
                                                              CodepointOffset_t uCodepointIndex)
{
    CodepointOffset_t uCounter {};
    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + in.length();

    while (pItr != pEnd)
    {
        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        // Derive the sequence length from the lead byte.
        const auto ch = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if ((ch & 0x80) == 0x00)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // The 6-byte pattern (both 0x0C bits set) is a superset of the 5-byte
            // pattern (0x08 set), so it has to be tested first to be reachable.
            if ((ch & 0x0C) == 0x0C)
            {
                // Special/Historic UTF8
                nby = 6;
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Special/Historic UTF8
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte in lead position: not decodable.
            break;
        }

        // Compare lengths rather than forming a pointer beyond the end of the buffer.
        if (nby > static_cast<AuUInt>(pEnd - pItr))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
// Returns the number of bytes spanned by the codepoints [0, uCodepointIndex] inclusive,
// i.e. the byte offset one past the end of the codepoint at uCodepointIndex.
//
// in              - UTF-8 text to walk from its first byte.
// uCodepointIndex - zero-based index of the last codepoint to include.
//
// Returns AuROString::npos when the index is at or past the number of decodable
// codepoints, when an invalid lead byte is met first, or when a sequence is truncated
// by the end of the input.
static auline CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
                                                              CodepointOffset_t uCodepointIndex)
{
    CodepointOffset_t uCounter {};
    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + in.length();

    while (pItr != pEnd)
    {
        // Derive the sequence length from the lead byte.
        const auto ch = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if ((ch & 0x80) == 0x00)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // The 6-byte pattern (both 0x0C bits set) is a superset of the 5-byte
            // pattern (0x08 set), so it has to be tested first to be reachable.
            if ((ch & 0x0C) == 0x0C)
            {
                // Special/Historic UTF8
                nby = 6;
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Special/Historic UTF8
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte in lead position: not decodable.
            break;
        }

        // Compare lengths rather than forming a pointer beyond the end of the buffer.
        if (nby > static_cast<AuUInt>(pEnd - pItr))
        {
            break;
        }

        // Step over the codepoint first: the result includes the indexed codepoint itself.
        pItr += nby;

        if (uCounter == uCodepointIndex)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        uCounter++;
    }

    return AuROString::npos;
}
// Searches `in` for `find`, only considering positions that begin a codepoint, and
// reports the match position in bytes.
//
// in             - UTF-8 text to search; always decoded from its first byte.
// find           - byte sequence to look for.
// uStartPosition - minimum byte offset at which a match may begin; earlier codepoint
//                  boundaries are walked over but not compared.
//
// Returns the byte offset of the first match, or AuROString::npos when there is no
// match or when decoding stops early on an invalid lead byte / truncated sequence.
// An empty `in` never matches, even when `find` is empty.
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
                                                        const AuROString &find,
                                                        CodepointByteOffset_t uStartPosition = {})
{
    const auto uFindLength = find.length();
    const char *pStart = in.data();
    const char *pItr   = pStart;
    const char *pEnd   = pStart + in.length();

    while (pItr != pEnd)
    {
        // The remaining tail only shrinks; once it is shorter than the needle no
        // later position can match either.
        if (static_cast<AuUInt>(pEnd - pItr) < uFindLength)
        {
            break;
        }

        CodepointByteOffset_t uByteOffset(pItr - pStart);
        if (uByteOffset >= uStartPosition)
        {
            AuROString candidate(pItr, uFindLength);
            if (candidate == find)
            {
                return uByteOffset;
            }
        }

        // Derive the sequence length from the lead byte.
        const auto ch = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if ((ch & 0x80) == 0x00)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // The 6-byte pattern (both 0x0C bits set) is a superset of the 5-byte
            // pattern (0x08 set), so it has to be tested first to be reachable.
            if ((ch & 0x0C) == 0x0C)
            {
                // Special/Historic UTF8
                nby = 6;
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Special/Historic UTF8
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte in lead position: not decodable.
            break;
        }

        // Compare lengths rather than forming a pointer beyond the end of the buffer.
        if (nby > static_cast<AuUInt>(pEnd - pItr))
        {
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Variant of the codepoint-aligned search that starts decoding directly at
// `uStartPosition` instead of walking from the beginning of `in`. The start position
// is not checked to lie on a codepoint boundary; the caller has to guarantee that.
//
// in             - UTF-8 text to search.
// find           - byte sequence to look for.
// uStartPosition - byte offset at which decoding and matching begin.
//
// Returns the byte offset (relative to the start of `in`) of the first match, or
// AuROString::npos when there is no match, when uStartPosition lies beyond the end of
// `in`, or when decoding stops early on an invalid lead byte / truncated sequence.
static CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
                                                              const AuROString &find,
                                                              CodepointByteOffset_t uStartPosition = {})
{
    const auto uLength     = in.length();
    const auto uFindLength = find.length();

    // A start beyond the end would place the cursor past pEnd, and the
    // `pItr != pEnd` loop condition would then never stop the walk.
    if (uStartPosition > uLength)
    {
        return AuROString::npos;
    }

    const char *pStart = in.data();
    const char *pItr   = pStart + uStartPosition;
    const char *pEnd   = pStart + uLength;

    while (pItr != pEnd)
    {
        // The remaining tail only shrinks; once it is shorter than the needle no
        // later position can match either.
        if (static_cast<AuUInt>(pEnd - pItr) < uFindLength)
        {
            break;
        }

        AuROString candidate(pItr, uFindLength);
        if (candidate == find)
        {
            return CodepointByteOffset_t(pItr - pStart);
        }

        // Derive the sequence length from the lead byte.
        const auto ch = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if ((ch & 0x80) == 0x00)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // The 6-byte pattern (both 0x0C bits set) is a superset of the 5-byte
            // pattern (0x08 set), so it has to be tested first to be reachable.
            if ((ch & 0x0C) == 0x0C)
            {
                // Special/Historic UTF8
                nby = 6;
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Special/Historic UTF8
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte in lead position: not decodable.
            break;
        }

        // Compare lengths rather than forming a pointer beyond the end of the buffer.
        if (nby > static_cast<AuUInt>(pEnd - pItr))
        {
            break;
        }

        pItr += nby;
    }

    return AuROString::npos;
}
// Searches `in` for `find`, only considering positions that begin a codepoint, and
// reports the match position as a codepoint index rather than a byte offset.
//
// in             - UTF-8 text to search; always decoded from its first byte.
// find           - byte sequence to look for.
// uStartPosition - minimum codepoint index at which a match may begin.
//
// Returns the zero-based codepoint index of the first match, or AuROString::npos when
// there is no match or when decoding stops early on an invalid lead byte / truncated
// sequence. An empty `in` never matches, even when `find` is empty.
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
                                                         const AuROString &find,
                                                         CodepointOffset_t uStartPosition = {})
{
    CodepointOffset_t uCounter {};
    const auto uFindLength = find.length();
    const char *pItr = in.data();
    const char *pEnd = pItr + in.length();

    while (pItr != pEnd)
    {
        // The remaining tail only shrinks; once it is shorter than the needle no
        // later position can match either.
        if (static_cast<AuUInt>(pEnd - pItr) < uFindLength)
        {
            break;
        }

        if (uCounter >= uStartPosition)
        {
            AuROString candidate(pItr, uFindLength);
            if (candidate == find)
            {
                return uCounter;
            }
        }

        // Derive the sequence length from the lead byte.
        const auto ch = static_cast<unsigned char>(*pItr);
        AuUInt32 nby {};

        if ((ch & 0x80) == 0x00)
        {
            nby = 1;
        }
        else if ((ch & 0xE0) == 0xC0)
        {
            nby = 2;
        }
        else if ((ch & 0xF0) == 0xE0)
        {
            nby = 3;
        }
        else if ((ch & 0xF0) == 0xF0)
        {
            // The 6-byte pattern (both 0x0C bits set) is a superset of the 5-byte
            // pattern (0x08 set), so it has to be tested first to be reachable.
            if ((ch & 0x0C) == 0x0C)
            {
                // Special/Historic UTF8
                nby = 6;
            }
            else if ((ch & 0x08) == 0x08)
            {
                // Special/Historic UTF8
                nby = 5;
            }
            else
            {
                nby = 4;
            }
        }
        else
        {
            // Continuation byte in lead position: not decodable.
            break;
        }

        // Compare lengths rather than forming a pointer beyond the end of the buffer.
        if (nby > static_cast<AuUInt>(pEnd - pItr))
        {
            break;
        }

        uCounter++;
        pItr += nby;
    }

    return AuROString::npos;
}
// Reports whether `subpattern` occurs in `value` at a codepoint boundary located at or
// after the codepoint index `uStartPosition`.
static bool AuCodepointsContains(const AuROString &value, const AuROString &subpattern, CodepointOffset_t uStartPosition = {})
{
    const auto uMatchIndex = AuCodepointsFindCodepointOffset(value, subpattern, uStartPosition);
    if (uMatchIndex == AuROString::npos)
    {
        return false;
    }
    return true;
}
// Replaces, in place, every codepoint-aligned occurrence of `from` in `str` with `to`.
//
// str  - string to modify.
// from - byte sequence to search for; when empty, `str` is left untouched.
// to   - replacement text; text inserted by a replacement is not searched again.
//
// Returns a reference to `str`.
static AuString &AuCodepointsReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{
    // An empty pattern matches at every position, so the loop below would never
    // terminate (and would keep growing `str` whenever `to` is non-empty).
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        // Resume just past the inserted text.
        uStartPosition += to.length();
    }
    return str;
}
// Splits `str` on codepoint-aligned occurrences of `delim` and returns views into `str`
// (the views do not own their text; `str`'s storage must outlive them).
//
// str          - text to split.
// delim        - delimiter sequence; when empty, no split is performed and `str` is
//                returned as the only token.
// bIgnoreEmpty - when true, empty tokens are dropped; when false, they are kept.
//
// A delimiter at the very end of `str` does not produce a trailing empty token.
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
                                                  const AuROString &delim,
                                                  bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    // An empty delimiter matches without advancing, which would spin forever below.
    if (delim.empty())
    {
        if (!str.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    tokens.reserve(str.size() / 16);
    do
    {
        uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // No further delimiter: the remainder is the last token.
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // Keep the token unless it is empty AND empty tokens are being ignored.
        // (Requiring bIgnoreEmpty to be true here would discard every token when the
        // caller asks for empty tokens to be kept.)
        if (!token.empty() || !bIgnoreEmpty)
        {
            tokens.push_back(token);
        }

        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());

    return tokens;
}
static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to) static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
{ {
AuUInt uStartPosition {}; AuUInt uStartPosition {};
@ -436,30 +875,30 @@ static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROS
} }
// i told myself not to copy this, required a split function twice, now here we are :D // i told myself not to copy this, required a split function twice, now here we are :D
static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool ignoreEmpty = true) static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{ {
AuList<AuROString> tokens; AuList<AuROString> tokens;
AuUInt prev = 0, pos = 0; AuUInt uPrev {}, uPos {};
tokens.reserve(str.size() / 16); tokens.reserve(str.size() / 16);
do do
{ {
pos = str.find(delim, prev); uPos = str.find(delim, uPrev);
if (pos == AuROString::npos) if (uPos == AuROString::npos)
{ {
pos = str.length(); uPos = str.length();
} }
auto token = str.substr(prev, pos - prev); auto token = str.substr(uPrev, uPos - uPrev);
if ((!token.empty()) && ignoreEmpty) if ((!token.empty()) && bIgnoreEmpty)
{ {
tokens.push_back(token); tokens.push_back(token);
} }
prev = pos + delim.length(); uPrev = uPos + delim.length();
} }
while (pos < str.length() && prev < str.length()); while (uPos < str.length() && uPrev < str.length());
return tokens; return tokens;
} }
static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool ignoreEmpty = true) static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
{ {
AuList<AuString> tokens; AuList<AuString> tokens;
AuUInt prev = 0, pos = 0; AuUInt prev = 0, pos = 0;
@ -472,7 +911,7 @@ static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROStr
pos = str.length(); pos = str.length();
} }
auto token = str.substr(prev, pos - prev); auto token = str.substr(prev, pos - prev);
if ((!token.empty()) && ignoreEmpty) if ((!token.empty()) && bIgnoreEmpty)
{ {
tokens.push_back(AuString(token)); tokens.push_back(AuString(token));
} }