From 72853a54ab8e593a93101ccfbefdf7954e95c043 Mon Sep 17 00:00:00 2001
From: Jamie Reece Wilson
Date: Fri, 19 Apr 2024 22:08:32 +0100
Subject: [PATCH] [+] AuCodepointsGetByteOffset [+] AuCodepointsGetByteLength
 [+] AuCodepointsFindByteOffset [+] AuCodepointsFindByteOffsetUnsafe
 [+] AuCodepointsFindCodepointOffset [+] AuCodepointsContains
 [+] AuCodepointsReplaceAll [+] AuCodepointsSplitString

---
Review notes (ignored by git-am, not part of the commit message):
 * AuIsAlpha: compare as unsigned; the signed int comparison accepted every character below 'a'.
 * New AuCodepoints* decoders: test the 111110xx (5 byte) lead byte with mask 0x0c so that the
   1111110x (6 byte) branch is reachable.
 * AuCodepointsReplaceAll: an empty `from` is now a no-op instead of looping forever.
 * AuCodepointsSplitString / AuSplitString / AuSplitStringLegacy: bIgnoreEmpty = false now keeps
   empty tokens instead of discarding every token.
 * AuCodepointsFindByteOffset[Unsafe]: removed the write-only uCounter local.
 * Leading whitespace of all lines was reconstructed; verify context lines against the target file.

 Include/auROXTL/auStringUtils.hpp | 490 +++++++++++++++++++++++++++++-
 1 file changed, 474 insertions(+), 16 deletions(-)

diff --git a/Include/auROXTL/auStringUtils.hpp b/Include/auROXTL/auStringUtils.hpp
index 7a9cbb3..388f6f5 100644
--- a/Include/auROXTL/auStringUtils.hpp
+++ b/Include/auROXTL/auStringUtils.hpp
@@ -7,9 +7,28 @@
     File: auROXTLUtils.hpp
     Date: 2021-6-9
     Author: Reece
+    Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
+             AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
+
+    Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
+                AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
+    Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsToLower, AuCodepointsToUpper,
+                AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecode, AuCodepointsEncodeInto,
+                AuCodepointsGetByteOffset(CodepointOffset_t), AuCodepointsGetByteLength(CodepointOffset_t),
+                AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset,
+                AuCodepointsContains,
+                AuCodepointsReplaceAll, AuCodepointsSplitString (views)
+
+    For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
 ***/
 #pragma once
 
+// offset in bytes
+using CodepointByteOffset_t = decltype(AuROString::npos);
+
+// offset in codepoints
+using CodepointOffset_t = AuUInt;
+
 static auline bool AuStringContains(const AuROString &value, const AuROString &subpattern)
 {
     return value.find(subpattern) != AuROString::npos;
@@ -64,10 +83,12 @@ static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
         {
             if ((ch & 0x08) == 0x08)
             {
+                // Special/Historic UTF8
                 nby = 5;
             }
             else if ((ch & 0x0c) == 0x0c)
             {
+                // Special/Historic UTF8
                 nby = 6;
             }
             else
@@ -101,9 +122,9 @@ static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
     return ret;
 }
 
-static auline AuUInt AuCodepointsCount(const AuROString &in)
+static auline CodepointOffset_t AuCodepointsCount(const AuROString &in)
 {
-    AuUInt uCounter {};
+    CodepointOffset_t uCounter {};
     auto uLength = in.length();
 
     const char *pItr = in.data();
@@ -131,6 +152,7 @@ static auline AuUInt AuCodepointsCount(const AuROString &in)
         {
             if ((ch & 0x08) == 0x08)
             {
+                // Special/Historic UTF8
                 nby = 5;
             }
             else if ((ch & 0x0c) == 0x0c)
@@ -160,7 +182,7 @@ static auline AuUInt AuCodepointsCount(const AuROString &in)
     return uCounter;
 }
 
-static auline AuUInt AuCodepointsNextLength(const AuROString &in)
+static auline CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
 {
     if (in.length())
     {
@@ -183,6 +205,7 @@ static auline AuUInt AuCodepointsNextLength(const AuROString &in)
         {
             if ((ch & 0x08) == 0x08)
             {
+                // Special/Historic UTF8
                 return 5;
             }
             else if ((ch & 0x0c) == 0x0c)
@@ -200,14 +223,21 @@ static auline AuUInt AuCodepointsNextLength(const AuROString &in)
     return 0;
 }
 
+// Returns true only for the ASCII letters A-Z and a-z. The difference is compared as unsigned so
+// that characters below 'a' wrap to a large value instead of passing as a negative int.
+static auline bool AuIsAlpha(char c)
+{
+    return (unsigned int)(((unsigned char)c | 0x20) - 'a') < 26u;
+}
+
 static auline char AuToLower(char c)
 {
-    return c ? c | 0x20 : 0;
+    return AuIsAlpha(c) ? c | 0x20 : c;
 }
 
 static auline char AuToUpper(char c)
 {
-    return c & ~0x20;
+    return AuIsAlpha(c) ? c & ~0x20 : c;
 }
 
 static auline AuString AuCodepointsToLower(const AuROString &in)
@@ -424,6 +454,434 @@ static AuString AuCodepointsTransform(T op, const AuROString &in)
     return ret;
 }
 
+// Returns the byte offset at which the codepoint with zero-based index uCodepointIndex begins.
+// Returns AuROString::npos if the input ends, or an unrecognised lead byte or truncated
+// sequence is encountered, before that codepoint is reached.
+static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
+                                                              CodepointOffset_t uCodepointIndex)
+{
+    AuUInt uCounter {};
+    auto uLength = in.length();
+
+    const char *pStart = in.data();
+    const char *pItr = pStart;
+    const char *pEnd = pStart + uLength;
+
+    while (pItr != pEnd)
+    {
+        AuUInt32 nby {};
+        auto ch = *pItr;
+        unsigned int result = (ch & 0xF0);
+
+        if (uCounter == uCodepointIndex)
+        {
+            return CodepointByteOffset_t(pItr - pStart);
+        }
+
+        if ((ch & 0x80) == 0)
+        {
+            nby = 1;
+        }
+        else if ((ch & 0xE0) == 0xC0)
+        {
+            nby = 2;
+        }
+        else if (result == 0xE0)
+        {
+            nby = 3;
+        }
+        else if (result == 0xF0)
+        {
+            if ((ch & 0x0c) == 0x08)
+            {
+                // Special/Historic UTF8: 111110xx lead byte
+                nby = 5;
+            }
+            else if ((ch & 0x0c) == 0x0c)
+            {
+                // Special/Historic UTF8: 1111110x lead byte
+                nby = 6;
+            }
+            else
+            {
+                nby = 4;
+            }
+        }
+        else
+        {
+            break;
+        }
+
+        if (pItr + nby > pEnd)
+        {
+            break;
+        }
+
+        uCounter++;
+        pItr += nby;
+    }
+
+    return AuROString::npos;
+}
+
+// Returns the number of bytes occupied by codepoints 0 through uCodepointIndex inclusive, that is,
+// the byte offset just past the end of the codepoint with zero-based index uCodepointIndex.
+// Returns AuROString::npos if the input ends, or an unrecognised lead byte or truncated
+// sequence is encountered, before that codepoint has been consumed.
+static auline CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
+                                                              CodepointOffset_t uCodepointIndex)
+{
+    AuUInt uCounter {};
+    auto uLength = in.length();
+
+    const char *pStart = in.data();
+    const char *pItr = pStart;
+    const char *pEnd = pStart + uLength;
+
+    while (pItr != pEnd)
+    {
+        AuUInt32 nby {};
+        auto ch = *pItr;
+        unsigned int result = (ch & 0xF0);
+
+        if ((ch & 0x80) == 0)
+        {
+            nby = 1;
+        }
+        else if ((ch & 0xE0) == 0xC0)
+        {
+            nby = 2;
+        }
+        else if (result == 0xE0)
+        {
+            nby = 3;
+        }
+        else if (result == 0xF0)
+        {
+            if ((ch & 0x0c) == 0x08)
+            {
+                // Special/Historic UTF8: 111110xx lead byte
+                nby = 5;
+            }
+            else if ((ch & 0x0c) == 0x0c)
+            {
+                // Special/Historic UTF8: 1111110x lead byte
+                nby = 6;
+            }
+            else
+            {
+                nby = 4;
+            }
+        }
+        else
+        {
+            break;
+        }
+
+        if (pItr + nby > pEnd)
+        {
+            break;
+        }
+
+        pItr += nby;
+
+        if (uCounter == uCodepointIndex)
+        {
+            return CodepointByteOffset_t(pItr - pStart);
+        }
+
+        uCounter++;
+    }
+
+    return AuROString::npos;
+}
+
+// Looks for `find` at each codepoint boundary of `in` whose byte offset is >= uStartPosition.
+// Returns the byte offset of the first match, or AuROString::npos if there is none or decoding
+// stops early on an unrecognised lead byte or truncated sequence.
+static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
+                                                        const AuROString &find,
+                                                        CodepointByteOffset_t uStartPosition = {})
+{
+    auto uLength = in.length();
+    auto uFindLength = find.length();
+
+    const char *pStart = in.data();
+    const char *pItr = pStart;
+    const char *pEnd = pStart + uLength;
+
+    while (pItr != pEnd)
+    {
+        AuUInt32 nby {};
+        auto ch = *pItr;
+        unsigned int result = (ch & 0xF0);
+        CodepointByteOffset_t uByteOffset(pItr - pStart);
+
+        if (uByteOffset >= uStartPosition)
+        {
+            AuROString suffixView(pItr, pEnd);
+            if (suffixView.length() > uFindLength)
+            {
+                suffixView = { suffixView.data(), uFindLength };
+            }
+
+            if (suffixView == find)
+            {
+                return uByteOffset;
+            }
+        }
+
+        if ((ch & 0x80) == 0)
+        {
+            nby = 1;
+        }
+        else if ((ch & 0xE0) == 0xC0)
+        {
+            nby = 2;
+        }
+        else if (result == 0xE0)
+        {
+            nby = 3;
+        }
+        else if (result == 0xF0)
+        {
+            if ((ch & 0x0c) == 0x08)
+            {
+                // Special/Historic UTF8: 111110xx lead byte
+                nby = 5;
+            }
+            else if ((ch & 0x0c) == 0x0c)
+            {
+                // Special/Historic UTF8: 1111110x lead byte
+                nby = 6;
+            }
+            else
+            {
+                nby = 4;
+            }
+        }
+        else
+        {
+            break;
+        }
+
+        if (pItr + nby > pEnd)
+        {
+            break;
+        }
+
+        pItr += nby;
+    }
+
+    return AuROString::npos;
+}
+
+// Same search as AuCodepointsFindByteOffset, except that decoding starts directly at byte
+// uStartPosition instead of walking from the beginning of `in`.
+// Unsafe: uStartPosition is not bounds-checked; the caller must pass a value <= in.length().
+static CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
+                                                              const AuROString &find,
+                                                              CodepointByteOffset_t uStartPosition = {})
+{
+    auto uLength = in.length();
+    auto uFindLength = find.length();
+
+    const char *pStart = in.data();
+    const char *pItr = pStart + uStartPosition;
+    const char *pEnd = pStart + uLength;
+
+    while (pItr != pEnd)
+    {
+        AuUInt32 nby {};
+        auto ch = *pItr;
+        unsigned int result = (ch & 0xF0);
+        CodepointByteOffset_t uByteOffset(pItr - pStart);
+
+        {
+            AuROString suffixView(pItr, pEnd);
+            if (suffixView.length() > uFindLength)
+            {
+                suffixView = { suffixView.data(), uFindLength };
+            }
+
+            if (suffixView == find)
+            {
+                return uByteOffset;
+            }
+        }
+
+        if ((ch & 0x80) == 0)
+        {
+            nby = 1;
+        }
+        else if ((ch & 0xE0) == 0xC0)
+        {
+            nby = 2;
+        }
+        else if (result == 0xE0)
+        {
+            nby = 3;
+        }
+        else if (result == 0xF0)
+        {
+            if ((ch & 0x0c) == 0x08)
+            {
+                // Special/Historic UTF8: 111110xx lead byte
+                nby = 5;
+            }
+            else if ((ch & 0x0c) == 0x0c)
+            {
+                // Special/Historic UTF8: 1111110x lead byte
+                nby = 6;
+            }
+            else
+            {
+                nby = 4;
+            }
+        }
+        else
+        {
+            break;
+        }
+
+        if (pItr + nby > pEnd)
+        {
+            break;
+        }
+
+        pItr += nby;
+    }
+
+    return AuROString::npos;
+}
+
+// Looks for `find` at each codepoint boundary of `in`, starting with codepoint index uStartPosition.
+// Returns the codepoint index of the first match, or AuROString::npos if there is none.
+static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
+                                                         const AuROString &find,
+                                                         CodepointOffset_t uStartPosition = {})
+{
+    AuUInt uCounter {};
+    auto uLength = in.length();
+    auto uFindLength = find.length();
+
+    const char *pStart = in.data();
+    const char *pItr = pStart;
+    const char *pEnd = pStart + uLength;
+
+    while (pItr != pEnd)
+    {
+        AuUInt32 nby {};
+        auto ch = *pItr;
+        unsigned int result = (ch & 0xF0);
+
+        if (uCounter >= uStartPosition)
+        {
+            AuROString suffixView(pItr, pEnd);
+            if (suffixView.length() > uFindLength)
+            {
+                suffixView = { suffixView.data(), uFindLength };
+            }
+
+            if (suffixView == find)
+            {
+                return uCounter;
+            }
+        }
+
+        if ((ch & 0x80) == 0)
+        {
+            nby = 1;
+        }
+        else if ((ch & 0xE0) == 0xC0)
+        {
+            nby = 2;
+        }
+        else if (result == 0xE0)
+        {
+            nby = 3;
+        }
+        else if (result == 0xF0)
+        {
+            if ((ch & 0x0c) == 0x08)
+            {
+                // Special/Historic UTF8: 111110xx lead byte
+                nby = 5;
+            }
+            else if ((ch & 0x0c) == 0x0c)
+            {
+                // Special/Historic UTF8: 1111110x lead byte
+                nby = 6;
+            }
+            else
+            {
+                nby = 4;
+            }
+        }
+        else
+        {
+            break;
+        }
+
+        if (pItr + nby > pEnd)
+        {
+            break;
+        }
+
+        uCounter++;
+        pItr += nby;
+    }
+
+    return AuROString::npos;
+}
+
+// Returns true if `subpattern` is found in `value` at or after codepoint index uStartPosition.
+static bool AuCodepointsContains(const AuROString &value, const AuROString &subpattern, CodepointOffset_t uStartPosition = {})
+{
+    return AuCodepointsFindCodepointOffset(value, subpattern, uStartPosition) != AuROString::npos;
+}
+
+// Replaces every occurrence of `from` in `str` with `to`, in place, and returns `str`.
+// An empty `from` is a no-op: it would match at every position and the loop would never end.
+static AuString &AuCodepointsReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
+{
+    AuUInt uStartPosition {};
+    while (from.length() && (uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
+    {
+        str.replace(uStartPosition, from.length(), to);
+        uStartPosition += to.length();
+    }
+    return str;
+}
+
+// Splits `str` on `delim` and returns the tokens as produced by AuROString::substr.
+// When bIgnoreEmpty is true, empty tokens are skipped; when false they are kept, except that no
+// empty token is emitted after a trailing delimiter. `delim` must not be empty.
+static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
+                                                  const AuROString &delim,
+                                                  bool bIgnoreEmpty = true)
+{
+    AuList<AuROString> tokens;
+    AuUInt uPrev {}, uPos {};
+    tokens.reserve(str.size() / 16);
+    do
+    {
+        uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
+        if (uPos == AuROString::npos)
+        {
+            uPos = str.length();
+        }
+        auto token = str.substr(uPrev, uPos - uPrev);
+        if ((!token.empty()) || !bIgnoreEmpty)
+        {
+            tokens.push_back(token);
+        }
+        uPrev = uPos + delim.length();
+    }
+    while (uPos < str.length() && uPrev < str.length());
+    return tokens;
+}
+
 static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROString &to)
 {
     AuUInt uStartPosition {};
@@ -436,30 +894,30 @@ static AuString &AuReplaceAll(AuString &str, const AuROString &from, const AuROS
 }
 
 // i told myself not to copy this, required a split function twice, now here we are :D
-static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool ignoreEmpty = true)
+static AuList<AuROString> AuSplitString(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
 {
     AuList<AuROString> tokens;
-    AuUInt prev = 0, pos = 0;
+    AuUInt uPrev {}, uPos {};
     tokens.reserve(str.size() / 16);
     do
     {
-        pos = str.find(delim, prev);
-        if (pos == AuROString::npos)
+        uPos = str.find(delim, uPrev);
+        if (uPos == AuROString::npos)
         {
-            pos = str.length();
+            uPos = str.length();
         }
-        auto token = str.substr(prev, pos - prev);
-        if ((!token.empty()) && ignoreEmpty)
+        auto token = str.substr(uPrev, uPos - uPrev);
+        if ((!token.empty()) || !bIgnoreEmpty)
         {
             tokens.push_back(token);
         }
-        prev = pos + delim.length();
+        uPrev = uPos + delim.length();
     }
-    while (pos < str.length() && prev < str.length());
+    while (uPos < str.length() && uPrev < str.length());
     return tokens;
 }
 
-static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool ignoreEmpty = true)
+static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROString &delim, bool bIgnoreEmpty = true)
 {
     AuList<AuString> tokens;
     AuUInt prev = 0, pos = 0;
@@ -472,7 +930,7 @@ static AuList<AuString> AuSplitStringLegacy(const AuROString &str, const AuROStr
             pos = str.length();
         }
         auto token = str.substr(prev, pos - prev);
-        if ((!token.empty()) && ignoreEmpty)
+        if ((!token.empty()) || !bIgnoreEmpty)
         {
             tokens.push_back(AuString(token));
         }