diff --git a/Include/auROXTL/auStringUtils.hpp b/Include/auROXTL/auStringUtils.hpp index ec9689b..7b86513 100644 --- a/Include/auROXTL/auStringUtils.hpp +++ b/Include/auROXTL/auStringUtils.hpp @@ -30,6 +30,12 @@ // offset in codepoints /* using CodepointOffset_t = AuUInt; */ +#if defined(AURORA_I_SUCK_AND_WANT_MODERN_UTF8) + static const AuUInt8 kAuCodepointUTF8MaxBytes = 4; +#else + static const AuUInt8 kAuCodepointUTF8MaxBytes = 6; +#endif + static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern) { return value.find(subpattern) != AuROString::npos; @@ -296,7 +302,7 @@ static AuList AuCodepointsDecode(const AuROString &in) { } - if (nby > 6) + if (nby > kAuCodepointUTF8MaxBytes) { return {}; } @@ -420,7 +426,7 @@ static AuString AuCodepointsTransform(T op, const AuROString &in) { } - if (nby > 6) + if (nby > kAuCodepointUTF8MaxBytes) { return {}; } @@ -453,6 +459,81 @@ static AuString AuCodepointsTransform(T op, const AuROString &in) return ret; } +template +static bool AuCodepointsForEach(T op, const AuROString &in) +{ + if (in.empty()) + { + return true; + } + + auto uLength = in.length(); + + const char *pItr = in.data(); + const char *pEnd = pItr + uLength; + + while (pItr < pEnd) + { + AuUInt32 c {}; + + if ((c = *pItr) <= 0x7FU) + { + ++pItr; + } + else + { + AuUInt32 nby {}; + + if ((*pItr & 0xC0U) != 0xC0U) + { + return false; + } + + for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) + { + } + + if (nby > kAuCodepointUTF8MaxBytes) + { + return false; + } + + if (AuUInt(pEnd - pItr) < AuUInt(nby)) + { + return false; + } + + c = *pItr & (AuUInt8(0xFFU) >> (nby + 1)); + + for (AuUInt32 i = 1; i < nby; ++i) + { + if ((pItr[i] & 0xC0U) != 0x80U) + { + return {}; + } + + c = (c << 6) | (pItr[i] & 0x3FU); + } + + pItr += nby; + } + + if constexpr (AuIsSame_v, bool>) + { + if (!op(c)) + { + return false; + } + } + else + { + op(c); + } + } + + return true; +} + static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in, CodepointOffset_t uCodepointIndex) {