From 67e1948120095586c1a1f9689c95e88dfef8ce48 Mon Sep 17 00:00:00 2001 From: Jamie Reece Wilson Date: Mon, 1 Jul 2024 14:06:05 +0100 Subject: [PATCH] [+] Added more configurability to the UTF 8 decoder [*] Harden against 0xFF bytes --- Include/auROXTL/auStringUtils.hpp | 257 +++++++++++++++++++++++++++--- 1 file changed, 238 insertions(+), 19 deletions(-) diff --git a/Include/auROXTL/auStringUtils.hpp b/Include/auROXTL/auStringUtils.hpp index 090b7bd..d5caa1e 100644 --- a/Include/auROXTL/auStringUtils.hpp +++ b/Include/auROXTL/auStringUtils.hpp @@ -31,12 +31,30 @@ // offset in codepoints /* using CodepointOffset_t = AuUInt; */ -#if defined(AURORA_I_SUCK_AND_WANT_MODERN_UTF8) +#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8) static const AuUInt8 kAuCodepointUTF8MaxBytes = 4; +#elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8) + static const AuUInt8 kAuCodepointUTF8MaxBytes = 7; #else +// default: static const AuUInt8 kAuCodepointUTF8MaxBytes = 6; #endif +// none of these are defined by default +#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) +#if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) +#define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL +#endif +#endif + +// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences +// (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting) +// (Enable this if you're boring) +// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl) +// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 | disallows 5-8 byte sequence decodes. 
...IsEqualIgnoreCase gets dumbed down to a memcmp assuming the entire block of memory is to be compared; +// ...ForEach will break early with false (usually implies a user break early condition); +// ...Translate will return an empty container + static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern) { return value.find(subpattern) != AuROString::npos; @@ -91,13 +109,35 @@ static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in) { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else { @@ -160,13 +200,35 @@ static auline CodepointOffset_t AuCodepointsCount(const AuROString &in) { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical
+ #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else { @@ -213,14 +275,36 @@ static auline CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in) { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif return 5; } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif return 6; } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + return 7; + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + return 0; + } else { return 4; @@ -305,6 +389,9 @@ static AuList AuCodepointsDecode(const AuROString &in) if (nby > kAuCodepointUTF8MaxBytes) { + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif return {}; } @@ -429,6 +516,9 @@ static AuString AuCodepointsTransform(T op, const AuROString &in) if (nby > kAuCodepointUTF8MaxBytes) { + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif return {}; } @@ -496,6 +586,9 @@ static bool AuCodepointsForEach(T op, const AuROString &in) if (nby > kAuCodepointUTF8MaxBytes) { + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif return false; } @@ -639,13 +732,35 @@ static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString & { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if 
defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + nby = 7; + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else { @@ -701,13 +816,29 @@ static auline CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString & { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + break; } else { @@ -786,13 +917,35 @@ static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in, { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal 
UTF8"); + #endif + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else { @@ -864,13 +1017,35 @@ static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const Au { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else { @@ -931,13 +1106,35 @@ static CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(c { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else { @@ -1058,13 +1255,35 @@ static CodepointOffset_t 
AuCodepointsFindCodepointOffset(const AuROString &in, { if ((ch & 0x08) == 0x08) { - // Special/Historic UTF8 + // Historic UTF8 nby = 5; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif } else if ((ch & 0x0c) == 0x0c) { - // Special/Historic UTF8 + // Special UTF8 nby = 6; + #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0e) == 0x0e) + { + // Illegal UTF8 + nby = 7; + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + } + else if ((ch & 0x0f) == 0x0f) + { + // Not even logical + #if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) + AU_THROW_CONST_STRING("Illegal UTF8"); + #endif + break; } else {