[+] Added more configurability to the UTF-8 decoder

[*] Harden against 0xFF bytes
This commit is contained in:
Reece Wilson 2024-07-01 14:06:05 +01:00
parent 546ad4f41c
commit 67e1948120

View File

@ -31,12 +31,30 @@
// offset in codepoints // offset in codepoints
/* using CodepointOffset_t = AuUInt; */ /* using CodepointOffset_t = AuUInt; */
#if defined(AURORA_I_SUCK_AND_WANT_MODERN_UTF8) #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 4; static const AuUInt8 kAuCodepointUTF8MaxBytes = 4;
#elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 7;
#else #else
// default:
static const AuUInt8 kAuCodepointUTF8MaxBytes = 6; static const AuUInt8 kAuCodepointUTF8MaxBytes = 6;
#endif #endif
// none of these are defined by default
// Defining AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW also defines
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL, unless the latter is already defined.
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
#if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
#define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL
#endif
#endif
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences
// (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting)
// (Enable this if you're boring)
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl)
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase gets dumbed down to a memcmp, assuming the entire block of memory is to be compared;
// ...ForEach will break early with false (usually implies a user break early condition);
// ...Translate will return an empty container
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern) static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{ {
return value.find(subpattern) != AuROString::npos; return value.find(subpattern) != AuROString::npos;
@ -91,13 +109,35 @@ static auline AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {
@ -160,13 +200,35 @@ static auline CodepointOffset_t AuCodepointsCount(const AuROString &in)
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {
@ -213,14 +275,36 @@ static auline CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return 5; return 5;
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return 6; return 6;
} }
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return 7;
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return 0;
}
else else
{ {
return 4; return 4;
@ -305,6 +389,9 @@ static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return {}; return {};
} }
@ -429,6 +516,9 @@ static AuString AuCodepointsTransform(T op, const AuROString &in)
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return {}; return {};
} }
@ -496,6 +586,9 @@ static bool AuCodepointsForEach(T op, const AuROString &in)
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
return false; return false;
} }
@ -639,13 +732,35 @@ static auline CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
nby = 7;
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {
@ -701,13 +816,29 @@ static auline CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
break;
} }
else else
{ {
@ -786,13 +917,35 @@ static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {
@ -864,13 +1017,35 @@ static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const Au
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {
@ -931,13 +1106,35 @@ static CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(c
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {
@ -1058,13 +1255,35 @@ static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
{ {
if ((ch & 0x08) == 0x08) if ((ch & 0x08) == 0x08)
{ {
// Special/Historic UTF8 // Historic UTF8
nby = 5; nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
} }
else if ((ch & 0x0c) == 0x0c) else if ((ch & 0x0c) == 0x0c)
{ {
// Special/Historic UTF8 // Special UTF8
nby = 6; nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
} }
else else
{ {