[+] AuCodepointsFindByteOffsetIgnoreCase

[+] AuCodepointsNextLengthFromCharacter
[*] Clean up questionable iteration code / massive speed up
This commit is contained in:
Reece Wilson 2024-12-30 13:23:22 +00:00
parent 5ece0d5878
commit 03a2c6e030
4 changed files with 159 additions and 507 deletions

View File

@ -56,6 +56,8 @@ AU_INLINE_CONSTEXPR_17 CodepointOffset_t AuCodepointsCount(const
/// Counts the bytes required to iterate over a UTF8 encoded codepoint /// Counts the bytes required to iterate over a UTF8 encoded codepoint
AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in); AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in);
/// Counts the bytes required to iterate over a UTF8 encoded codepoint, given only its first byte
AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsNextLengthFromCharacter(AuUInt8 uCharacter);
/// Iterates over a UTF8 sequence using OP. /// Iterates over a UTF8 sequence using OP.
/// If op returns void, AuCodepointsForEach will continue until EOS or invalid sequence. /// If op returns void, AuCodepointsForEach will continue until EOS or invalid sequence.
/// If op returns bool, AuCodepointsForEach will continue until EOS, invalid sequence, or !op. /// If op returns bool, AuCodepointsForEach will continue until EOS, invalid sequence, or !op.
@ -85,6 +87,11 @@ static bool AuCodepointsStartsWithI
static bool AuCodepointsEndsWithIgnoreCase(const AuROString &inA, static bool AuCodepointsEndsWithIgnoreCase(const AuROString &inA,
const AuROString &inB); const AuROString &inB);
/// Also see: AuCodepointsFindByteOffset, AuCodepointsContainsIgnoreCase
static CodepointByteOffset_t AuCodepointsFindByteOffsetIgnoreCase(const AuROString &inA,
const AuROString &inB);
/// Also see: AuCodepointsFindByteOffsetIgnoreCase
static bool AuCodepointsContainsIgnoreCase(const AuROString &inA, static bool AuCodepointsContainsIgnoreCase(const AuROString &inA,
const AuROString &inB); const AuROString &inB);

View File

@ -37,9 +37,9 @@ AU_STATIC_CONSTEXPR_17 AuOptional<AuUInt32> AuCodepointsDecodeOne(const AuROStr
return AuOptional<AuUInt32>(); return AuOptional<AuUInt32>();
} }
auto uLength = in.length(); auto uLength = in.Length();
const char *pItr = in.data(); const char *pItr = in.Begin();
const char *pEnd = pItr + uLength; const char *pEnd = pItr + uLength;
while (pItr < pEnd) while (pItr < pEnd)
@ -59,9 +59,7 @@ AU_STATIC_CONSTEXPR_17 AuOptional<AuUInt32> AuCodepointsDecodeOne(const AuROStr
return AuOptional<AuUInt32>(); return AuOptional<AuUInt32>();
} }
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) nby = AuCodepointsNextLengthFromCharacter(*pItr);
{
}
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
@ -107,11 +105,11 @@ static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
return ret; return ret;
} }
auto uLength = in.length(); auto uLength = in.Length();
ret.reserve(uLength); ret.reserve(uLength);
const char *pItr = in.data(); const char *pItr = in.Begin();
const char *pEnd = pItr + uLength; const char *pEnd = pItr + uLength;
while (pItr < pEnd) while (pItr < pEnd)
@ -131,9 +129,7 @@ static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in)
return AuList<AuUInt32>(); return AuList<AuUInt32>();
} }
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) nby = AuCodepointsNextLengthFromCharacter(*pItr);
{
}
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
@ -243,9 +239,9 @@ AU_OPTIMIZED AuString AuCodepointsToUpper(const AuROString &in)
AU_INLINE_CONSTEXPR_17 CodepointOffset_t AuCodepointsCount(const AuROString &in) AU_INLINE_CONSTEXPR_17 CodepointOffset_t AuCodepointsCount(const AuROString &in)
{ {
CodepointOffset_t uCounter (0); CodepointOffset_t uCounter (0);
auto uLength = in.length(); auto uLength = in.Length();
const char *pItr = in.data(); const char *pItr = in.Begin();
const char *pEnd = pItr + uLength; const char *pEnd = pItr + uLength;
while (pItr != pEnd) while (pItr != pEnd)
@ -254,58 +250,8 @@ AU_INLINE_CONSTEXPR_17 CodepointOffset_t AuCodepointsCount(const AuROString &in)
auto ch = *pItr; auto ch = *pItr;
unsigned int result = (ch & 0xF0); unsigned int result = (ch & 0xF0);
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -322,10 +268,63 @@ AU_INLINE_CONSTEXPR_17 CodepointOffset_t AuCodepointsCount(const AuROString &in)
return uCounter; return uCounter;
} }
/// Counts the bytes required to iterate over a UTF8 encoded codepoint, given only its first byte.
/// A byte with the high bit clear yields 1. Otherwise the length is computed as
/// 31 minus the value AuBitScanReverse reports for the inverted byte (see below).
/// NOTE(review): on the bit-scan path a lone continuation byte (10xxxxxx) looks like it yields 1
/// and 0xFF looks like it yields 8, rather than 0 - confirm this is what `if (!nby)` callers expect.
AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsNextLengthFromCharacter(AuUInt8 uCharacter)
{
#if defined(AU_CPU_ENDIAN_BIG)
    // When AU_CPU_ENDIAN_BIG is defined, always defer to the string based implementation.
    const char chFirst = uCharacter;
    return AuCodepointsNextLength(AuROString(&chFirst, 1));
#else
#if defined(AU_LANG_CPP_17_)
    if (__builtin_is_constant_evaluated())
    {
        // Constant evaluation defers to the string based implementation
        // (presumably because the bit scan is not usable in a constant expression - TODO confirm).
        const char chFirst = uCharacter;
        return AuCodepointsNextLength(AuROString(&chFirst, 1));
    }
#endif

    // High bit clear: the character occupies a single byte.
    if (!(uCharacter & 0x80))
    {
        return 1;
    }

    // The byte is shifted into bits 24..31 and inverted, so the first zero bit of the
    // original byte becomes the highest set bit of the scanned value.
    // Assumes AuBitScanReverse writes the index of the most significant set bit and
    // returns false when no bit is set - TODO confirm against its definition.
    AuUInt8 uHighestSet(0);
    if (!AuBitScanReverse(uHighestSet, ~(AuUInt32(uCharacter) << 24u)))
    {
        // NOTE(review): the inversion leaves the low 24 bits set, so the scanned value
        // does not look like it can ever be zero; this branch appears unreachable.
        return 0;
    }

    return 31 - uHighestSet;
#endif
}
/// Counts the bytes required to iterate over a UTF8 encoded codepoint /// Counts the bytes required to iterate over a UTF8 encoded codepoint
AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in) AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in)
{ {
if (in.length())
#if defined(AU_LANG_CPP_17_)
if (!__builtin_is_constant_evaluated())
{
if (in.Empty())
{
return 0;
}
return AuCodepointsNextLengthFromCharacter(in.Data()[0]);
}
#endif
if (in.Length())
{ {
auto ch = in[0]; auto ch = in[0];
unsigned int result = (ch & 0xF0); unsigned int result = (ch & 0xF0);
@ -398,9 +397,9 @@ static bool AuCodepointsForEach(T op, const AuROStrin
return true; return true;
} }
auto uLength = in.length(); auto uLength = in.Length();
const char *pItr = in.data(); const char *pItr = in.Begin();
const char *pEnd = pItr + uLength; const char *pEnd = pItr + uLength;
while (pItr < pEnd) while (pItr < pEnd)
@ -420,9 +419,7 @@ static bool AuCodepointsForEach(T op, const AuROStrin
return false; return false;
} }
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) nby = AuCodepointsNextLengthFromCharacter(*pItr);
{
}
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
@ -479,11 +476,11 @@ static AuString AuCodepointsTransform(T op, const AuROStr
return ret; return ret;
} }
auto uLength = in.length(); auto uLength = in.Length();
ret.reserve(uLength); ret.reserve(uLength);
const char *pItr = in.data(); const char *pItr = in.Begin();
const char *pEnd = pItr + uLength; const char *pEnd = pItr + uLength;
while (pItr < pEnd) while (pItr < pEnd)
@ -503,9 +500,7 @@ static AuString AuCodepointsTransform(T op, const AuROStr
return AuString(); return AuString();
} }
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) nby = AuCodepointsNextLengthFromCharacter(*pItr);
{
}
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
@ -548,11 +543,11 @@ template <class T>
AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in) AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in)
{ {
AuString ret; AuString ret;
auto uLength = in.length(); auto uLength = in.Length();
ret.resize(uLength); ret.resize(uLength);
const char *pItr = in.data(); const char *pItr = in.Begin();
const char *pEnd = pItr + uLength; const char *pEnd = pItr + uLength;
AuUInt32 uCounter (0); AuUInt32 uCounter (0);
@ -562,58 +557,9 @@ AuString AuCodepointsTransformASCIIOp(T op, const
auto ch = *pItr; auto ch = *pItr;
unsigned int result = (ch & 0xF0); unsigned int result = (ch & 0xF0);
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{
nby = 1; if (!nby)
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -655,9 +601,9 @@ static bool AuCodepointsIsEqualIgnoreCase(const AuROS
return true; return true;
} }
const char *pItr = inA.data(); const char *pItr = inA.Begin();
const char *pItr2 = inB.data(); const char *pItr2 = inB.Begin();
const char *pEnd = pItr + inA.length(); const char *pEnd = pItr + inA.Length();
while (pItr < pEnd) while (pItr < pEnd)
{ {
@ -681,9 +627,7 @@ static bool AuCodepointsIsEqualIgnoreCase(const AuROS
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0; return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
} }
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) nby = AuCodepointsNextLengthFromCharacter(*pItr);
{
}
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
@ -723,9 +667,9 @@ static bool AuCodepointsStartsWithIgnoreCase(const Au
return true; return true;
} }
const char *pItr = inA.data(); const char *pItr = inA.Begin();
const char *pItr2 = inB.data(); const char *pItr2 = inB.Begin();
const char *pEnd = pItr + inB.length(); const char *pEnd = pItr + inB.Length();
const char *pEnd2 = inB.End(); const char *pEnd2 = inB.End();
while (pItr < pEnd) while (pItr < pEnd)
@ -754,9 +698,7 @@ static bool AuCodepointsStartsWithIgnoreCase(const Au
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0; return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
} }
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby) nby = AuCodepointsNextLengthFromCharacter(*pItr);
{
}
if (nby > kAuCodepointUTF8MaxBytes) if (nby > kAuCodepointUTF8MaxBytes)
{ {
@ -776,7 +718,7 @@ static bool AuCodepointsStartsWithIgnoreCase(const Au
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0; return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
} }
if (nby > pEnd2 - pItr2) if (nby > AuUInt(pEnd2 - pItr2))
{ {
return false; return false;
} }
@ -828,8 +770,8 @@ static bool AuCodepointsEndsWithIgnoreCase(const AuRO
return false; return false;
} }
auto pItr = inA.data() + uOffset - nby; auto pItr = inA.Begin() + uOffset - nby;
auto pItr2 = inB.data() + uOffset2 - nby; auto pItr2 = inB.Begin() + uOffset2 - nby;
if ((c = *pItr) <= 0x7FU) if ((c = *pItr) <= 0x7FU)
{ {
@ -856,54 +798,44 @@ static bool AuCodepointsEndsWithIgnoreCase(const AuRO
return true; return true;
} }
static bool AuCodepointsContainsIgnoreCase(const AuROString &inA, static CodepointByteOffset_t AuCodepointsFindByteOffsetIgnoreCase(const AuROString &inA,
const AuROString &inB) const AuROString &inB)
{ {
if (inA.size() < inB.size()) if (inA.size() < inB.size())
{ {
return false; return AuROString::npos;
} }
const char *pItr = inA.data(); const char *pItr = inA.Begin();
const char *pItr2 = inB.data(); const char *pItr2 = inB.Begin();
const char *pEnd = pItr + inA.length(); const char *pEnd = pItr + inA.Length();
const char *pEnd2 = inB.end(); const char *pEnd2 = inB.End();
while (pItr < pEnd) while (pItr < pEnd)
{ {
AuUInt32 c (0);
AuUInt32 nby (0); AuUInt32 nby (0);
if ((c = *pItr) <= 0x7FU) nby = AuCodepointsNextLengthFromCharacter(*pItr);
if (!nby)
{ {
nby = 1; break;
}
else
{
if ((*pItr & 0xC0U) != 0xC0U)
{
if (pEnd - pItr != pEnd2 - pItr2)
{
return false;
}
return AuMemcmp(pItr, pItr2, pEnd - pItr) == 0;
}
for (AuUInt8 b = *pItr; (b & 0x80U) != 0; b <<= 1, ++nby)
{
}
} }
if (AuCodepointsStartsWithIgnoreCase(AuROString(pItr, AuUInt(pEnd - pItr)), inB)) if (AuCodepointsStartsWithIgnoreCase(AuROString(pItr, AuUInt(pEnd - pItr)), inB))
{ {
return true; return pItr - inA.Begin();
} }
pItr += nby; pItr += nby;
} }
return false; return AuROString::npos;
}
/// Returns true when AuCodepointsFindByteOffsetIgnoreCase(inA, inB) yields anything
/// other than AuROString::npos.
static bool AuCodepointsContainsIgnoreCase(const AuROString &inA,
                                           const AuROString &inB)
{
    const auto uFoundAt = AuCodepointsFindByteOffsetIgnoreCase(inA, inB);
    return uFoundAt != AuROString::npos;
}
/// Returns the byte offset of the codepoint index or AuROString::npos /// Returns the byte offset of the codepoint index or AuROString::npos
@ -911,9 +843,9 @@ AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsGetByteOffset(const AuR
CodepointOffset_t uCodepointIndex) CodepointOffset_t uCodepointIndex)
{ {
AuUInt uCounter (0); AuUInt uCounter (0);
auto uLength = in.length(); auto uLength = in.Length();
const char *pStart = in.data(); const char *pStart = in.Begin();
const char *pItr = pStart; const char *pItr = pStart;
const char *pEnd = pStart + uLength; const char *pEnd = pStart + uLength;
@ -928,58 +860,8 @@ AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsGetByteOffset(const AuR
return CodepointByteOffset_t(pItr - pStart); return CodepointByteOffset_t(pItr - pStart);
} }
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
nby = 7;
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -1001,9 +883,9 @@ AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsGetByteLength(const AuR
CodepointOffset_t uCodepointIndex) CodepointOffset_t uCodepointIndex)
{ {
AuUInt uCounter (0); AuUInt uCounter (0);
auto uLength = in.length(); auto uLength = in.Length();
const char *pStart = in.data(); const char *pStart = in.Begin();
const char *pItr = pStart; const char *pItr = pStart;
const char *pEnd = pStart + uLength; const char *pEnd = pStart + uLength;
@ -1013,52 +895,8 @@ AU_INLINE_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsGetByteLength(const AuR
auto ch = *pItr; auto ch = *pItr;
unsigned int result = (ch & 0xF0); unsigned int result = (ch & 0xF0);
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -1098,10 +936,10 @@ static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROStri
CodepointByteOffset_t uStartPosition) CodepointByteOffset_t uStartPosition)
{ {
AuUInt uCounter = 0; AuUInt uCounter = 0;
auto uLength = in.length(); auto uLength = in.Length();
auto uFindLength = find.length(); auto uFindLength = find.Length();
const char *pStart = in.data(); const char *pStart = in.Begin();
const char *pItr = pStart; const char *pItr = pStart;
const char *pEnd = pStart + uLength; const char *pEnd = pStart + uLength;
@ -1115,9 +953,9 @@ static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROStri
if (uByteOffset >= uStartPosition) if (uByteOffset >= uStartPosition)
{ {
AuROString suffixView(pItr, pEnd); AuROString suffixView(pItr, pEnd);
if (suffixView.length() > uFindLength) if (suffixView.Length() > uFindLength)
{ {
suffixView = AuROString ( suffixView.data(), uFindLength ); suffixView = AuROString ( suffixView.Begin(), uFindLength );
} }
if (suffixView == find) if (suffixView == find)
@ -1126,58 +964,8 @@ static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROStri
} }
} }
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -1202,10 +990,10 @@ AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindByteOffsetUnsa
CodepointByteOffset_t uStartPosition) CodepointByteOffset_t uStartPosition)
{ {
AuUInt uCounter = 0; AuUInt uCounter = 0;
auto uLength = in.length(); auto uLength = in.Length();
auto uFindLength = find.length(); auto uFindLength = find.Length();
const char *pStart = in.data(); const char *pStart = in.Begin();
const char *pItr = pStart + uStartPosition; const char *pItr = pStart + uStartPosition;
const char *pEnd = pStart + uLength; const char *pEnd = pStart + uLength;
@ -1218,9 +1006,9 @@ AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindByteOffsetUnsa
{ {
AuROString suffixView(pItr, pEnd); AuROString suffixView(pItr, pEnd);
if (suffixView.length() > uFindLength) if (suffixView.Length() > uFindLength)
{ {
suffixView = AuROString ( suffixView.data(), uFindLength ); suffixView = AuROString ( suffixView.Begin(), uFindLength );
} }
if (suffixView == find) if (suffixView == find)
@ -1229,58 +1017,8 @@ AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindByteOffsetUnsa
} }
} }
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -1303,9 +1041,9 @@ AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindPreviousValidB
CodepointOffset_t uStartPosition) CodepointOffset_t uStartPosition)
{ {
AuUInt uCounter = 0; AuUInt uCounter = 0;
auto uLength = in.length(); auto uLength = in.Length();
const char *pStart = in.data(); const char *pStart = in.Begin();
const char *pItr = pStart; const char *pItr = pStart;
const char *pEnd = pStart + uLength; const char *pEnd = pStart + uLength;
@ -1320,58 +1058,8 @@ AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindPreviousValidB
auto ch = *pItr; auto ch = *pItr;
unsigned int result = (ch & 0xF0); unsigned int result = (ch & 0xF0);
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -1400,7 +1088,7 @@ AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindPreviousValidB
AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in, AU_STATIC_CONSTEXPR_17 CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
CodepointByteOffset_t uStartPosition) CodepointByteOffset_t uStartPosition)
{ {
const char * pStart = in.data(); const char * pStart = in.Begin();
const char * pItr = pStart + uStartPosition - 1; const char * pItr = pStart + uStartPosition - 1;
if (uStartPosition == 0) if (uStartPosition == 0)
@ -1446,10 +1134,10 @@ static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuR
CodepointOffset_t uStartPosition) CodepointOffset_t uStartPosition)
{ {
AuUInt uCounter (0); AuUInt uCounter (0);
auto uLength = in.length(); auto uLength = in.Length();
auto uFindLength = find.length(); auto uFindLength = find.Length();
const char *pStart = in.data(); const char *pStart = in.Begin();
const char *pItr = pStart; const char *pItr = pStart;
const char *pEnd = pStart + uLength; const char *pEnd = pStart + uLength;
@ -1462,9 +1150,9 @@ static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuR
if (uCounter >= uStartPosition) if (uCounter >= uStartPosition)
{ {
AuROString suffixView(pItr, pEnd); AuROString suffixView(pItr, pEnd);
if (suffixView.length() > uFindLength) if (suffixView.Length() > uFindLength)
{ {
suffixView = AuROString ( suffixView.data(), uFindLength ); suffixView = AuROString ( suffixView.Begin(), uFindLength );
} }
if (suffixView == find) if (suffixView == find)
@ -1473,58 +1161,8 @@ static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuR
} }
} }
if ((ch & 0x80) == 0) nby = AuCodepointsNextLengthFromCharacter(ch);
{ if (!nby)
nby = 1;
}
else if ((ch & 0xE0) == 0xC0)
{
nby = 2;
}
else if (result == 0xE0)
{
nby = 3;
}
else if (result == 0xF0)
{
if ((ch & 0x08) == 0x08)
{
// Historic UTF8
nby = 5;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0c) == 0x0c)
{
// Special UTF8
nby = 6;
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0e) == 0x0e)
{
// Illegal UTF8
nby = 7;
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
}
else if ((ch & 0x0f) == 0x0f)
{
// Not even logical
#if defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
AU_THROW_CONST_STRING("Illegal UTF8");
#endif
break;
}
else
{
nby = 4;
}
}
else
{ {
break; break;
} }
@ -1582,14 +1220,18 @@ static AuString & AuCodepointsReplaceAll(AuString &str,
const AuROString &to) const AuROString &to)
{ {
AuUInt uStartPosition (0); AuUInt uStartPosition (0);
#if !defined(AU_LANG_CPP_17_)
std::string hack(to);
#endif
while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos) while ((uStartPosition = AuCodepointsFindByteOffsetUnsafe(str, from, uStartPosition)) != AuROString::npos)
{ {
#if defined(AU_LANG_CPP_17_) #if defined(AU_LANG_CPP_17_)
str.replace(uStartPosition, from.length(), to); str.replace(uStartPosition, from.Length(), to);
#else #else
str.replace(uStartPosition, from.length(), std::string(to).c_str(), 0, to.length()); str.replace(uStartPosition, from.Length(), hack.c_str(), 0, to.Length());
#endif #endif
uStartPosition += to.length(); uStartPosition += to.Length();
} }
return str; return str;
} }
@ -1606,16 +1248,16 @@ static AuList<AuROString> AuCodepointsSplitString(const AuROString
uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev); uPos = AuCodepointsFindByteOffsetUnsafe(str, delim, uPrev);
if (uPos == AuROString::npos) if (uPos == AuROString::npos)
{ {
uPos = str.length(); uPos = str.Length();
} }
auto token = str.substr(uPrev, uPos - uPrev); auto token = str.substr(uPrev, uPos - uPrev);
if ((!token.empty()) && bIgnoreEmpty) if ((!token.empty()) && bIgnoreEmpty)
{ {
tokens.push_back(token); tokens.push_back(token);
} }
uPrev = uPos + delim.length(); uPrev = uPos + delim.Length();
} }
while (uPos < str.length() && uPrev < str.length()); while (uPos < str.Length() && uPrev < str.Length());
return tokens; return tokens;
} }

View File

@ -63,7 +63,6 @@
#include <auROXTL/Strings/auStringUtils.hpp> #include <auROXTL/Strings/auStringUtils.hpp>
#include <auROXTL/Strings/auCodepointsUTF8.hpp> #include <auROXTL/Strings/auCodepointsUTF8.hpp>
#include <auROXTL/Strings/auCodepointsUTF8.ipp>
#if !defined(AURORA_RUNTIME_TO_STRING) #if !defined(AURORA_RUNTIME_TO_STRING)
#define AURORA_RUNTIME_TO_STRING std::to_string #define AURORA_RUNTIME_TO_STRING std::to_string

View File

@ -33,6 +33,7 @@
#include <auROXTL/auContainerUtils.hpp> #include <auROXTL/auContainerUtils.hpp>
#include <auROXTL/auListUtils.hpp> #include <auROXTL/auListUtils.hpp>
#include <auROXTL/auStringUtils.hpp> #include <auROXTL/auStringUtils.hpp>
#include <auROXTL/Strings/auCodepointsUTF8.hpp>
#include <auROXTL/Iterators/auUTF8Iterator.ipp> #include <auROXTL/Iterators/auUTF8Iterator.ipp>
#include <auROXTL/Iterators/auReverseIterator.ipp> #include <auROXTL/Iterators/auReverseIterator.ipp>
#include <auROXTL/auTupleUtils.hpp> #include <auROXTL/auTupleUtils.hpp>
@ -84,6 +85,9 @@ namespace __audetail
#include <auROXTL/Strings/auLinerParserAndSplitter.ipp> #include <auROXTL/Strings/auLinerParserAndSplitter.ipp>
#include <auROXTL/Strings/auCodepointsUTF8.hpp>
#include <auROXTL/Strings/auCodepointsUTF8.ipp>
struct IAuNullDelegate struct IAuNullDelegate
{ {
virtual void OnCall() = 0; virtual void OnCall() = 0;