[*] Move strings to auROXTL/Strings/* (excl iterator)

This commit is contained in:
Reece Wilson 2024-09-10 08:27:37 +01:00
parent ea2703e559
commit 37bb7e805e
7 changed files with 1848 additions and 1581 deletions

View File

@ -0,0 +1,149 @@
/***
Copyright (C) 2024 Jamie Reece Wilson (a/k/a "Reece"). All rights reserved.
File: auCodepointsUTF8.hpp
Date: 2024-09-09
File: auStringUtils.hpp
Date: 2022-2-1
File: AuroraUtils.hpp
File: auROXTLUtils.hpp
Date: 2021-6-9
Author: Reece
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
AuCodepointsContains,
AuCodepointsReplaceAll, AuCodepointsSplitString (views),
AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase,
AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView, AuCodepointsReverseIterateSubStrSuffixView
Warning: By codepoints, we mean UTF32.
Aurora uses UTF8 strings everywhere by convention.
Warning: For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
Although, you can decode and encode UTF8 to 32 here (AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto).
***/
#pragma once
/// Tries to decode one UTF-32 codepoint from the UTF-8 sequence `in`.
/// The result is wrapped in an AuOptional; presumably it is empty when no codepoint could be decoded (definition not visible here - TODO confirm).
static constexpr AuOptional<AuUInt32> AuCodepointsDecodeOne(const AuROString &in);
/// Tries to decode all UTF-32 codepoints from the UTF-8 sequence `in`, returning them as a list.
/// NOTE(review): how a malformed sequence is reported (partial list vs. empty list) is not visible from this declaration.
static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in);
/// Tries to encode one codepoint (`uCodepoint`) as UTF-8 into the string buffer `out`.
/// NOTE(review): presumably appends to `out`; confirm against the definition.
static void AuCodepointsEncodeInto(AuUInt32 uCodepoint,
AuString &out);
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToLower counterparts.
/// No localization complexities are, will, or should be involved.
/// The translated text is returned by value as a new AuString; `in` is only read.
static auline AuString AuCodepointsToLower(const AuROString &in);
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToUpper counterparts.
/// No localization complexities are, will, or should be involved.
/// The translated text is returned by value as a new AuString; `in` is only read.
static auline AuString AuCodepointsToUpper(const AuROString &in);
/// Counts the UTF8/UTF32 codepoints in the byte sequence `in`.
/// NOTE(review): the result for a malformed sequence is not visible from this declaration.
static auline constexpr CodepointOffset_t AuCodepointsCount(const AuROString &in);
/// Counts the bytes required to iterate over one UTF8 encoded codepoint.
/// Presumably measured for the codepoint at the beginning of `in` - TODO confirm against the definition.
static auline constexpr CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in);
/// Iterates over the UTF8 sequence `in`, invoking `op` along the way.
/// If op returns void, AuCodepointsForEach will continue until EOS or an invalid sequence.
/// If op returns bool, AuCodepointsForEach will continue until EOS, an invalid sequence, or !op.
/// Returns false on invalid sequence or user break, otherwise returns true.
/// NOTE(review): the exact argument(s) passed to `op` are defined by the implementation, which is not visible here.
template <class T>
static bool AuCodepointsForEach(T op, const AuROString &in);
/// Translates the `in` UTF8 sequence using a transformer of the form U32(*fMyUTF32Translator)(U32).
/// The translated sequence is returned by value as a new AuString.
template <class T>
static AuString AuCodepointsTransform(T op, const AuROString &in);
/// Translates the `in` UTF8 sequence using a transformer of the form U8(*fMyASCIITranslator)(U8).
/// The translated sequence is returned by value as a new AuString.
/// NOTE(review): unlike its siblings this declaration is not marked `static`.
template <class T>
AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in);
/// Case-insensitive (ASCII English only) equality test of two UTF8 views.
/// Performs a memcmp on unknown chunks, performs a memcmp for each UTF8 sequence of bytes, and performs a AuToLower(a) != AuToLower(b) operation on any ASCII English characters.
/// Also see: AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase
static bool AuCodepointsIsEqualIgnoreCase(const AuROString &inA,
const AuROString &inB);
/// Case-insensitive (ASCII English only) prefix test.
/// Performs a memcmp on unknown chunks, performs a memcmp for each UTF8 sequence of bytes, and performs a AuToLower(a) != AuToLower(b) operation on any ASCII English characters.
/// NOTE(review): presumably tests whether inA starts with inB - confirm the argument roles against the definition.
/// Also see: AuCodepointsIsEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase
static bool AuCodepointsStartsWithEqualIgnoreCase(const AuROString &inA,
const AuROString &inB);
/// Case-insensitive (ASCII English only) suffix test; the counterpart of AuCodepointsStartsWithEqualIgnoreCase.
/// NOTE(review): presumably tests whether inA ends with inB - confirm the argument roles against the definition.
/// Also see: AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase
static bool AuCodepointsEndsWithEqualIgnoreCase(const AuROString &inA,
const AuROString &inB);
/// Maps a codepoint index to a byte position inside `in`.
/// Returns the byte offset of the codepoint index or AuROString::npos
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
CodepointOffset_t uCodepointIndex);
/// Measures the encoded size of the codepoint at a codepoint index inside `in`.
/// Returns the length of the codepoint index in bytes or AuROString::npos
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
CodepointOffset_t uCodepointIndex);
/// Finds subpattern in value at codepoint offset, returning bool
/// uStartPosition is expressed in codepoints (CodepointOffset_t) and defaults to the value-initialized offset.
static bool AuCodepointsContains(const AuROString &value,
const AuROString &subpattern,
CodepointOffset_t uStartPosition = {});
/// Performs a terribly inefficient find in sequence operation.
/// Returns the byte offset of `find` within `in`, searching from uStartPosition bytes, or AuROString::npos
/// Assuming the in sequence is valid, you can use other traditional binary methods.
/// UTF8 sets the higher most bits to signify multibyte sequence position; you can arbitrarily scan for a complete sequence of UTF8 characters in a legal buffer without any special logic.
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
const AuROString &find,
CodepointByteOffset_t uStartPosition = {});
/// Same contract as AuCodepointsFindByteOffset above; slightly less dumb, but still too bloated.
/// The main difference is that uStartPosition is trusted to be a valid start offset (and has no default value).
/// Worst thing about this is, assuming a valid string view, we don't need to worry about testing the validity of the previous bytes (<uStartPosition), or even the current codepoint byte offset (uStartPosition).
static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
const AuROString &find,
CodepointByteOffset_t uStartPosition);
/// An inefficient seek backwards to byte offset given an arbitrary codepoint offset.
/// Returns byte offset for codepoint offset - 1 or AuROString::npos
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(const AuROString &in,
CodepointOffset_t uStartPosition = {});
/// An efficient seek backwards to byte offset given an arbitrary codepoint byte offset.
/// Returns byte offset for codepoint byte offset - 1 or AuROString::npos
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
CodepointByteOffset_t uStartPosition = {});
/// Overload taking a byte position (CodepointByteOffset_t) and yielding a codepoint offset (CodepointOffset_t).
/// NOTE(review): undocumented in the original; presumably converts uBytePosition within `in` to the matching codepoint index - confirm, including the result for an invalid position.
static constexpr CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
CodepointByteOffset_t uBytePosition);
/// Finds `find` in `in` starting at codepoint offset uStartPosition, returning a codepoint offset or AuROString::npos
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
const AuROString &find,
CodepointOffset_t uStartPosition = {});
/// For a given valid UTF8 string view, provides the delta byte offset needed to delete the last codepoint, or AuROString::npos.
static constexpr AuUInt AuCodepointsReverseIterate(const AuROString &string);
/// For a given valid UTF8 string view, provides a view with the last codepoint (the suffix) removed, or an empty view.
static constexpr AuROString AuCodepointsReverseIterateSubStrPrefixView(const AuROString &string);
/// For a given valid UTF8 string view, provides a view of just the last codepoint (the suffix that would be removed), or an empty view.
static constexpr AuROString AuCodepointsReverseIterateSubStrSuffixView(const AuROString &string);
/// Replace-all operating on `str`; takes the string by mutable reference and returns an AuString reference.
/// NOTE(review): undocumented in the original; presumably the codepoint-aware counterpart of AuReplaceAll that replaces each `from` with `to` in place and returns `str` - confirm against the definition.
static AuString & AuCodepointsReplaceAll(AuString &str,
const AuROString &from,
const AuROString &to);
/// Splits `str` around `delim`, returning a list of views (AuROString) rather than owned strings.
/// NOTE(review): undocumented in the original; presumably the codepoint-aware counterpart of AuSplitString, with bIgnoreEmpty controlling whether empty tokens are dropped - confirm against the definition.
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
const AuROString &delim,
bool bIgnoreEmpty = true);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,138 @@
/***
Copyright (C) 2022-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: auStringUtils.hpp
Date: 2024-09-09
File: auStringUtils.hpp
Date: 2022-2-1
File: AuroraUtils.hpp
File: auROXTLUtils.hpp
Date: 2021-6-9
Author: Reece
***/
#pragma once
/// Forward declaration, needed by the AuToLower(view) alias defined further down in this header.
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToLower counterparts.
/// No localization complexities are, will, or should be involved.
static AuString AuCodepointsToLower(const AuROString &in);
/// Forward declaration, needed by the AuToUpper(view) alias defined further down in this header.
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToUpper counterparts.
/// No localization complexities are, will, or should be involved.
static AuString AuCodepointsToUpper(const AuROString &in);
/// Returns true only for the ASCII English letters 'A'-'Z' and 'a'-'z'.
/// Locale independent; NUL, digits, punctuation and bytes >= 0x80 are never alphabetic.
static auline constexpr bool AuIsAlpha(char c)
{
    // Setting bit 0x20 folds upper case onto lower case. The range test must be
    // carried out in *unsigned* arithmetic: after integral promotion a plain
    // `(unsigned char)c | 0x20` is a signed int, so every value below 'a'
    // (space, digits, '@', ...) would yield a negative difference and wrongly
    // satisfy `< 26`. With unsigned operands those values wrap to huge numbers.
    return ((static_cast<unsigned int>(static_cast<unsigned char>(c)) | 0x20u) - static_cast<unsigned int>('a')) < 26u;
}

/// Converts an ASCII English upper case letter to lower case; any other char is returned unchanged.
static auline constexpr char AuToLower(char c)
{
    // Letters differ between cases only in bit 0x20; setting it selects lower case.
    return AuIsAlpha(c) ? static_cast<char>(c | 0x20) : c;
}

/// Converts an ASCII English lower case letter to upper case; any other char is returned unchanged.
static auline constexpr char AuToUpper(char c)
{
    // Clearing bit 0x20 selects upper case.
    return AuIsAlpha(c) ? static_cast<char>(c & ~0x20) : c;
}
/// Reports whether `subpattern` occurs anywhere inside `value`.
/// Implemented as a single AuROString::find call whose result is compared against AuROString::npos.
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{
    const auto uFoundAt = value.find(subpattern);
    return uFoundAt != AuROString::npos;
}
/// Reports whether `value` finishes with `ending`.
/// An `ending` longer than `value` can never match; otherwise the two views are compared back-to-front across the whole of `ending`.
static auline constexpr bool AuEndsWith(AuROString const &value, AuROString const &ending)
{
    // Guard first: the reverse comparison below assumes value is at least as long as ending.
    if (value.size() < ending.size())
    {
        return false;
    }

    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
/// Reports whether `value` begins with `starting`.
static auline constexpr bool AuStartsWith(AuROString const &value, AuROString const &starting)
{
#if !defined(AU_STRING_IS_TINYUTF_EXPERIMENT)
    // rfind anchored at position 0 can only ever match at the very beginning of value.
    return value.rfind(starting, 0) == 0;
#else
    return value.starts_with(starting);
#endif
}
/// Alias for AuCodepointsToLower
/// String-view overload of AuToLower: forwards `in` unchanged and returns whatever AuCodepointsToLower returns.
static AuString AuToLower(const AuROString &in)
{
return AuCodepointsToLower(in);
}
/// Alias for AuCodepointsToUpper
/// String-view overload of AuToUpper: forwards `in` unchanged and returns whatever AuCodepointsToUpper returns.
static AuString AuToUpper(const AuROString &in)
{
return AuCodepointsToUpper(in);
}
/// Replaces every occurrence of `from` inside `str` with `to`, in place.
/// The scan resumes immediately after each inserted `to`, so replacement text is never re-scanned
/// (a `to` that contains `from` cannot cause runaway recursion).
/// An empty `from` leaves `str` untouched.
/// Returns `str` to allow chaining.
static AuString &AuReplaceAll(AuString &str,
                              const AuROString &from,
                              const AuROString &to)
{
    // An empty pattern is found at every offset; without this guard the loop
    // below would never terminate (and would grow str without bound when `to`
    // is non-empty).
    if (from.length() == 0)
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = str.find(from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        // Skip past the text we just inserted.
        uStartPosition += to.length();
    }
    return str;
}
// i told myself not to copy this, required a split function twice, now here we are :D
/// Splits `str` around every occurrence of `delim` and returns the pieces as views into `str`
/// (no copies are made, so the views are only valid while the underlying buffer is).
/// @param str          text to split
/// @param delim        separator; when empty, `str` is treated as one single token
/// @param bIgnoreEmpty when true (default) empty tokens are dropped; when false they are kept
/// Note: scanning stops once the end of `str` is reached, so an empty token that would follow a
/// trailing delimiter is not produced, even when bIgnoreEmpty is false.
static AuList<AuROString> AuSplitString(const AuROString &str,
                                        const AuROString &delim,
                                        bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    // An empty delimiter matches at every offset without advancing, which would
    // spin forever in the loop below; there is nothing to split on.
    if (delim.length() == 0)
    {
        if ((str.length() != 0) || !bIgnoreEmpty)
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    // Rough guess of the token count to limit reallocations.
    tokens.reserve(str.size() / 16);
    do
    {
        uPos = str.find(delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // No further delimiter: the final token runs to the end of the input.
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // Keep non-empty tokens always; keep empty ones only when the caller asked for them.
        // (The previous `!empty && bIgnoreEmpty` dropped *every* token when bIgnoreEmpty was false.)
        if ((!token.empty()) || !bIgnoreEmpty)
        {
            tokens.push_back(token);
        }

        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());
    return tokens;
}
/// Splits `str` around every occurrence of `delim` and returns the pieces as owned AuString copies
/// (legacy flavour of AuSplitString, which returns views instead).
/// @param str          text to split
/// @param delim        separator; when empty, `str` is treated as one single token
/// @param bIgnoreEmpty when true (default) empty tokens are dropped; when false they are kept
/// Note: scanning stops once the end of `str` is reached, so an empty token that would follow a
/// trailing delimiter is not produced, even when bIgnoreEmpty is false.
static AuList<AuString> AuSplitStringLegacy(const AuROString &str,
                                            const AuROString &delim,
                                            bool bIgnoreEmpty = true)
{
    AuList<AuString> tokens;

    // An empty delimiter matches at every offset without advancing, which would
    // spin forever in the loop below; there is nothing to split on.
    if (delim.length() == 0)
    {
        if ((str.length() != 0) || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(str));
        }
        return tokens;
    }

    AuUInt prev = 0, pos = 0;
    // Rough guess of the token count to limit reallocations.
    tokens.reserve(str.size() / 16);
    do
    {
        pos = str.find(delim, prev);
        if (pos == AuROString::npos)
        {
            // No further delimiter: the final token runs to the end of the input.
            pos = str.length();
        }

        auto token = str.substr(prev, pos - prev);
        // Keep non-empty tokens always; keep empty ones only when the caller asked for them.
        // (The previous `!empty && bIgnoreEmpty` dropped *every* token when bIgnoreEmpty was false.)
        if ((!token.empty()) || !bIgnoreEmpty)
        {
            tokens.push_back(AuString(token));
        }

        prev = pos + delim.length();
    }
    while (pos < str.length() && prev < str.length());
    return tokens;
}

File diff suppressed because it is too large Load Diff

View File

@ -23,8 +23,8 @@ static auline constexpr bool AuStartsWith(AuROString const &value, AuROString co
#include "auMemoryUtils.hpp" #include "auMemoryUtils.hpp"
#include "Iterators/auROString.hpp" #include "Strings/auROString.hpp"
#include "Iterators/auRONString.hpp" #include "Strings/auRONString.hpp"
inline constexpr bool operator==(const AuROString &lhs, inline constexpr bool operator==(const AuROString &lhs,
const AuROString &rhs) noexcept const AuROString &rhs) noexcept