[*] Move strings to auROXTL/Strings/* (excl iterator)
This commit is contained in:
parent
ea2703e559
commit
37bb7e805e
149
Include/auROXTL/Strings/auCodepointsUTF8.hpp
Normal file
149
Include/auROXTL/Strings/auCodepointsUTF8.hpp
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
/***
|
||||||
|
Copyright (C) 2024 Jamie Reece Wilson (a/k/a "Reece"). All rights reserved.
|
||||||
|
|
||||||
|
File: auCodepointsUTF8.hpp
|
||||||
|
Date: 2024-09-09
|
||||||
|
File: auStringUtils.hpp
|
||||||
|
Date: 2022-2-1
|
||||||
|
File: AuroraUtils.hpp
|
||||||
|
File: auROXTLUtils.hpp
|
||||||
|
Date: 2021-6-9
|
||||||
|
Author: Reece
|
||||||
|
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
|
||||||
|
AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
|
||||||
|
|
||||||
|
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
|
||||||
|
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
|
||||||
|
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
|
||||||
|
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
|
||||||
|
AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
|
||||||
|
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
|
||||||
|
AuCodepointsContains,
|
||||||
|
AuCodepointsReplaceAll, AuCodepointsSplitString (views),
|
||||||
|
AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
|
||||||
|
AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase,
|
||||||
|
AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView, AuCodepointsReverseIterateSubStrSuffixView
|
||||||
|
|
||||||
|
Warning: By codepoints, we mean UTF32.
|
||||||
|
Aurora uses UTF8 strings everywhere by convention.
|
||||||
|
|
||||||
|
Warning: For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
|
||||||
|
Although, you can decode and encode UTF8 to 32 here (AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto).
|
||||||
|
***/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
/// Try decode UTF32 codepoint from UTF8 sequence
|
||||||
|
static constexpr AuOptional<AuUInt32> AuCodepointsDecodeOne(const AuROString &in);
|
||||||
|
|
||||||
|
/// Try decode all UTF32 codepoints from UTF8 sequence
|
||||||
|
static AuList<AuUInt32> AuCodepointsDecode(const AuROString &in);
|
||||||
|
|
||||||
|
/// Try encode one UTF8 codepoint into a string buffer
|
||||||
|
static void AuCodepointsEncodeInto(AuUInt32 uCodepoint,
|
||||||
|
AuString &out);
|
||||||
|
|
||||||
|
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToLower counterparts.
|
||||||
|
/// No localization complexities are, will, or should be involved.
|
||||||
|
static auline AuString AuCodepointsToLower(const AuROString &in);
|
||||||
|
|
||||||
|
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToUpper counterparts.
|
||||||
|
/// No localization complexities are, will, or should be involved.
|
||||||
|
static auline AuString AuCodepointsToUpper(const AuROString &in);
|
||||||
|
|
||||||
|
/// Counts the UTF8/UTF32 codepoints in a byte sequence
|
||||||
|
static auline constexpr CodepointOffset_t AuCodepointsCount(const AuROString &in);
|
||||||
|
|
||||||
|
/// Counts the bytes required to iterate over a UTF8 encoded codepoint
|
||||||
|
static auline constexpr CodepointByteOffset_t AuCodepointsNextLength(const AuROString &in);
|
||||||
|
|
||||||
|
/// Iterates over a UTF8 sequence using OP.
|
||||||
|
/// If op returns void, AuCodepointsForEach will continue until EOS or invalid sequence.
|
||||||
|
/// If op returns bool, AuCodepointsForEach will continue until EOS, invalid sequence, or !op.
|
||||||
|
/// Returns false on invalid sequence or user break, otherwise returns true.
|
||||||
|
template <class T>
|
||||||
|
static bool AuCodepointsForEach(T op, const AuROString &in);
|
||||||
|
|
||||||
|
/// Translates the in UTF8 sequence using a transformer of U32(*fMyUTF32Translator)(U32)
|
||||||
|
template <class T>
|
||||||
|
static AuString AuCodepointsTransform(T op, const AuROString &in);
|
||||||
|
|
||||||
|
/// Translates the in UTF8 sequence using a transformer of U8(*fMyASCIITranslator)(U8)
|
||||||
|
template <class T>
|
||||||
|
AuString AuCodepointsTransformASCIIOp(T op, const AuROString &in);
|
||||||
|
|
||||||
|
/// Performs a memcmp on unknown chunks, performs a memcmp for each UTF8 sequence of bytes, and performs a AuToLower(a) != AuToLower(b) operation on any ASCII English characters.
|
||||||
|
/// Also see: AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase
|
||||||
|
static bool AuCodepointsIsEqualIgnoreCase(const AuROString &inA,
|
||||||
|
const AuROString &inB);
|
||||||
|
|
||||||
|
/// Performs a memcmp on unknown chunks, performs a memcmp for each UTF8 sequence of bytes, and performs a AuToLower(a) != AuToLower(b) operation on any ASCII English characters.
|
||||||
|
/// Also see: AuCodepointsIsEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase
|
||||||
|
static bool AuCodepointsStartsWithEqualIgnoreCase(const AuROString &inA,
|
||||||
|
const AuROString &inB);
|
||||||
|
|
||||||
|
/// Also see: AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase
|
||||||
|
static bool AuCodepointsEndsWithEqualIgnoreCase(const AuROString &inA,
|
||||||
|
const AuROString &inB);
|
||||||
|
|
||||||
|
/// Returns the byte offset of the codepoint index or AuROString::npos
|
||||||
|
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteOffset(const AuROString &in,
|
||||||
|
CodepointOffset_t uCodepointIndex);
|
||||||
|
|
||||||
|
/// Returns the length of the codepoint index in bytes or AuROString::npos
|
||||||
|
static auline constexpr CodepointByteOffset_t AuCodepointsGetByteLength(const AuROString &in,
|
||||||
|
CodepointOffset_t uCodepointIndex);
|
||||||
|
|
||||||
|
/// Finds subpattern in value at codepoint offset, returning bool
|
||||||
|
static bool AuCodepointsContains(const AuROString &value,
|
||||||
|
const AuROString &subpattern,
|
||||||
|
CodepointOffset_t uStartPosition = {});
|
||||||
|
|
||||||
|
/// Performs a terribly inefficient find in sequence operation.
|
||||||
|
/// Returns the byte offset for find in in starting at uStartPosition bytes, or AuROString::npos
|
||||||
|
/// Assuming the in sequence is valid, you can use other traditional binary methods.
|
||||||
|
/// UTF8 sets the higher most bits to signify multibyte sequence position; you can arbitrarily scan for complete sequences of UTF8 characters in a legal buffer without any special logic.
|
||||||
|
static CodepointByteOffset_t AuCodepointsFindByteOffset(const AuROString &in,
|
||||||
|
const AuROString &find,
|
||||||
|
CodepointByteOffset_t uStartPosition = {});
|
||||||
|
|
||||||
|
/// AuCodepointsFindByteOffsetUnsafe: same as above (AuCodepointsFindByteOffset), slightly less dumb, but still too bloated.
|
||||||
|
/// The main difference is that uStartPosition is trusted to be a valid start offset.
|
||||||
|
/// Worst thing about this is, assuming a valid string view, we don't need to worry about testing the validity of the previous bytes (<uStartPosition), or even the current codepoint byte offset (uStartPosition).
|
||||||
|
static constexpr CodepointByteOffset_t AuCodepointsFindByteOffsetUnsafe(const AuROString &in,
|
||||||
|
const AuROString &find,
|
||||||
|
CodepointByteOffset_t uStartPosition);
|
||||||
|
|
||||||
|
/// An inefficient seek backwards to byte offset given an arbitrary codepoint offset.
|
||||||
|
/// Returns byte offset for codepoint offset - 1 or AuROString::npos
|
||||||
|
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromOffset(const AuROString &in,
|
||||||
|
CodepointOffset_t uStartPosition = {});
|
||||||
|
|
||||||
|
/// An efficient seek backwards to byte offset given an arbitrary codepoint byte offset.
|
||||||
|
/// Returns byte offset for codepoint byte offset - 1 or AuROString::npos
|
||||||
|
static constexpr CodepointByteOffset_t AuCodepointsFindPreviousValidByteOffsetFromByteOffset(const AuROString &in,
|
||||||
|
CodepointByteOffset_t uStartPosition = {});
|
||||||
|
|
||||||
|
static constexpr CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
|
||||||
|
CodepointByteOffset_t uBytePosition);
|
||||||
|
|
||||||
|
/// Finds subpattern in value at codepoint offset, returning a codepoint offset or AuROString::npos
|
||||||
|
static CodepointOffset_t AuCodepointsFindCodepointOffset(const AuROString &in,
|
||||||
|
const AuROString &find,
|
||||||
|
CodepointOffset_t uStartPosition = {});
|
||||||
|
|
||||||
|
/// For a given valid UTF8 string view, provides the delta byte offset or AuROString::npos, to delete the last most codepoint
|
||||||
|
static constexpr AuUInt AuCodepointsReverseIterate(const AuROString &string);
|
||||||
|
|
||||||
|
/// For a given valid UTF8 string view, to delete the last most codepoint, this provides a view with the suffix removed or an empty view.
|
||||||
|
static constexpr AuROString AuCodepointsReverseIterateSubStrPrefixView(const AuROString &string);
|
||||||
|
|
||||||
|
/// For a given valid UTF8 string view, to delete the last most codepoint, this provides a view with just the suffix to be removed or an empty view.
|
||||||
|
static constexpr AuROString AuCodepointsReverseIterateSubStrSuffixView(const AuROString &string);
|
||||||
|
|
||||||
|
static AuString & AuCodepointsReplaceAll(AuString &str,
|
||||||
|
const AuROString &from,
|
||||||
|
const AuROString &to);
|
||||||
|
|
||||||
|
static AuList<AuROString> AuCodepointsSplitString(const AuROString &str,
|
||||||
|
const AuROString &delim,
|
||||||
|
bool bIgnoreEmpty = true);
|
1551
Include/auROXTL/Strings/auCodepointsUTF8.ipp
Normal file
1551
Include/auROXTL/Strings/auCodepointsUTF8.ipp
Normal file
File diff suppressed because it is too large
Load Diff
138
Include/auROXTL/Strings/auStringUtils.hpp
Normal file
138
Include/auROXTL/Strings/auStringUtils.hpp
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
/***
|
||||||
|
Copyright (C) 2022-2024 J Reece Wilson (a/k/a "Reece"). All rights reserved.
|
||||||
|
|
||||||
|
File: auStringUtils.hpp
|
||||||
|
Date: 2024-09-09
|
||||||
|
File: auStringUtils.hpp
|
||||||
|
Date: 2022-2-1
|
||||||
|
File: AuroraUtils.hpp
|
||||||
|
File: auROXTLUtils.hpp
|
||||||
|
Date: 2021-6-9
|
||||||
|
Author: Reece
|
||||||
|
***/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToLower counterparts.
|
||||||
|
/// No localization complexities are, will, or should be involved.
|
||||||
|
static AuString AuCodepointsToLower(const AuROString &in);
|
||||||
|
|
||||||
|
/// Similar to AuCodepointsIsEqualIgnoreCase, translates all ASCII English characters to their AuToUpper counterparts.
|
||||||
|
/// No localization complexities are, will, or should be involved.
|
||||||
|
static AuString AuCodepointsToUpper(const AuROString &in);
|
||||||
|
|
||||||
|
/// Tests whether c is an ASCII English letter ('A'-'Z' or 'a'-'z').
/// No locale is consulted; NUL and bytes with the high bit set are never alphabetic.
///
/// The character is case-folded by OR-ing in 0x20 and then range-checked with one
/// *unsigned* subtraction: any folded value below 'a' wraps around to a huge number
/// and fails the `< 26` comparison. The subtraction must not be done on a promoted
/// (signed) int, where it would go negative and wrongly accept digits, spaces and
/// most punctuation.
static auline constexpr bool AuIsAlpha(char c)
{
    return ((static_cast<unsigned int>(static_cast<unsigned char>(c)) | 0x20u) - static_cast<unsigned int>('a')) < 26u;
}
|
||||||
|
|
||||||
|
/// Single-character lower-casing: when AuIsAlpha(c) holds, bit 0x20 is set on the character;
/// every other character is handed back unchanged.
static auline constexpr char AuToLower(char c)
{
    if (!AuIsAlpha(c))
    {
        return c;
    }

    return c | 0x20;
}
|
||||||
|
|
||||||
|
/// Single-character upper-casing: when AuIsAlpha(c) holds, bit 0x20 is cleared on the character;
/// every other character is handed back unchanged.
static auline constexpr char AuToUpper(char c)
{
    if (!AuIsAlpha(c))
    {
        return c;
    }

    return c & ~0x20;
}
|
||||||
|
|
||||||
|
/// Returns true when `subpattern` occurs somewhere inside `value`
/// (that is, when value.find does not report AuROString::npos).
static auline constexpr bool AuStringContains(const AuROString &value, const AuROString &subpattern)
{
    const auto uOffset = value.find(subpattern);
    return !(uOffset == AuROString::npos);
}
|
||||||
|
|
||||||
|
/// Returns true when `value` finishes with the characters of `ending`.
/// A longer `ending` can never match, so the size test short-circuits before the
/// back-to-front element comparison is attempted.
static auline constexpr bool AuEndsWith(AuROString const &value, AuROString const &ending)
{
    return (ending.size() <= value.size()) &&
           std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
|
||||||
|
|
||||||
|
/// Returns true when `value` begins with the characters of `starting`.
/// The implementation depends on whether AU_STRING_IS_TINYUTF_EXPERIMENT is defined.
static auline constexpr bool AuStartsWith(AuROString const &value, AuROString const &starting)
{
#if !defined(AU_STRING_IS_TINYUTF_EXPERIMENT)
    // A reverse find anchored at index 0 is expected to only ever match at the very start
    // (assumes AuROString::rfind follows std::string_view semantics).
    return 0 == value.rfind(starting, 0);
#else
    return value.starts_with(starting);
#endif
}
|
||||||
|
|
||||||
|
/// Alias for AuCodepointsToLower
|
||||||
|
static AuString AuToLower(const AuROString &in)
|
||||||
|
{
|
||||||
|
return AuCodepointsToLower(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Alias for AuCodepointsToUpper
|
||||||
|
static AuString AuToUpper(const AuROString &in)
|
||||||
|
{
|
||||||
|
return AuCodepointsToUpper(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Replaces, in place, every occurrence of `from` inside `str` with `to`, scanning left to right.
/// After each replacement the search resumes just past the inserted text, so text produced by a
/// replacement is never rescanned (`to` may itself contain `from`).
/// An empty `from` is treated as "nothing to replace" and leaves `str` untouched; without this
/// guard the search position would never move past a zero-length match and the loop would not end.
/// Returns `str` so calls can be chained.
static AuString &AuReplaceAll(AuString &str,
                              const AuROString &from,
                              const AuROString &to)
{
    if (from.empty())
    {
        return str;
    }

    AuUInt uStartPosition {};
    while ((uStartPosition = str.find(from, uStartPosition)) != AuROString::npos)
    {
        str.replace(uStartPosition, from.length(), to);
        // Skip over what was just written so it cannot be matched again
        uStartPosition += to.length();
    }
    return str;
}
|
||||||
|
|
||||||
|
// i told myself not to copy this, required a split function twice, now here we are :D
|
||||||
|
/// Splits `str` on every occurrence of `delim` and returns the pieces as views into `str`
/// (no copies are made, so the views are only valid while the underlying buffer is alive).
///
/// bIgnoreEmpty = true  : empty pieces (leading / consecutive delimiters) are dropped.
/// bIgnoreEmpty = false : empty pieces are kept. Note that scanning stops once the position after
///                        a delimiter reaches the end of `str`, so a trailing delimiter does not
///                        yield a trailing empty piece.
/// An empty `delim` cannot split anything: the whole of `str` is returned as one piece
/// (subject to bIgnoreEmpty) instead of scanning without ever advancing.
static AuList<AuROString> AuSplitString(const AuROString &str,
                                        const AuROString &delim,
                                        bool bIgnoreEmpty = true)
{
    AuList<AuROString> tokens;

    if (delim.empty())
    {
        if (!bIgnoreEmpty || !str.empty())
        {
            tokens.push_back(str);
        }
        return tokens;
    }

    AuUInt uPrev {}, uPos {};
    // Rough guess of one token per 16 bytes to limit reallocations
    tokens.reserve(str.size() / 16);
    do
    {
        uPos = str.find(delim, uPrev);
        if (uPos == AuROString::npos)
        {
            // No further delimiter: the final piece runs to the end of the input
            uPos = str.length();
        }

        auto token = str.substr(uPrev, uPos - uPrev);
        // Keep the piece unless it is empty and the caller asked to drop empties
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(token);
        }

        uPrev = uPos + delim.length();
    }
    while (uPos < str.length() && uPrev < str.length());
    return tokens;
}
|
||||||
|
|
||||||
|
/// Same splitting rules as AuSplitString, but every piece is copied into an owning AuString,
/// so the result does not depend on the lifetime of `str`.
///
/// bIgnoreEmpty = true  : empty pieces (leading / consecutive delimiters) are dropped.
/// bIgnoreEmpty = false : empty pieces are kept. Note that scanning stops once the position after
///                        a delimiter reaches the end of `str`, so a trailing delimiter does not
///                        yield a trailing empty piece.
/// An empty `delim` cannot split anything: the whole of `str` is returned as one piece
/// (subject to bIgnoreEmpty) instead of scanning without ever advancing.
static AuList<AuString> AuSplitStringLegacy(const AuROString &str,
                                            const AuROString &delim,
                                            bool bIgnoreEmpty = true)
{
    AuList<AuString> tokens;

    if (delim.empty())
    {
        if (!bIgnoreEmpty || !str.empty())
        {
            tokens.push_back(AuString(str));
        }
        return tokens;
    }

    AuUInt prev = 0, pos = 0;
    // Rough guess of one token per 16 bytes to limit reallocations
    tokens.reserve(str.size() / 16);
    do
    {
        pos = str.find(delim, prev);
        if (pos == AuROString::npos)
        {
            // No further delimiter: the final piece runs to the end of the input
            pos = str.length();
        }

        auto token = str.substr(prev, pos - prev);
        // Keep the piece unless it is empty and the caller asked to drop empties
        if (!bIgnoreEmpty || !token.empty())
        {
            tokens.push_back(AuString(token));
        }

        prev = pos + delim.length();
    }
    while (pos < str.length() && prev < str.length());
    return tokens;
}
|
File diff suppressed because it is too large
Load Diff
@ -23,8 +23,8 @@ static auline constexpr bool AuStartsWith(AuROString const &value, AuROString co
|
|||||||
|
|
||||||
#include "auMemoryUtils.hpp"
|
#include "auMemoryUtils.hpp"
|
||||||
|
|
||||||
#include "Iterators/auROString.hpp"
|
#include "Strings/auROString.hpp"
|
||||||
#include "Iterators/auRONString.hpp"
|
#include "Strings/auRONString.hpp"
|
||||||
|
|
||||||
inline constexpr bool operator==(const AuROString &lhs,
|
inline constexpr bool operator==(const AuROString &lhs,
|
||||||
const AuROString &rhs) noexcept
|
const AuROString &rhs) noexcept
|
||||||
|
Loading…
Reference in New Issue
Block a user