AuROXTL/Include/auROXTL/auStringUtils.hpp

82 lines
4.3 KiB
C++

/***
Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: auStringUtils.hpp
Date: 2022-2-1
File: AuroraUtils.hpp
File: auROXTLUtils.hpp
Date: 2021-6-9
Author: Reece
Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)
Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
AuCodepointsGetByteOffset(CodepointOffset_t),AuCodepointsGetByteLength(CodepointOffset_t),
AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
AuCodepointsContains,
AuCodepointsReplaceAll, AuCodepointsSplitString (views),
AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase,
AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView, AuCodepointsReverseIterateSubStrSuffixView
Warning: By codepoints, we mean UTF32.
Aurora uses UTF8 strings everywhere by convention.
Warning: For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
Although, you can decode and encode UTF8 to 32 here (AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto).
***/
#pragma once
// offset in bytes
/* using CodepointByteOffset_t = decltype(AuROString::npos); */
// offset in codepoints
/* using CodepointOffset_t = AuUInt; */
// kAuCodepointUTF8MaxBytes is chosen at compile time from the configuration macros:
//   AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8              -> 4
//   AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8 -> 7
//   neither of the above defined                         -> 6
// The first matching branch wins, so the "modern" macro takes priority if both are defined.
// NOTE(review): presumably the longest UTF-8 sequence (in bytes) the codepoint routines will
// accept for one codepoint; the consumers are not visible in this header - confirm there.
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 4;
#elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 7;
#else
// default:
static const AuUInt8 kAuCodepointUTF8MaxBytes = 6;
#endif
// none of these are defined by default
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW implies
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL: when the former is defined, define the latter
// as well, unless the includer has already defined it (avoids a macro redefinition).
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
#if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
#define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL
#endif
#endif
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences
// (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting)
// (Enable this if you're boring)
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl)
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase gets dumbed down to a memcmp assuming the entire block of memory is to be compared;
// ...ForEach will break early with false (usually implies a user break early condition);
// ...Translate will return an empty container
#include <auROXTL/Strings/auStringUtils.hpp>
#include <auROXTL/Strings/auCodepointsUTF8.hpp>
#include <auROXTL/Strings/auCodepointsUTF8.ipp>
// Conversion routine AuToString falls back to when _AUHAS_FMT is not defined.
// Defaults to std::to_string; define AURORA_RUNTIME_TO_STRING before including this
// header to substitute a different callable.
#if !defined(AURORA_RUNTIME_TO_STRING)
#define AURORA_RUNTIME_TO_STRING std::to_string
#endif
/**
 * Produces the textual form of a value as an AuString.
 *
 * With _AUHAS_FMT defined the value is rendered through fmt::format("{}", obj);
 * otherwise it is handed to AURORA_RUNTIME_TO_STRING (std::to_string unless overridden).
 *
 * @tparam T   type of the value; must be accepted by whichever backend is compiled in
 * @param  obj value to convert
 * @return     the string produced by the selected backend
 */
template <class T>
static auline AuString AuToString(const T &obj)
{
#if defined(_AUHAS_FMT)
    // locale independent and better optimized!
    AuString formatted(fmt::format("{}", obj));
    return formatted;
#else
    // TODO: to_chars (locale independent)
    AuString converted = AURORA_RUNTIME_TO_STRING(obj);
    return converted;
#endif
}