/***
    Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: auStringUtils.hpp
    Date: 2022-2-1
    File: AuroraUtils.hpp
    File: auROXTLUtils.hpp
    Date: 2021-6-9
    Author: Reece

    Purpose: Introduces UTF-8 utilities and the historically missing c++ string methods.
             AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal UTF-16)

    Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views),
                AuSplitStringLegacy (returns an array of strings instead of views),
                AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).

    Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach,
                AuCodepointsToLower, AuCodepointsToUpper, AuCodepointsCount, AuCodepointsNextLength,
                AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
                AuCodepointsGetByteOffset(CodepointOffset_t), AuCodepointsGetByteLength(CodepointOffset_t),
                AuCodepointsFindByteOffset[Unsafe],
                AuCodepointsFindCodepointOffset(view, CodepointOffset_t),
                AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
                AuCodepointsContains, AuCodepointsReplaceAll, AuCodepointsSplitString (views),
                AuCodepointsFindPreviousValidByteOffsetFromOffset,
                AuCodepointsFindPreviousValidByteOffsetFromByteOffset,
                AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithIgnoreCase, AuCodepointsEndsWithIgnoreCase,
                AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView,
                AuCodepointsReverseIterateSubStrSuffixView

    Warning: By codepoints, we mean UTF32. Aurora uses UTF8 strings everywhere by convention.

    Warning: For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
             Although, you can decode and encode UTF8 to 32 here (AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto).
***/ #pragma once // offset in bytes /* using CodepointByteOffset_t = decltype(AuROString::npos); */ // offset in codepoints /* using CodepointOffset_t = AuUInt; */ #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8) static const AuUInt8 kAuCodepointUTF8MaxBytes = 4; #elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8) static const AuUInt8 kAuCodepointUTF8MaxBytes = 7; #else // default: static const AuUInt8 kAuCodepointUTF8MaxBytes = 6; #endif // none of these are defined by default #if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) #if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL) #define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL #endif #endif // AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences // (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting) // (Enable this if you're boring) // AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl) // AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase gets dumbs down to a memcmp assuming the entire block of memory is to be compared; // ...ForEach will break early with false (usually implies a user break early condition); // ...Translate will return an empty container #include #include #include #if !defined(AURORA_RUNTIME_TO_STRING) #define AURORA_RUNTIME_TO_STRING std::to_string #endif template AU_OPTIMIZED AuString AuToString(const T &obj) { #if defined(_AUHAS_FMT) // locale independent and better optimized! return AuString(fmt::format("{}", obj)); #else // TODO: to_chars (locale independent) return AURORA_RUNTIME_TO_STRING(obj); #endif }