/***
    Copyright (C) 2022 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: auStringUtils.hpp
    Date: 2022-2-1
    File: AuroraUtils.hpp
    File: auROXTLUtils.hpp
    Date: 2021-6-9
    Author: Reece
    Purpose: Introduces UTF-8 utilities and the historically missing C++ string methods.
             AuCodepoints**** supports the initial UTF-8 specifications with codepoints up to 2^31 combinations (illegal in UTF-16)

    Implements: AuStringContains, AuEndsWith, AuStartsWith, AuReplaceAll, AuSplitString (views), AuSplitStringLegacy (returns an array of strings instead of views)
                AuToLower(char), AuToUpper(char), AuToLower(view), AuToUpper(view).
    Implements: AuCodepointsTransform, AuCodepointsTransformASCIIOp, AuCodepointsForEach, AuCodepointsToLower, AuCodepointsToUpper,
                AuCodepointsCount, AuCodepointsNextLength, AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto,
                AuCodepointsGetByteOffset(CodepointOffset_t), AuCodepointsGetByteLength(CodepointOffset_t),
                AuCodepointsFindByteOffset[Unsafe], AuCodepointsFindCodepointOffset(view, CodepointOffset_t), AuCodepointsFindCodepointOffset(CodepointByteOffset_t),
                AuCodepointsContains,
                AuCodepointsReplaceAll, AuCodepointsSplitString (views),
                AuCodepointsFindPreviousValidByteOffsetFromOffset, AuCodepointsFindPreviousValidByteOffsetFromByteOffset
                AuCodepointsIsEqualIgnoreCase, AuCodepointsStartsWithEqualIgnoreCase, AuCodepointsEndsWithEqualIgnoreCase,
                AuCodepointsReverseIterate, AuCodepointsReverseIterateSubStrPrefixView, AuCodepointsReverseIterateSubStrSuffixView

    Warning: By codepoints, we mean UTF32.
             Aurora uses UTF8 strings everywhere by convention.

    Warning: For translating between locales (including utf8-32), defer to AuLocale (Aurora::Locale) in the Aurora Runtime.
             Although, you can decode and encode UTF8 to 32 here (AuCodepointsDecodeOne, AuCodepointsDecode, AuCodepointsEncodeInto).
***/

#pragma once

// offset in bytes
/* using CodepointByteOffset_t = decltype(AuROString::npos); */

// offset in codepoints
/* using CodepointOffset_t = AuUInt; */

#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8)
|
|
static const AuUInt8 kAuCodepointUTF8MaxBytes = 4;
|
|
#elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8)
|
|
static const AuUInt8 kAuCodepointUTF8MaxBytes = 7;
|
|
#else
|
|
// default:
|
|
static const AuUInt8 kAuCodepointUTF8MaxBytes = 6;
|
|
#endif
|
|
|
|
// none of these are defined by default
//
// Defining AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW implies
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL; the inner guard avoids redefining it
// when the build has already set it.
// NOTE(review): nothing visible in this file makes ..._WANT_THROW also define
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 - confirm whether that is intended.
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW)
    #if !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
        #define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL
    #endif
#endif

// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences
//                                                      (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting)
//                                                      (Enable this if you're boring)
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL           | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl)
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8            | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase gets dumbed down to a memcmp assuming the entire block of memory is to be compared;
//                                                      ...ForEach will break early with false (usually implies a user break early condition);
//                                                      ...Translate will return an empty container

#include <auROXTL/Strings/auStringUtils.hpp>
|
|
#include <auROXTL/Strings/auCodepointsUTF8.hpp>
|
|
#include <auROXTL/Strings/auCodepointsUTF8.ipp>
|
|
|
|
// Conversion routine AuToString falls back to when _AUHAS_FMT is not defined.
// A build may predefine AURORA_RUNTIME_TO_STRING to substitute its own callable;
// otherwise it defaults to std::to_string.
#if !defined(AURORA_RUNTIME_TO_STRING)
    #define AURORA_RUNTIME_TO_STRING std::to_string
#endif

template <class T>
|
|
static auline AuString AuToString(const T &obj)
|
|
{
|
|
#if defined(_AUHAS_FMT)
|
|
// locale independent and better optimized!
|
|
return AuString(fmt::format("{}", obj));
|
|
#else
|
|
// TODO: to_chars (locale independent)
|
|
return AURORA_RUNTIME_TO_STRING(obj);
|
|
#endif
|
|
} |