2022-04-01 04:06:53 +00:00
/***
Copyright ( C ) 2022 J Reece Wilson ( a / k / a " Reece " ) . All rights reserved .
File : auStringUtils . hpp
Date : 2022 - 2 - 1
File : AuroraUtils . hpp
File : auROXTLUtils . hpp
Date : 2021 - 6 - 9
Author : Reece
2024-04-19 21:08:32 +00:00
Purpose : Introduces UTF - 8 utilities and the historically missing c + + string methods .
AuCodepoints * * * * supports the initial UTF - 8 specifications with codepoints up to 2 ^ 31 combinations ( illegal UTF - 16 )
Implements : AuStringContains , AuEndsWith , AuStartsWith , AuReplaceAll , AuSplitString ( views ) , AuSplitStringLegacy ( returns an array of strings instead of views )
AuToLower ( char ) , AuToUpper ( char ) , AuToLower ( view ) , AuToUpper ( view ) .
2024-06-14 13:32:12 +00:00
Implements : AuCodepointsTransform , AuCodepointsTransformASCIIOp , AuCodepointsForEach , AuCodepointsToLower , AuCodepointsToUpper ,
2024-08-31 20:21:07 +00:00
AuCodepointsCount , AuCodepointsNextLength , AuCodepointsDecodeOne , AuCodepointsDecode , AuCodepointsEncodeInto ,
2024-04-19 21:08:32 +00:00
AuCodepointsGetByteOffset ( CodepointOffset_t ) , AuCodepointsGetByteLength ( CodepointOffset_t ) ,
2024-04-19 22:12:39 +00:00
AuCodepointsFindByteOffset [ Unsafe ] , AuCodepointsFindCodepointOffset ( view , CodepointOffset_t ) , AuCodepointsFindCodepointOffset ( CodepointByteOffset_t ) ,
2024-04-19 21:08:32 +00:00
AuCodepointsContains ,
2024-04-19 22:12:39 +00:00
AuCodepointsReplaceAll , AuCodepointsSplitString ( views ) ,
AuCodepointsFindPreviousValidByteOffsetFromOffset , AuCodepointsFindPreviousValidByteOffsetFromByteOffset
2024-08-22 18:32:24 +00:00
AuCodepointsIsEqualIgnoreCase , AuCodepointsStartsWithEqualIgnoreCase , AuCodepointsEndsWithEqualIgnoreCase ,
AuCodepointsReverseIterate , AuCodepointsReverseIterateSubStrPrefixView , AuCodepointsReverseIterateSubStrSuffixView
2024-04-19 21:08:32 +00:00
2024-09-10 07:27:37 +00:00
Warning : By codepoints , we mean UTF32 .
Aurora uses UTF8 strings everywhere by convention .
Warning : For translating between locales ( including utf8 - 32 ) , defer to AuLocale ( Aurora : : Locale ) in the Aurora Runtime .
Although , you can decode and encode UTF8 to 32 here ( AuCodepointsDecodeOne , AuCodepointsDecode , AuCodepointsEncodeInto ) .
2022-04-01 04:06:53 +00:00
* * */
# pragma once
2024-04-19 21:08:32 +00:00
// offset in bytes
2024-04-28 10:56:30 +00:00
/* using CodepointByteOffset_t = decltype(AuROString::npos); */
2024-04-19 21:08:32 +00:00
// offset in codepoints
2024-04-28 10:56:30 +00:00
/* using CodepointOffset_t = AuUInt; */
2024-04-19 21:08:32 +00:00
2024-07-01 13:06:05 +00:00
# if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8)
2024-06-14 13:14:51 +00:00
static const AuUInt8 kAuCodepointUTF8MaxBytes = 4 ;
2024-07-01 13:06:05 +00:00
# elif defined(AURORA_UTF8_I_AM_REALLY_SPECIAL_AND_WANT_7_BYTE_UTF8)
static const AuUInt8 kAuCodepointUTF8MaxBytes = 7 ;
2024-06-14 13:14:51 +00:00
# else
2024-07-01 13:06:05 +00:00
// default:
2024-06-14 13:14:51 +00:00
static const AuUInt8 kAuCodepointUTF8MaxBytes = 6 ;
# endif
2024-07-01 13:06:05 +00:00
// none of these are defined by default
// Requesting AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW also switches on
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL, unless the latter has already been defined.
#if defined(AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW) && !defined(AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL)
#define AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL
#endif
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8_WANT_THROW | throws on 5-8 byte sequences, otherwise allows 5-6 byte sequences
// (We have use cases in ecosystem for non-utf16 strings, such as side-channel low-overhead text formatting)
// (Enable this if you're boring)
// AURORA_UTF8_THROW_WHEN_STUPIDILY_ILLEGAL | throws on 7-8 byte sequences, otherwise tries to process 7 byte sequences, breaks early, returns 0, and/or returns empty (check impl)
// AURORA_UTF8_I_SUCK_AND_WANT_MODERN_UTF8 | disallows 5-8 byte sequence decodes. ...IsEqualIgnoreCase gets dumbed down to a memcmp, assuming the entire block of memory is to be compared;
// ...ForEach will break early with false (usually implies a user break early condition);
// ...Transform will return an empty container
2024-04-19 22:12:39 +00:00
2024-09-10 07:27:37 +00:00
# include <auROXTL/Strings/auStringUtils.hpp>
# include <auROXTL/Strings/auCodepointsUTF8.hpp>
# include <auROXTL/Strings/auCodepointsUTF8.ipp>
2022-04-01 04:06:53 +00:00
// Conversion routine used by AuToString when fmt is unavailable.
// A build may predefine AURORA_RUNTIME_TO_STRING to substitute its own function.
#if !defined(AURORA_RUNTIME_TO_STRING)
#define AURORA_RUNTIME_TO_STRING std::to_string
#endif

// AuToString: converts `obj` into its textual representation.
//  obj     | the value to stringify; must be accepted by fmt::format("{}", ...) when _AUHAS_FMT
//          | is defined, otherwise by AURORA_RUNTIME_TO_STRING (std::to_string unless overridden)
//  returns | an AuString holding the formatted value
template <class T>
static auline AuString AuToString(const T &obj)
{
#if defined(_AUHAS_FMT)
    // locale independent and better optimized!
    // The format string has to be exactly "{}": any whitespace around the replacement field
    // would be copied into the output and diverge from the fallback branch below.
    return AuString(fmt::format("{}", obj));
#else
    // TODO: to_chars (locale independent)
    return AURORA_RUNTIME_TO_STRING(obj);
#endif
}