//# This file is a part of toml++ and is subject to the the terms of the MIT license. //# Copyright (c) 2019-2020 Mark Gillard //# Copyright (c) 2008-2010 Bjoern Hoehrmann (utf8_decoder) //# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text. // SPDX-License-Identifier: MIT #pragma once #include "toml_utf8_generated.h" namespace toml::impl { template [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_match(char32_t codepoint, T... vals) noexcept { static_assert((std::is_same_v && ...)); return ((codepoint == vals) || ...); } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_ascii_whitespace(char32_t codepoint) noexcept { return codepoint == U'\t' || codepoint == U' '; } [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_unicode_whitespace(char32_t codepoint) noexcept { // see: https://en.wikipedia.org/wiki/Whitespace_character#Unicode // (characters that don't say "is a line-break") return codepoint == U'\u00A0' // no-break space || codepoint == U'\u1680' // ogham space mark || (codepoint >= U'\u2000' && codepoint <= U'\u200A') // em quad -> hair space || codepoint == U'\u202F' // narrow no-break space || codepoint == U'\u205F' // medium mathematical space || codepoint == U'\u3000' // ideographic space ; } [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_whitespace(char32_t codepoint) noexcept { return is_ascii_whitespace(codepoint) || is_unicode_whitespace(codepoint); } template [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_ascii_line_break(char32_t codepoint) noexcept { constexpr auto low_range_end = IncludeCarriageReturn ? U'\r' : U'\f'; return (codepoint >= U'\n' && codepoint <= low_range_end); } [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_unicode_line_break(char32_t codepoint) noexcept { // see https://en.wikipedia.org/wiki/Whitespace_character#Unicode // (characters that say "is a line-break") return codepoint == U'\u0085' // next line || codepoint == U'\u2028' // line separator || codepoint == U'\u2029' // paragraph separator ; } template [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_line_break(char32_t codepoint) noexcept { return is_ascii_line_break(codepoint) || is_unicode_line_break(codepoint); } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_string_delimiter(char32_t codepoint) noexcept { return codepoint == U'"' || codepoint == U'\''; } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_ascii_letter(char32_t codepoint) noexcept { return (codepoint >= U'a' && codepoint <= U'z') || (codepoint >= U'A' && codepoint <= U'Z'); } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_binary_digit(char32_t codepoint) noexcept { return codepoint == U'0' || codepoint == U'1'; } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_octal_digit(char32_t codepoint) noexcept { return (codepoint >= U'0' && codepoint <= U'7'); } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_decimal_digit(char32_t codepoint) noexcept { return (codepoint >= U'0' && codepoint <= U'9'); } template [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr std::uint_least32_t hex_to_dec(T codepoint) noexcept { if constexpr (std::is_same_v, std::uint_least32_t>) return codepoint >= 0x41u // >= 'A' ? 10u + (codepoint | 0x20u) - 0x61u // - 'a' : codepoint - 0x30u // - '0' ; else return hex_to_dec(static_cast(codepoint)); } [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_bare_key_character(char32_t codepoint) noexcept { return is_ascii_letter(codepoint) || is_decimal_digit(codepoint) || codepoint == U'-' || codepoint == U'_' #if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) & toml/issues/687 (unicode bare keys) || codepoint == U'+' || is_unicode_letter(codepoint) || is_unicode_number(codepoint) || is_unicode_combining_mark(codepoint) #endif ; } [[nodiscard]] TOML_GNU_ATTR(const) constexpr bool is_value_terminator(char32_t codepoint) noexcept { return is_ascii_line_break(codepoint) || is_ascii_whitespace(codepoint) || codepoint == U']' || codepoint == U'}' || codepoint == U',' || codepoint == U'#' || is_unicode_line_break(codepoint) || is_unicode_whitespace(codepoint) ; } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_control_character(char32_t codepoint) noexcept { return codepoint <= U'\u001F' || codepoint == U'\u007F'; } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_nontab_control_character(char32_t codepoint) noexcept { return codepoint <= U'\u0008' || (codepoint >= U'\u000A' && codepoint <= U'\u001F') || codepoint == U'\u007F'; } [[nodiscard]] TOML_GNU_ATTR(const) TOML_ALWAYS_INLINE constexpr bool is_unicode_surrogate(char32_t codepoint) noexcept { return codepoint >= 0xD800u && codepoint <= 0xDFFF; } struct utf8_decoder final { uint_least32_t state{}; char32_t codepoint{}; static constexpr uint8_t state_table[] { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12 }; [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool error() const noexcept { return state == uint_least32_t{ 12u }; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool has_code_point() const noexcept { return state == uint_least32_t{}; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool needs_more_input() const noexcept { return state > uint_least32_t{} && state != uint_least32_t{ 12u }; } constexpr void operator () (uint8_t byte) noexcept { TOML_ASSERT(!error()); const auto type = state_table[byte]; codepoint = static_cast( has_code_point() ? (uint_least32_t{ 255u } >> type) & byte : (byte & uint_least32_t{ 63u }) | (static_cast(codepoint) << 6) ); state = state_table[state + uint_least32_t{ 256u } + type]; } }; }