2020-03-12 15:23:25 +00:00
|
|
|
//# This file is a part of toml++ and is subject to the terms of the MIT license.
|
|
|
|
//# Copyright (c) 2019-2020 Mark Gillard <mark.gillard@outlook.com.au>
|
|
|
|
//# Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> (utf8_decoder)
|
|
|
|
//# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
|
2020-04-10 16:46:00 +00:00
|
|
|
// SPDX-License-Identifier: MIT
|
2020-03-12 15:23:25 +00:00
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
#pragma once
|
|
|
|
#include "toml_utf8_generated.h"
|
|
|
|
|
2020-03-28 16:56:59 +00:00
|
|
|
namespace toml::impl
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
2020-04-06 12:57:49 +00:00
|
|
|
template <typename... T>
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-04-06 12:57:49 +00:00
|
|
|
constexpr bool is_match(char32_t codepoint, T... vals) noexcept
|
|
|
|
{
|
|
|
|
static_assert((std::is_same_v<char32_t, T> && ...));
|
|
|
|
return ((codepoint == vals) || ...);
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-02-03 09:12:43 +00:00
|
|
|
constexpr bool is_ascii_whitespace(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return codepoint == U'\t' || codepoint == U' ';
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-02-03 09:12:43 +00:00
|
|
|
constexpr bool is_unicode_whitespace(char32_t codepoint) noexcept
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
// see: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
|
|
|
|
// (characters that don't say "is a line-break")
|
|
|
|
|
2020-02-03 09:12:43 +00:00
|
|
|
return codepoint == U'\u00A0' // no-break space
|
2020-01-04 14:21:38 +00:00
|
|
|
|| codepoint == U'\u1680' // ogham space mark
|
|
|
|
|| (codepoint >= U'\u2000' && codepoint <= U'\u200A') // em quad -> hair space
|
|
|
|
|| codepoint == U'\u202F' // narrow no-break space
|
|
|
|
|| codepoint == U'\u205F' // medium mathematical space
|
|
|
|
|| codepoint == U'\u3000' // ideographic space
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2020-02-03 09:12:43 +00:00
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-02-03 09:12:43 +00:00
|
|
|
constexpr bool is_whitespace(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_ascii_whitespace(codepoint) || is_unicode_whitespace(codepoint);
|
|
|
|
}
|
|
|
|
|
2020-04-02 21:39:21 +00:00
|
|
|
template <bool IncludeCarriageReturn = true>
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-02-03 09:12:43 +00:00
|
|
|
constexpr bool is_ascii_line_break(char32_t codepoint) noexcept
|
|
|
|
{
|
2020-04-02 21:39:21 +00:00
|
|
|
constexpr auto low_range_end = IncludeCarriageReturn ? U'\r' : U'\f';
|
2020-02-03 09:12:43 +00:00
|
|
|
return (codepoint >= U'\n' && codepoint <= low_range_end);
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-02-03 09:12:43 +00:00
|
|
|
constexpr bool is_unicode_line_break(char32_t codepoint) noexcept
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
// see https://en.wikipedia.org/wiki/Whitespace_character#Unicode
|
|
|
|
// (characters that say "is a line-break")
|
|
|
|
|
2020-02-03 09:12:43 +00:00
|
|
|
return codepoint == U'\u0085' // next line
|
2020-01-04 14:21:38 +00:00
|
|
|
|| codepoint == U'\u2028' // line separator
|
|
|
|
|| codepoint == U'\u2029' // paragraph separator
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2020-04-02 21:39:21 +00:00
|
|
|
template <bool IncludeCarriageReturn = true>
|
2020-02-03 09:12:43 +00:00
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-02-03 09:12:43 +00:00
|
|
|
constexpr bool is_line_break(char32_t codepoint) noexcept
|
|
|
|
{
|
2020-04-02 21:39:21 +00:00
|
|
|
return is_ascii_line_break<IncludeCarriageReturn>(codepoint) || is_unicode_line_break(codepoint);
|
2020-02-03 09:12:43 +00:00
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_string_delimiter(char32_t codepoint) noexcept
|
|
|
|
{
|
2020-01-07 15:52:50 +00:00
|
|
|
return codepoint == U'"' || codepoint == U'\'';
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_ascii_letter(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return (codepoint >= U'a' && codepoint <= U'z')
|
|
|
|
|| (codepoint >= U'A' && codepoint <= U'Z');
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_binary_digit(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return codepoint == U'0' || codepoint == U'1';
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_octal_digit(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return (codepoint >= U'0' && codepoint <= U'7');
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_decimal_digit(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return (codepoint >= U'0' && codepoint <= U'9');
|
|
|
|
}
|
|
|
|
|
2020-01-07 15:52:50 +00:00
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-01-11 21:15:24 +00:00
|
|
|
constexpr bool is_hexadecimal_digit(char32_t codepoint) noexcept
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
return (codepoint >= U'a' && codepoint <= U'f')
|
|
|
|
|| (codepoint >= U'A' && codepoint <= U'F')
|
2020-02-03 09:12:43 +00:00
|
|
|
|| is_decimal_digit(codepoint)
|
|
|
|
;
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-04-06 12:57:49 +00:00
|
|
|
constexpr uint32_t hex_to_dec(char codepoint) noexcept
|
|
|
|
{
|
|
|
|
return codepoint >= 'A'
|
|
|
|
? 10u + static_cast<uint32_t>(codepoint - (codepoint >= 'a' ? 'a' : 'A'))
|
|
|
|
: static_cast<uint32_t>(codepoint - '0')
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-04-06 12:57:49 +00:00
|
|
|
constexpr uint32_t hex_to_dec(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return codepoint >= U'A'
|
|
|
|
? 10u + static_cast<uint32_t>(codepoint - (codepoint >= U'a' ? U'a' : U'A'))
|
|
|
|
: static_cast<uint32_t>(codepoint - U'0')
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_bare_key_start_character(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_ascii_letter(codepoint)
|
|
|
|
|| is_decimal_digit(codepoint)
|
|
|
|
|| codepoint == U'-'
|
|
|
|
|| codepoint == U'_'
|
2020-04-02 21:39:21 +00:00
|
|
|
#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) & toml/issues/687 (unicode bare keys)
|
2020-01-07 15:52:50 +00:00
|
|
|
|| codepoint == U'+'
|
2020-01-04 14:21:38 +00:00
|
|
|
|| is_unicode_letter(codepoint)
|
|
|
|
|| is_unicode_number(codepoint)
|
|
|
|
#endif
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_bare_key_character(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_bare_key_start_character(codepoint)
|
2020-04-02 21:39:21 +00:00
|
|
|
#if TOML_LANG_UNRELEASED // toml/issues/687 (unicode bare keys)
|
2020-01-04 14:21:38 +00:00
|
|
|
|| is_unicode_combining_mark(codepoint)
|
|
|
|
#endif
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-04-09 08:13:12 +00:00
|
|
|
TOML_GNU_ATTR(const)
|
2020-01-04 14:21:38 +00:00
|
|
|
constexpr bool is_value_terminator(char32_t codepoint) noexcept
|
|
|
|
{
|
2020-02-03 09:12:43 +00:00
|
|
|
return is_ascii_line_break(codepoint)
|
|
|
|
|| is_ascii_whitespace(codepoint)
|
2020-01-04 14:21:38 +00:00
|
|
|
|| codepoint == U']'
|
|
|
|
|| codepoint == U'}'
|
|
|
|
|| codepoint == U','
|
|
|
|
|| codepoint == U'#'
|
2020-02-03 09:12:43 +00:00
|
|
|
|| is_unicode_line_break(codepoint)
|
|
|
|
|| is_unicode_whitespace(codepoint)
|
2020-01-04 14:21:38 +00:00
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-04-02 21:39:21 +00:00
|
|
|
constexpr bool is_nontab_control_character(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return codepoint <= U'\u0008'
|
|
|
|
|| (codepoint >= U'\u000A' && codepoint <= U'\u001F')
|
|
|
|
|| codepoint == U'\u007F';
|
|
|
|
}
|
|
|
|
|
2020-04-09 08:13:12 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
TOML_GNU_ATTR(const)
|
|
|
|
TOML_ALWAYS_INLINE
|
2020-04-02 21:39:21 +00:00
|
|
|
constexpr bool is_unicode_surrogate(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return codepoint >= 0xD800u && codepoint <= 0xDFFF;
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
struct utf8_decoder final
|
|
|
|
{
|
|
|
|
uint_least32_t state{};
|
|
|
|
char32_t codepoint{};
|
|
|
|
|
|
|
|
static constexpr uint8_t state_table[]
|
|
|
|
{
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
|
|
|
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
|
|
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
|
|
|
|
|
|
|
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
|
|
|
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
|
|
|
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
|
|
|
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
|
|
|
12,36,12,12,12,12,12,12,12,12,12,12
|
|
|
|
};
|
|
|
|
|
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool error() const noexcept
|
|
|
|
{
|
|
|
|
return state == uint_least32_t{ 12u };
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool has_code_point() const noexcept
|
|
|
|
{
|
|
|
|
return state == uint_least32_t{};
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool needs_more_input() const noexcept
|
|
|
|
{
|
|
|
|
return state > uint_least32_t{} && state != uint_least32_t{ 12u };
|
|
|
|
}
|
|
|
|
|
|
|
|
constexpr void operator () (uint8_t byte) noexcept
|
|
|
|
{
|
|
|
|
TOML_ASSERT(!error());
|
|
|
|
|
|
|
|
const auto type = state_table[byte];
|
|
|
|
|
|
|
|
codepoint = static_cast<char32_t>(
|
|
|
|
has_code_point()
|
|
|
|
? (uint_least32_t{ 255u } >> type) & byte
|
|
|
|
: (byte & uint_least32_t{ 63u }) | (static_cast<uint_least32_t>(codepoint) << 6)
|
|
|
|
);
|
|
|
|
|
|
|
|
state = state_table[state + uint_least32_t{ 256u } + type];
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
2020-03-28 16:56:59 +00:00
|
|
|
|