tomlplusplus/include/toml++/toml_utf8.h
Mark Gillard 42af364887 refactored parser
Mainly to simplify a the error handling code (it had gotten a bit verbose), but also to reduce compiled binary sizes.

also:
- moved windows terminal code page stuff in examples to a separate file
2020-04-11 19:43:38 +03:00

264 lines
7.2 KiB
C++

//# This file is a part of toml++ and is subject to the the terms of the MIT license.
//# Copyright (c) 2019-2020 Mark Gillard <mark.gillard@outlook.com.au>
//# Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> (utf8_decoder)
//# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
// SPDX-License-Identifier: MIT
#pragma once
#include "toml_utf8_generated.h"
namespace toml::impl
{
template <typename... T>
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_match(char32_t codepoint, T... vals) noexcept
{
static_assert((std::is_same_v<char32_t, T> && ...));
return ((codepoint == vals) || ...);
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_ascii_whitespace(char32_t codepoint) noexcept
{
return codepoint == U'\t' || codepoint == U' ';
}
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_unicode_whitespace(char32_t codepoint) noexcept
{
// see: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
// (characters that don't say "is a line-break")
return codepoint == U'\u00A0' // no-break space
|| codepoint == U'\u1680' // ogham space mark
|| (codepoint >= U'\u2000' && codepoint <= U'\u200A') // em quad -> hair space
|| codepoint == U'\u202F' // narrow no-break space
|| codepoint == U'\u205F' // medium mathematical space
|| codepoint == U'\u3000' // ideographic space
;
}
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_whitespace(char32_t codepoint) noexcept
{
return is_ascii_whitespace(codepoint) || is_unicode_whitespace(codepoint);
}
template <bool IncludeCarriageReturn = true>
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_ascii_line_break(char32_t codepoint) noexcept
{
constexpr auto low_range_end = IncludeCarriageReturn ? U'\r' : U'\f';
return (codepoint >= U'\n' && codepoint <= low_range_end);
}
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_unicode_line_break(char32_t codepoint) noexcept
{
// see https://en.wikipedia.org/wiki/Whitespace_character#Unicode
// (characters that say "is a line-break")
return codepoint == U'\u0085' // next line
|| codepoint == U'\u2028' // line separator
|| codepoint == U'\u2029' // paragraph separator
;
}
template <bool IncludeCarriageReturn = true>
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_line_break(char32_t codepoint) noexcept
{
return is_ascii_line_break<IncludeCarriageReturn>(codepoint) || is_unicode_line_break(codepoint);
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_string_delimiter(char32_t codepoint) noexcept
{
return codepoint == U'"' || codepoint == U'\'';
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_ascii_letter(char32_t codepoint) noexcept
{
return (codepoint >= U'a' && codepoint <= U'z')
|| (codepoint >= U'A' && codepoint <= U'Z');
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_binary_digit(char32_t codepoint) noexcept
{
return codepoint == U'0' || codepoint == U'1';
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_octal_digit(char32_t codepoint) noexcept
{
return (codepoint >= U'0' && codepoint <= U'7');
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_decimal_digit(char32_t codepoint) noexcept
{
return (codepoint >= U'0' && codepoint <= U'9');
}
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_hexadecimal_digit(char32_t codepoint) noexcept
{
return (codepoint >= U'a' && codepoint <= U'f')
|| (codepoint >= U'A' && codepoint <= U'F')
|| is_decimal_digit(codepoint)
;
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr uint32_t hex_to_dec(char codepoint) noexcept
{
return codepoint >= 'A'
? 10u + static_cast<uint32_t>(codepoint - (codepoint >= 'a' ? 'a' : 'A'))
: static_cast<uint32_t>(codepoint - '0')
;
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr uint32_t hex_to_dec(char32_t codepoint) noexcept
{
return codepoint >= U'A'
? 10u + static_cast<uint32_t>(codepoint - (codepoint >= U'a' ? U'a' : U'A'))
: static_cast<uint32_t>(codepoint - U'0')
;
}
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_bare_key_character(char32_t codepoint) noexcept
{
return is_ascii_letter(codepoint)
|| is_decimal_digit(codepoint)
|| codepoint == U'-'
|| codepoint == U'_'
#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) & toml/issues/687 (unicode bare keys)
|| codepoint == U'+'
|| is_unicode_letter(codepoint)
|| is_unicode_number(codepoint)
|| is_unicode_combining_mark(codepoint)
#endif
;
}
[[nodiscard]]
TOML_GNU_ATTR(const)
constexpr bool is_value_terminator(char32_t codepoint) noexcept
{
return is_ascii_line_break(codepoint)
|| is_ascii_whitespace(codepoint)
|| codepoint == U']'
|| codepoint == U'}'
|| codepoint == U','
|| codepoint == U'#'
|| is_unicode_line_break(codepoint)
|| is_unicode_whitespace(codepoint)
;
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_nontab_control_character(char32_t codepoint) noexcept
{
return codepoint <= U'\u0008'
|| (codepoint >= U'\u000A' && codepoint <= U'\u001F')
|| codepoint == U'\u007F';
}
[[nodiscard]]
TOML_GNU_ATTR(const)
TOML_ALWAYS_INLINE
constexpr bool is_unicode_surrogate(char32_t codepoint) noexcept
{
return codepoint >= 0xD800u && codepoint <= 0xDFFF;
}
struct utf8_decoder final
{
uint_least32_t state{};
char32_t codepoint{};
static constexpr uint8_t state_table[]
{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
};
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool error() const noexcept
{
return state == uint_least32_t{ 12u };
}
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool has_code_point() const noexcept
{
return state == uint_least32_t{};
}
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool needs_more_input() const noexcept
{
return state > uint_least32_t{} && state != uint_least32_t{ 12u };
}
constexpr void operator () (uint8_t byte) noexcept
{
TOML_ASSERT(!error());
const auto type = state_table[byte];
codepoint = static_cast<char32_t>(
has_code_point()
? (uint_least32_t{ 255u } >> type) & byte
: (byte & uint_least32_t{ 63u }) | (static_cast<uint_least32_t>(codepoint) << 6)
);
state = state_table[state + uint_least32_t{ 256u } + type];
}
};
}