2020-01-04 14:21:38 +00:00
|
|
|
#pragma once
|
|
|
|
#include "toml_common.h"
|
|
|
|
#include "toml_utf8_generated.h"
|
|
|
|
|
2020-02-22 14:10:32 +00:00
|
|
|
TOML_IMPL_START
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
2020-02-03 09:12:43 +00:00
|
|
|
// Returns true when the codepoint is one of the two ASCII blanks
// recognised here: horizontal tab or space.
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool is_ascii_whitespace(char32_t codepoint) noexcept
{
	switch (codepoint)
	{
		case U'\t':
		case U' ':
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
// Returns true for the non-ASCII Unicode whitespace characters that are
// NOT line breaks.
// see: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
[[nodiscard]]
constexpr bool is_unicode_whitespace(char32_t codepoint) noexcept
{
	// contiguous block: en quad -> hair space
	if (codepoint >= U'\u2000' && codepoint <= U'\u200A')
		return true;

	switch (codepoint)
	{
		case U'\u00A0': // no-break space
		case U'\u1680': // ogham space mark
		case U'\u202F': // narrow no-break space
		case U'\u205F': // medium mathematical space
		case U'\u3000': // ideographic space
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
2020-02-03 09:12:43 +00:00
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool is_whitespace(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_ascii_whitespace(codepoint) || is_unicode_whitespace(codepoint);
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
template <bool CR = true>
|
2020-02-03 09:12:43 +00:00
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool is_ascii_line_break(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
constexpr auto low_range_end = CR ? U'\r' : U'\f';
|
|
|
|
return (codepoint >= U'\n' && codepoint <= low_range_end);
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
// Returns true for the non-ASCII Unicode whitespace characters that
// denote a line break.
// see https://en.wikipedia.org/wiki/Whitespace_character#Unicode
[[nodiscard]]
constexpr bool is_unicode_line_break(char32_t codepoint) noexcept
{
	switch (codepoint)
	{
		case U'\u0085': // next line
		case U'\u2028': // line separator
		case U'\u2029': // paragraph separator
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
2020-02-03 09:12:43 +00:00
|
|
|
template <bool CR = true>
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool is_line_break(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_ascii_line_break<CR>(codepoint) || is_unicode_line_break(codepoint);
|
|
|
|
}
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
// Returns true when the codepoint is a double or single quote.
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool is_string_delimiter(char32_t codepoint) noexcept
{
	switch (codepoint)
	{
		case U'"':
		case U'\'':
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
|
|
|
// Returns true when the codepoint is in [a-z] or [A-Z].
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool is_ascii_letter(char32_t codepoint) noexcept
{
	const bool lowercase = codepoint >= U'a' && codepoint <= U'z';
	const bool uppercase = codepoint >= U'A' && codepoint <= U'Z';
	return lowercase || uppercase;
}
|
|
|
|
|
|
|
|
// Returns true when the codepoint is '0' or '1'.
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool is_binary_digit(char32_t codepoint) noexcept
{
	switch (codepoint)
	{
		case U'0':
		case U'1':
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
|
|
|
// Returns true when the codepoint is in [0-7].
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool is_octal_digit(char32_t codepoint) noexcept
{
	return !(codepoint < U'0' || codepoint > U'7');
}
|
|
|
|
|
|
|
|
// Returns true when the codepoint is in [0-9].
[[nodiscard]] TOML_ALWAYS_INLINE
constexpr bool is_decimal_digit(char32_t codepoint) noexcept
{
	if (codepoint < U'0')
		return false;
	return codepoint <= U'9';
}
|
|
|
|
|
2020-01-07 15:52:50 +00:00
|
|
|
// Returns true when the codepoint is in [a-f], [A-F] or [0-9].
[[nodiscard]]
constexpr bool is_hexadecimal_digit(char32_t codepoint) noexcept
{
	if (codepoint >= U'a' && codepoint <= U'f')
		return true;
	if (codepoint >= U'A' && codepoint <= U'F')
		return true;

	// decimal digits
	return codepoint >= U'0' && codepoint <= U'9';
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool is_bare_key_start_character(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_ascii_letter(codepoint)
|
|
|
|
|| is_decimal_digit(codepoint)
|
|
|
|
|| codepoint == U'-'
|
|
|
|
|| codepoint == U'_'
|
|
|
|
#if TOML_LANG_HIGHER_THAN(0, 5, 0) // toml/issues/644 & toml/issues/687
|
2020-01-07 15:52:50 +00:00
|
|
|
|| codepoint == U'+'
|
2020-01-04 14:21:38 +00:00
|
|
|
|| is_unicode_letter(codepoint)
|
|
|
|
|| is_unicode_number(codepoint)
|
|
|
|
#endif
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool is_bare_key_character(char32_t codepoint) noexcept
|
|
|
|
{
|
|
|
|
return is_bare_key_start_character(codepoint)
|
|
|
|
#if TOML_LANG_HIGHER_THAN(0, 5, 0) // toml/issues/687
|
|
|
|
|| is_unicode_combining_mark(codepoint)
|
|
|
|
#endif
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool is_value_terminator(char32_t codepoint) noexcept
|
|
|
|
{
|
2020-02-03 09:12:43 +00:00
|
|
|
return is_ascii_line_break(codepoint)
|
|
|
|
|| is_ascii_whitespace(codepoint)
|
2020-01-04 14:21:38 +00:00
|
|
|
|| codepoint == U']'
|
|
|
|
|| codepoint == U'}'
|
|
|
|
|| codepoint == U','
|
|
|
|
|| codepoint == U'#'
|
2020-02-03 09:12:43 +00:00
|
|
|
|| is_unicode_line_break(codepoint)
|
|
|
|
|| is_unicode_whitespace(codepoint)
|
2020-01-04 14:21:38 +00:00
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct utf8_decoder final
|
|
|
|
{
|
2020-02-03 09:12:43 +00:00
|
|
|
//# This decoder is based on the 'Flexible and Economical UTF-8 Decoder'
|
|
|
|
//# Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
|
|
|
//# See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
2020-01-04 14:21:38 +00:00
|
|
|
|
|
|
|
uint_least32_t state{};
|
|
|
|
char32_t codepoint{};
|
|
|
|
|
|
|
|
static constexpr uint8_t state_table[]
|
|
|
|
{
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
|
|
|
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
|
|
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
|
|
|
|
|
|
|
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
|
|
|
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
|
|
|
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
|
|
|
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
|
|
|
12,36,12,12,12,12,12,12,12,12,12,12
|
|
|
|
};
|
|
|
|
|
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool error() const noexcept
|
|
|
|
{
|
|
|
|
return state == uint_least32_t{ 12u };
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool has_code_point() const noexcept
|
|
|
|
{
|
|
|
|
return state == uint_least32_t{};
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
constexpr bool needs_more_input() const noexcept
|
|
|
|
{
|
|
|
|
return state > uint_least32_t{} && state != uint_least32_t{ 12u };
|
|
|
|
}
|
|
|
|
|
|
|
|
constexpr void operator () (uint8_t byte) noexcept
|
|
|
|
{
|
|
|
|
TOML_ASSERT(!error());
|
|
|
|
|
|
|
|
const auto type = state_table[byte];
|
|
|
|
|
|
|
|
codepoint = static_cast<char32_t>(
|
|
|
|
has_code_point()
|
|
|
|
? (uint_least32_t{ 255u } >> type) & byte
|
|
|
|
: (byte & uint_least32_t{ 63u }) | (static_cast<uint_least32_t>(codepoint) << 6)
|
|
|
|
);
|
|
|
|
|
|
|
|
state = state_table[state + uint_least32_t{ 256u } + type];
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
class utf8_byte_stream;
|
|
|
|
|
|
|
|
template <typename CHAR>
|
|
|
|
class utf8_byte_stream<std::basic_string_view<CHAR>> final
|
|
|
|
{
|
|
|
|
static_assert(sizeof(CHAR) == 1_sz);
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::basic_string_view<CHAR> source;
|
|
|
|
size_t position = {};
|
|
|
|
|
|
|
|
public:
|
|
|
|
explicit constexpr utf8_byte_stream(std::basic_string_view<CHAR> sv) noexcept
|
|
|
|
: source{ sv }
|
|
|
|
{
|
|
|
|
if (source.length() >= 3_sz
|
2020-02-03 09:12:43 +00:00
|
|
|
&& static_cast<uint8_t>(source[0]) == static_cast<uint8_t>(0xEFu)
|
|
|
|
&& static_cast<uint8_t>(source[1]) == static_cast<uint8_t>(0xBBu)
|
|
|
|
&& static_cast<uint8_t>(source[2]) == static_cast<uint8_t>(0xBFu))
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
position += 3_sz;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool eof() const noexcept
|
|
|
|
{
|
|
|
|
return position >= source.length();
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr bool error() const noexcept
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-03-03 21:28:24 +00:00
|
|
|
constexpr optional<uint8_t> operator() () noexcept
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
if (position >= source.length())
|
|
|
|
return {};
|
|
|
|
return static_cast<uint8_t>(source[position++]);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename CHAR>
|
|
|
|
class utf8_byte_stream<std::basic_istream<CHAR>> final
|
|
|
|
{
|
|
|
|
static_assert(sizeof(CHAR) == 1_sz);
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::basic_istream<CHAR>* source;
|
|
|
|
|
|
|
|
public:
|
|
|
|
explicit utf8_byte_stream(std::basic_istream<CHAR>& stream) TOML_MAY_THROW
|
|
|
|
: source{ &stream }
|
|
|
|
{
|
|
|
|
if (*source)
|
|
|
|
{
|
|
|
|
static constexpr uint8_t bom[] {
|
2020-02-03 09:12:43 +00:00
|
|
|
0xEF,
|
|
|
|
0xBB,
|
|
|
|
0xBF
|
2020-01-04 14:21:38 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
using stream_traits = typename std::remove_pointer_t<decltype(source)>::traits_type;
|
|
|
|
const auto initial_pos = source->tellg();
|
|
|
|
size_t bom_pos{};
|
|
|
|
auto bom_char = source->get();
|
|
|
|
while (*source && bom_char != stream_traits::eof() && bom_char == bom[bom_pos])
|
|
|
|
{
|
|
|
|
bom_pos++;
|
|
|
|
bom_char = source->get();
|
|
|
|
}
|
|
|
|
if (!(*source) || bom_pos < 3_sz)
|
|
|
|
source->seekg(initial_pos);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
bool eof() const noexcept
|
|
|
|
{
|
|
|
|
return source->eof();
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
bool error() const noexcept
|
|
|
|
{
|
|
|
|
return !(*source);
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-03-03 21:28:24 +00:00
|
|
|
optional<uint8_t> operator() () TOML_MAY_THROW
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
auto val = source->get();
|
|
|
|
if (val == std::basic_istream<CHAR>::traits_type::eof())
|
|
|
|
return {};
|
|
|
|
return static_cast<uint8_t>(val);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
struct utf8_codepoint final
|
|
|
|
{
|
|
|
|
char32_t value;
|
2020-02-26 23:28:20 +00:00
|
|
|
string_char bytes[4];
|
|
|
|
source_position position;
|
2020-01-04 14:21:38 +00:00
|
|
|
|
2020-02-26 23:28:20 +00:00
|
|
|
template <typename CHAR = string_char>
|
2020-01-04 14:21:38 +00:00
|
|
|
[[nodiscard]] TOML_ALWAYS_INLINE
|
|
|
|
std::basic_string_view<CHAR> as_view() const noexcept
|
|
|
|
{
|
|
|
|
static_assert(
|
|
|
|
sizeof(CHAR) == 1,
|
|
|
|
"The string view's underlying character type must be 1 byte in size."
|
|
|
|
);
|
|
|
|
|
|
|
|
return bytes[3]
|
2020-02-26 23:28:20 +00:00
|
|
|
? std::basic_string_view<CHAR>{ reinterpret_cast<const CHAR*>(bytes), 4_sz }
|
|
|
|
: std::basic_string_view<CHAR>{ reinterpret_cast<const CHAR*>(bytes) };
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr operator char32_t& () noexcept
|
|
|
|
{
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
constexpr operator const char32_t& () const noexcept
|
|
|
|
{
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
static_assert(std::is_trivial_v<utf8_codepoint>);
|
|
|
|
static_assert(std::is_standard_layout_v<utf8_codepoint>);
|
|
|
|
|
|
|
|
// Error-reporting shims for the readers below.
// Without exceptions, TOML_ERROR constructs the error in place in an
// object named `err` and TOML_ERROR_CHECK returns nullptr once `err`
// holds a value. With exceptions, TOML_ERROR throws a parse_error and
// TOML_ERROR_CHECK is a no-op.
#if !TOML_EXCEPTIONS
	#define TOML_ERROR_CHECK	if (err) return nullptr
	#define TOML_ERROR			err.emplace
#else
	#define TOML_ERROR_CHECK	(void)0
	#define TOML_ERROR			throw parse_error
#endif
|
|
|
|
|
|
|
|
struct TOML_INTERFACE utf8_reader_interface
|
|
|
|
{
|
|
|
|
[[nodiscard]]
|
2020-02-26 23:28:20 +00:00
|
|
|
virtual const source_path_ptr& source_path() const noexcept = 0;
|
2020-01-04 14:21:38 +00:00
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
virtual const utf8_codepoint* read_next() TOML_MAY_THROW = 0;
|
|
|
|
|
|
|
|
#if !TOML_EXCEPTIONS
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-03-03 21:28:24 +00:00
|
|
|
virtual optional<parse_error>&& error() noexcept = 0;
|
2020-01-04 14:21:38 +00:00
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
virtual ~utf8_reader_interface() noexcept = default;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
class TOML_EMPTY_BASES utf8_reader final
|
|
|
|
: public utf8_reader_interface
|
|
|
|
{
|
|
|
|
private:
|
|
|
|
utf8_byte_stream<T> stream;
|
|
|
|
utf8_decoder decoder;
|
2020-03-07 23:06:53 +00:00
|
|
|
utf8_codepoint codepoints[2];
|
|
|
|
size_t cp_idx = 1;
|
2020-01-04 14:21:38 +00:00
|
|
|
uint8_t current_byte_count{};
|
2020-01-07 15:52:50 +00:00
|
|
|
source_path_ptr source_path_;
|
2020-01-04 14:21:38 +00:00
|
|
|
#if !TOML_EXCEPTIONS
|
2020-03-03 21:28:24 +00:00
|
|
|
optional<parse_error> err;
|
2020-01-04 14:21:38 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
template <typename U, typename STR = std::string_view>
|
|
|
|
explicit utf8_reader(U && source, STR&& source_path = {})
|
2020-01-22 21:29:46 +00:00
|
|
|
TOML_MAY_THROW_UNLESS(std::is_nothrow_constructible_v<utf8_byte_stream<T>, U&&>)
|
2020-01-04 14:21:38 +00:00
|
|
|
: stream{ std::forward<U>(source) }
|
|
|
|
{
|
2020-03-07 23:06:53 +00:00
|
|
|
std::memset(codepoints, 0, sizeof(codepoints));
|
|
|
|
codepoints[0].position = { 1, 1 };
|
|
|
|
codepoints[1].position = { 1, 1 };
|
2020-01-04 14:21:38 +00:00
|
|
|
|
|
|
|
if (!source_path.empty())
|
|
|
|
source_path_ = std::make_shared<const std::string>(std::forward<STR>(source_path));
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-01-07 15:52:50 +00:00
|
|
|
const source_path_ptr& source_path() const noexcept override
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
return source_path_;
|
|
|
|
}
|
|
|
|
|
|
|
|
[[nodiscard]]
|
|
|
|
const utf8_codepoint* read_next() TOML_MAY_THROW override
|
|
|
|
{
|
|
|
|
TOML_ERROR_CHECK;
|
|
|
|
|
2020-03-07 23:06:53 +00:00
|
|
|
auto& prev = codepoints[(cp_idx - 1_sz) % 2_sz];
|
|
|
|
|
2020-01-04 14:21:38 +00:00
|
|
|
if (stream.eof())
|
|
|
|
return nullptr;
|
|
|
|
else if (stream.error())
|
|
|
|
TOML_ERROR("An error occurred while reading from the underlying stream", prev.position, source_path_ );
|
|
|
|
else if (decoder.error())
|
|
|
|
TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ );
|
|
|
|
|
|
|
|
TOML_ERROR_CHECK;
|
|
|
|
|
|
|
|
while (true)
|
|
|
|
{
|
2020-03-03 21:28:24 +00:00
|
|
|
optional<uint8_t> nextByte;
|
2020-01-04 14:21:38 +00:00
|
|
|
if constexpr (!TOML_EXCEPTIONS || noexcept(stream()))
|
|
|
|
{
|
|
|
|
nextByte = stream();
|
|
|
|
}
|
|
|
|
#if TOML_EXCEPTIONS
|
|
|
|
else
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
|
|
|
nextByte = stream();
|
|
|
|
}
|
|
|
|
catch (const std::exception& exc)
|
|
|
|
{
|
2020-02-26 23:28:20 +00:00
|
|
|
throw parse_error{ exc.what(), prev.position, source_path_ };
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
2020-02-26 23:28:20 +00:00
|
|
|
throw parse_error{ "An unspecified error occurred", prev.position, source_path_ };
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (!nextByte)
|
|
|
|
{
|
|
|
|
if (stream.eof())
|
|
|
|
{
|
|
|
|
if (decoder.needs_more_input())
|
2020-01-06 18:21:16 +00:00
|
|
|
TOML_ERROR("Encountered EOF during incomplete utf-8 code point sequence",
|
|
|
|
prev.position, source_path_);
|
2020-01-04 14:21:38 +00:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
else
|
2020-01-06 18:21:16 +00:00
|
|
|
TOML_ERROR("An error occurred while reading from the underlying stream",
|
|
|
|
prev.position, source_path_);
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
TOML_ERROR_CHECK;
|
|
|
|
|
|
|
|
decoder(*nextByte);
|
|
|
|
if (decoder.error())
|
|
|
|
TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ );
|
|
|
|
|
|
|
|
TOML_ERROR_CHECK;
|
|
|
|
|
2020-03-07 23:06:53 +00:00
|
|
|
auto& current = codepoints[cp_idx % 2_sz];
|
2020-02-26 23:28:20 +00:00
|
|
|
current.bytes[current_byte_count++] = static_cast<string_char>(*nextByte);
|
2020-01-04 14:21:38 +00:00
|
|
|
if (decoder.has_code_point())
|
|
|
|
{
|
2020-03-07 23:06:53 +00:00
|
|
|
//store codepoint
|
2020-01-04 14:21:38 +00:00
|
|
|
current.value = decoder.codepoint;
|
|
|
|
|
2020-03-07 23:06:53 +00:00
|
|
|
//reset prev (will be the next 'current')
|
|
|
|
std::memset(prev.bytes, 0, sizeof(prev.bytes));
|
|
|
|
current_byte_count = {};
|
|
|
|
if (is_line_break<false>(current.value))
|
|
|
|
prev.position = { static_cast<source_index>(current.position.line + 1), 1 };
|
2020-01-04 14:21:38 +00:00
|
|
|
else
|
2020-03-07 23:06:53 +00:00
|
|
|
prev.position = { current.position.line, static_cast<source_index>(current.position.column + 1) };
|
|
|
|
cp_idx++;
|
|
|
|
return ¤t;
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
}
|
2020-03-07 23:06:53 +00:00
|
|
|
|
|
|
|
TOML_UNREACHABLE;
|
2020-01-04 14:21:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#if !TOML_EXCEPTIONS
|
|
|
|
|
|
|
|
[[nodiscard]]
|
2020-03-03 21:28:24 +00:00
|
|
|
optional<parse_error>&& error() noexcept override
|
2020-01-04 14:21:38 +00:00
|
|
|
{
|
|
|
|
return std::move(err);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename CHAR>
|
|
|
|
utf8_reader(std::basic_string_view<CHAR>, std::string_view) -> utf8_reader<std::basic_string_view<CHAR>>;
|
|
|
|
|
|
|
|
template <typename CHAR>
|
|
|
|
utf8_reader(std::basic_istream<CHAR>&, std::string_view) -> utf8_reader<std::basic_istream<CHAR>>;
|
|
|
|
|
|
|
|
template <typename CHAR>
|
|
|
|
utf8_reader(std::basic_string_view<CHAR>, std::string&&) -> utf8_reader<std::basic_string_view<CHAR>>;
|
|
|
|
|
|
|
|
template <typename CHAR>
|
|
|
|
utf8_reader(std::basic_istream<CHAR>&, std::string&&) -> utf8_reader<std::basic_istream<CHAR>>;
|
|
|
|
|
|
|
|
// In exception-free builds, switch the check to query an object named
// `reader` instead of a local `err`.
#if !TOML_EXCEPTIONS
	#undef TOML_ERROR_CHECK
	#define TOML_ERROR_CHECK	if (reader.error()) return nullptr
#endif
|
|
|
|
|
|
|
|
// Wraps another reader and remembers the most recently read codepoints
// in a ring buffer so that the caller can step backwards and have them
// replayed by subsequent read_next() calls.
class TOML_EMPTY_BASES utf8_buffered_reader final
	: public utf8_reader_interface
{
	public:
		static constexpr size_t max_history_length = 64;

	private:
		static constexpr size_t history_buffer_size = max_history_length - 1; //'head' is stored in the reader
		utf8_reader_interface& reader;
		struct
		{
			utf8_codepoint buffer[history_buffer_size];
			size_t count, first;
		}
		history = {};
		const utf8_codepoint* head = {};	// most recent result of reader.read_next()
		size_t negative_offset = {};		// how far the caller has stepped back from head

		// the history entry that lies `negative_offset` codepoints behind head
		[[nodiscard]]
		const utf8_codepoint* history_entry() const noexcept
		{
			return history.buffer + ((history.first + history.count - negative_offset) % history_buffer_size);
		}

	public:

		explicit utf8_buffered_reader(utf8_reader_interface& reader_) noexcept
			: reader{ reader_ }
		{}

		[[nodiscard]]
		const source_path_ptr& source_path() const noexcept override
		{
			return reader.source_path();
		}

		// Returns the next codepoint. While the caller is stepped back,
		// codepoints are replayed from history; otherwise the current
		// head is pushed into history and a new one is read from the
		// wrapped reader.
		[[nodiscard]]
		const utf8_codepoint* read_next() TOML_MAY_THROW override
		{
			TOML_ERROR_CHECK;

			if (negative_offset)
			{
				negative_offset--;

				// having been exactly one step back means "replay the current head";
				// anything further back is still served from the history buffer
				if (!negative_offset)
					return head;
				return history_entry();
			}

			// first character read from stream
			if (!history.count && !head) TOML_UNLIKELY
				head = reader.read_next();

			// subsequent characters and not eof
			else if (head)
			{
				// append until the buffer is full, then overwrite the oldest entry
				if (history.count < history_buffer_size) TOML_UNLIKELY
					history.buffer[history.count++] = *head;
				else
					history.buffer[(history.first++ + history_buffer_size) % history_buffer_size] = *head;

				head = reader.read_next();
			}

			return head;
		}

		// Moves the read position `count` codepoints backwards and
		// returns the codepoint now at that position. The total
		// step-back distance must not exceed the number of codepoints
		// held in history.
		[[nodiscard]]
		const utf8_codepoint* step_back(size_t count) noexcept
		{
			TOML_ERROR_CHECK;
			TOML_ASSERT(history.count);
			TOML_ASSERT(negative_offset + count <= history.count);

			negative_offset += count;

			if (!negative_offset)
				return head;
			return history_entry();
		}

		#if !TOML_EXCEPTIONS

		// forwards the wrapped reader's stored error
		[[nodiscard]]
		optional<parse_error>&& error() noexcept override
		{
			return reader.error();
		}

		#endif
};
|
|
|
|
|
|
|
|
// the error-reporting shims are not needed past this point
#undef TOML_ERROR
#undef TOML_ERROR_CHECK
|
|
|
|
}
|
2020-02-22 14:10:32 +00:00
|
|
|
TOML_IMPL_END
|