#pragma once #include "toml_common.h" #if TOML_LANG_HIGHER_THAN(0, 5, 0) // toml/issues/687 #include "toml_utf8_generated.h" #endif // TOML_LANG_HIGHER_THAN(0, 5, 0) namespace toml::impl { [[nodiscard]] constexpr bool is_whitespace(char32_t codepoint) noexcept { // see: https://en.wikipedia.org/wiki/Whitespace_character#Unicode // (characters that don't say "is a line-break") return codepoint == U'\t' || codepoint == U' ' || codepoint == U'\u00A0' // no-break space || codepoint == U'\u1680' // ogham space mark || (codepoint >= U'\u2000' && codepoint <= U'\u200A') // em quad -> hair space || codepoint == U'\u202F' // narrow no-break space || codepoint == U'\u205F' // medium mathematical space || codepoint == U'\u3000' // ideographic space ; } template [[nodiscard]] constexpr bool is_line_break(char32_t codepoint) noexcept { // see https://en.wikipedia.org/wiki/Whitespace_character#Unicode // (characters that say "is a line-break") constexpr auto low_range_end = CR ? U'\r' : U'\f'; return (codepoint >= U'\n' && codepoint <= low_range_end) || codepoint == U'\u0085' // next line || codepoint == U'\u2028' // line separator || codepoint == U'\u2029' // paragraph separator ; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool is_string_delimiter(char32_t codepoint) noexcept { return codepoint == U'"' || codepoint == U'\''; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool is_ascii_letter(char32_t codepoint) noexcept { return (codepoint >= U'a' && codepoint <= U'z') || (codepoint >= U'A' && codepoint <= U'Z'); } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool is_binary_digit(char32_t codepoint) noexcept { return codepoint == U'0' || codepoint == U'1'; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool is_octal_digit(char32_t codepoint) noexcept { return (codepoint >= U'0' && codepoint <= U'7'); } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool is_decimal_digit(char32_t codepoint) noexcept { return (codepoint >= U'0' && codepoint <= U'9'); } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool is_hex_digit(char32_t codepoint) noexcept { return (codepoint >= U'a' && codepoint <= U'f') || (codepoint >= U'A' && codepoint <= U'F') || is_decimal_digit(codepoint); } [[nodiscard]] constexpr bool is_bare_key_start_character(char32_t codepoint) noexcept { return is_ascii_letter(codepoint) || is_decimal_digit(codepoint) || codepoint == U'-' || codepoint == U'_' #if TOML_LANG_HIGHER_THAN(0, 5, 0) // toml/issues/644 & toml/issues/687 || codepoint == U'+' || is_unicode_letter(codepoint) || is_unicode_number(codepoint) #endif ; } [[nodiscard]] constexpr bool is_bare_key_character(char32_t codepoint) noexcept { return is_bare_key_start_character(codepoint) #if TOML_LANG_HIGHER_THAN(0, 5, 0) // toml/issues/687 || is_unicode_combining_mark(codepoint) #endif ; } [[nodiscard]] constexpr bool is_value_terminator(char32_t codepoint) noexcept { return is_line_break(codepoint) || is_whitespace(codepoint) || codepoint == U']' || codepoint == U'}' || codepoint == U',' || codepoint == U'#' ; } struct utf8_decoder final { // This decoder is based on code from here: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // // License: // // Copyright (c) 2008-2009 Bjoern Hoehrmann // // Permission is hereby granted, free of charge, to any person obtaining a copy of this // software and associated documentation files (the "Software"), to deal in the Software // without restriction, including without limitation the rights to use, copy, modify, merge, // publish, distribute, sublicense, and/or sell copies of the Software, and to permit // persons to whom the Software is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all copies // or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE // FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. uint_least32_t state{}; char32_t codepoint{}; static constexpr uint8_t state_table[] { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12 }; [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool error() const noexcept { return state == uint_least32_t{ 12u }; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool has_code_point() const noexcept { return state == uint_least32_t{}; } [[nodiscard]] TOML_ALWAYS_INLINE constexpr bool needs_more_input() const noexcept { return state > uint_least32_t{} && state != uint_least32_t{ 12u }; } constexpr void operator () (uint8_t byte) noexcept { TOML_ASSERT(!error()); const auto type = state_table[byte]; codepoint = static_cast( has_code_point() ? (uint_least32_t{ 255u } >> type) & byte : (byte & uint_least32_t{ 63u }) | (static_cast(codepoint) << 6) ); state = state_table[state + uint_least32_t{ 256u } + type]; } }; template class utf8_byte_stream; template class utf8_byte_stream> final { static_assert(sizeof(CHAR) == 1_sz); private: std::basic_string_view source; size_t position = {}; public: explicit constexpr utf8_byte_stream(std::basic_string_view sv) noexcept : source{ sv } { if (source.length() >= 3_sz && static_cast(source[0]) == 0xEF_u8 && static_cast(source[1]) == 0xBB_u8 && static_cast(source[2]) == 0xBF_u8) { position += 3_sz; } } [[nodiscard]] constexpr bool eof() const noexcept { return position >= source.length(); } [[nodiscard]] constexpr bool error() const noexcept { return false; } [[nodiscard]] constexpr std::optional operator() () noexcept { if (position >= source.length()) return {}; return static_cast(source[position++]); } }; template class utf8_byte_stream> final { static_assert(sizeof(CHAR) == 1_sz); private: std::basic_istream* source; public: explicit utf8_byte_stream(std::basic_istream& stream) TOML_MAY_THROW : source{ &stream } { if (*source) { static constexpr uint8_t bom[] { 0xEF_u8, 0xBB_u8, 0xBF_u8 }; using stream_traits = typename std::remove_pointer_t::traits_type; const auto initial_pos = source->tellg(); size_t bom_pos{}; auto bom_char = source->get(); while (*source && bom_char != stream_traits::eof() && bom_char == bom[bom_pos]) { bom_pos++; bom_char = source->get(); } if (!(*source) || bom_pos < 3_sz) source->seekg(initial_pos); } } [[nodiscard]] bool eof() const noexcept { return source->eof(); } [[nodiscard]] bool error() const noexcept { return !(*source); } [[nodiscard]] std::optional operator() () TOML_MAY_THROW { auto val = source->get(); if (val == std::basic_istream::traits_type::eof()) return {}; return static_cast(val); } }; struct utf8_codepoint final { char32_t value; uint8_t bytes[4]; toml::source_position position; template [[nodiscard]] TOML_ALWAYS_INLINE std::basic_string_view as_view() const noexcept { static_assert( sizeof(CHAR) == 1, "The string view's underlying character type must be 1 byte in size." ); return bytes[3] ? std::basic_string_view{ reinterpret_cast(bytes), 4_sz } : std::basic_string_view{ reinterpret_cast(bytes) }; } [[nodiscard]] constexpr operator char32_t& () noexcept { return value; } [[nodiscard]] constexpr operator const char32_t& () const noexcept { return value; } }; static_assert(std::is_trivial_v); static_assert(std::is_standard_layout_v); #if TOML_EXCEPTIONS #define TOML_ERROR_CHECK (void)0 #define TOML_ERROR(...) throw toml::parse_error( __VA_ARGS__ ) #else #define TOML_ERROR_CHECK if (err) return nullptr #define TOML_ERROR(...) err.emplace( __VA_ARGS__ ) #endif struct TOML_INTERFACE utf8_reader_interface { [[nodiscard]] virtual const std::shared_ptr& source_path() const noexcept = 0; [[nodiscard]] virtual const utf8_codepoint* read_next() TOML_MAY_THROW = 0; #if !TOML_EXCEPTIONS [[nodiscard]] virtual std::optional&& error() noexcept = 0; #endif virtual ~utf8_reader_interface() noexcept = default; }; template class TOML_EMPTY_BASES utf8_reader final : public utf8_reader_interface { private: utf8_byte_stream stream; utf8_decoder decoder; utf8_codepoint prev{}, current{}; uint8_t current_byte_count{}; std::shared_ptr source_path_; #if !TOML_EXCEPTIONS std::optional err; #endif public: template explicit utf8_reader(U && source, STR&& source_path = {}) TOML_CONDITIONAL_NOEXCEPT(std::is_nothrow_constructible_v, U&&>) : stream{ std::forward(source) } { current.position = { 1u, 1u }; if (!source_path.empty()) source_path_ = std::make_shared(std::forward(source_path)); } [[nodiscard]] const std::shared_ptr& source_path() const noexcept override { return source_path_; } [[nodiscard]] const utf8_codepoint* read_next() TOML_MAY_THROW override { TOML_ERROR_CHECK; if (stream.eof()) return nullptr; else if (stream.error()) TOML_ERROR("An error occurred while reading from the underlying stream", prev.position, source_path_ ); else if (decoder.error()) TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ ); TOML_ERROR_CHECK; while (true) { std::optional nextByte; if constexpr (!TOML_EXCEPTIONS || noexcept(stream())) { nextByte = stream(); } #if TOML_EXCEPTIONS else { try { nextByte = stream(); } catch (const std::exception& exc) { throw toml::parse_error{ exc.what(), prev.position, source_path_ }; } catch (...) { throw toml::parse_error{ "An unspecified error occurred", prev.position, source_path_ }; } } #endif if (!nextByte) { if (stream.eof()) { if (decoder.needs_more_input()) TOML_ERROR("Encountered EOF during incomplete utf-8 code point sequence", prev.position, source_path_); return nullptr; } else TOML_ERROR("An error occurred while reading from the underlying stream", prev.position, source_path_); } TOML_ERROR_CHECK; decoder(*nextByte); if (decoder.error()) TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ ); TOML_ERROR_CHECK; current.bytes[current_byte_count++] = *nextByte; if (decoder.has_code_point()) { current.value = decoder.codepoint; prev = current; std::memset(current.bytes, 0, sizeof(current.bytes)); current_byte_count = {}; if (is_line_break(prev.value)) { current.position.line++; current.position.column = 1u; } else current.position.column++; return &prev; } } } #if !TOML_EXCEPTIONS [[nodiscard]] std::optional&& error() noexcept override { return std::move(err); } #endif }; template utf8_reader(std::basic_string_view, std::string_view) -> utf8_reader>; template utf8_reader(std::basic_istream&, std::string_view) -> utf8_reader>; template utf8_reader(std::basic_string_view, std::string&&) -> utf8_reader>; template utf8_reader(std::basic_istream&, std::string&&) -> utf8_reader>; #if !TOML_EXCEPTIONS #undef TOML_ERROR_CHECK #define TOML_ERROR_CHECK if (reader.error()) return nullptr #endif class TOML_EMPTY_BASES utf8_buffered_reader final : public utf8_reader_interface { public: static constexpr auto max_history_length = 64_sz; private: static constexpr auto history_buffer_size = max_history_length - 1_sz; //the 'head' is stored in the underlying reader utf8_reader_interface& reader; struct { utf8_codepoint buffer[history_buffer_size]; size_t count, first; } history = {}; const utf8_codepoint* head = {}; size_t negative_offset = {}; public: explicit utf8_buffered_reader(utf8_reader_interface& reader_) noexcept : reader{ reader_ } {} [[nodiscard]] const std::shared_ptr& source_path() const noexcept override { return reader.source_path(); } [[nodiscard]] const utf8_codepoint* read_next() TOML_MAY_THROW override { TOML_ERROR_CHECK; if (negative_offset) { negative_offset--; // an entry negative offset of 1 just means "replay the current head" if (!negative_offset) return head; // otherwise step back into the history buffer else return history.buffer + ((history.first + history.count - negative_offset) % history_buffer_size); } else { // first character read from stream if (!history.count && !head) TOML_UNLIKELY head = reader.read_next(); // subsequent characters and not eof else if (head) { if (history.count < history_buffer_size) TOML_UNLIKELY history.buffer[history.count++] = *head; else history.buffer[(history.first++ + history_buffer_size) % history_buffer_size] = *head; head = reader.read_next(); } return head; } } [[nodiscard]] const utf8_codepoint* step_back(size_t count) noexcept { TOML_ERROR_CHECK; TOML_ASSERT(history.count); TOML_ASSERT(negative_offset + count <= history.count); negative_offset += count; return negative_offset ? history.buffer + ((history.first + history.count - negative_offset) % history_buffer_size) : head; } #if !TOML_EXCEPTIONS [[nodiscard]] std::optional&& error() noexcept override { return reader.error(); } #endif }; #undef TOML_ERROR_CHECK #undef TOML_ERROR }