mirror of
https://github.com/ToruNiina/toml11.git
synced 2024-11-08 13:50:06 +00:00
feat: add bare minimum utf8 seq validity check
This commit is contained in:
parent
9745c0005f
commit
0aa3773860
@ -225,13 +225,6 @@ using lex_string = either<lex_ml_basic_string, lex_basic_string,
|
||||
lex_ml_literal_string, lex_literal_string>;
|
||||
|
||||
// ===========================================================================
|
||||
|
||||
using lex_comment_start_symbol = character<'#'>;
|
||||
using lex_non_eol = exclude<either<in_range<0x00, 0x08>, /*0x09 == tab is allowed*/
|
||||
in_range<0x0A, 0x1F>, character<0x7F>>>;
|
||||
using lex_comment = sequence<lex_comment_start_symbol,
|
||||
repeat<lex_non_eol, unlimited>>;
|
||||
|
||||
// Separator of dotted keys: a '.' with an optional lex_ws match on each side.
using lex_dot_sep = sequence<
    maybe<lex_ws>,
    character<'.'>,
    maybe<lex_ws>
>;
|
||||
|
||||
using lex_unquoted_key = repeat<either<lex_alpha, lex_digit,
|
||||
@ -266,6 +259,35 @@ using lex_array_table = sequence<lex_array_table_open,
|
||||
maybe<lex_ws>,
|
||||
lex_array_table_close>;
|
||||
|
||||
using lex_utf8_1byte = in_range<0x00, 0x7F>;
|
||||
using lex_utf8_2byte = sequence<
|
||||
in_range<static_cast<char>(0xC2), static_cast<char>(0xDF)>,
|
||||
in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>
|
||||
>;
|
||||
using lex_utf8_3byte = sequence<either<
|
||||
sequence<character<static_cast<char>(0xE0)>, in_range<static_cast<char>(0xA0), static_cast<char>(0xBF)>>,
|
||||
sequence<in_range <static_cast<char>(0xE1), static_cast<char>(0xEC)>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>,
|
||||
sequence<character<static_cast<char>(0xED)>, in_range<static_cast<char>(0x80), static_cast<char>(0x9F)>>,
|
||||
sequence<in_range <static_cast<char>(0xEE), static_cast<char>(0xEF)>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>
|
||||
>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>;
|
||||
using lex_utf8_4byte = sequence<either<
|
||||
sequence<character<static_cast<char>(0xF0)>, in_range<static_cast<char>(0x90), static_cast<char>(0xBF)>>,
|
||||
sequence<in_range <static_cast<char>(0xF1), static_cast<char>(0xF3)>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>,
|
||||
sequence<character<static_cast<char>(0xF4)>, in_range<static_cast<char>(0x80), static_cast<char>(0x8F)>>
|
||||
>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>,
|
||||
in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>;
|
||||
using lex_utf8_code = either<
|
||||
lex_utf8_1byte,
|
||||
lex_utf8_2byte,
|
||||
lex_utf8_3byte,
|
||||
lex_utf8_4byte
|
||||
>;
|
||||
|
||||
using lex_comment_start_symbol = character<'#'>;
|
||||
using lex_non_eol_ascii = either<character<0x09>, in_range<0x20, 0x7E>>;
|
||||
using lex_comment = sequence<lex_comment_start_symbol, repeat<either<
|
||||
lex_non_eol_ascii, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte>, unlimited>>;
|
||||
|
||||
} // detail
|
||||
} // toml
|
||||
#endif // TOML_LEXER_HPP
|
||||
|
@ -364,6 +364,17 @@ inline result<std::string, std::string> parse_escape_sequence(location& loc)
|
||||
return err(msg);
|
||||
}
|
||||
|
||||
// Checks that `reg` consists solely of sequences matched by lex_utf8_code.
//
// A temporary location named "tmp" is built over `reg` and scanned with
// repeat<lex_utf8_code, unlimited>.
//
// Returns ok(none_t{}) when the scan succeeds and stops exactly at loc.end().
// Otherwise returns err(offset), where offset is the distance from
// loc.begin() to the position at which the scan stopped -- presumably the
// first byte that does not start a well-formed sequence.
inline result<none_t, std::ptrdiff_t> check_utf8_validity(const std::string& reg)
{
    using utf8_run = repeat<lex_utf8_code, unlimited>;

    location loc("tmp", reg);
    const auto scanned = utf8_run::invoke(loc);

    // Failure: the lexer itself reported an error, or it stopped early.
    if(!scanned || loc.iter() != loc.end())
    {
        return err(std::distance(loc.begin(), loc.iter()));
    }

    return ok(none_t{});
}
|
||||
|
||||
inline result<std::pair<toml::string, region>, std::string>
|
||||
parse_ml_basic_string(location& loc)
|
||||
{
|
||||
@ -432,7 +443,20 @@ parse_ml_basic_string(location& loc)
|
||||
source_location(inner_loc));
|
||||
}
|
||||
}
|
||||
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
||||
|
||||
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
|
||||
{
|
||||
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
||||
}
|
||||
else
|
||||
{
|
||||
inner_loc.reset(first);
|
||||
inner_loc.advance(u8.as_err());
|
||||
throw syntax_error(format_underline(
|
||||
"parse_ml_basic_string: invalid utf8 sequence found",
|
||||
{{source_location(inner_loc), "here"}}),
|
||||
source_location(inner_loc));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -484,7 +508,20 @@ parse_basic_string(location& loc)
|
||||
}
|
||||
quot = lex_quotation_mark::invoke(inner_loc);
|
||||
}
|
||||
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
||||
|
||||
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
|
||||
{
|
||||
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
||||
}
|
||||
else
|
||||
{
|
||||
inner_loc.reset(first);
|
||||
inner_loc.advance(u8.as_err());
|
||||
throw syntax_error(format_underline(
|
||||
"parse_ml_basic_string: invalid utf8 sequence found",
|
||||
{{source_location(inner_loc), "here"}}),
|
||||
source_location(inner_loc));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -545,8 +582,21 @@ parse_ml_literal_string(location& loc)
|
||||
source_location(inner_loc));
|
||||
}
|
||||
}
|
||||
return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
|
||||
token.unwrap()));
|
||||
|
||||
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
|
||||
{
|
||||
return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
|
||||
token.unwrap()));
|
||||
}
|
||||
else
|
||||
{
|
||||
inner_loc.reset(first);
|
||||
inner_loc.advance(u8.as_err());
|
||||
throw syntax_error(format_underline(
|
||||
"parse_ml_basic_string: invalid utf8 sequence found",
|
||||
{{source_location(inner_loc), "here"}}),
|
||||
source_location(inner_loc));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -584,9 +634,22 @@ parse_literal_string(location& loc)
|
||||
{{source_location(inner_loc), "should be '"}}),
|
||||
source_location(inner_loc));
|
||||
}
|
||||
return ok(std::make_pair(
|
||||
toml::string(body.unwrap().str(), toml::string_t::literal),
|
||||
token.unwrap()));
|
||||
|
||||
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
|
||||
{
|
||||
return ok(std::make_pair(
|
||||
toml::string(body.unwrap().str(), toml::string_t::literal),
|
||||
token.unwrap()));
|
||||
}
|
||||
else
|
||||
{
|
||||
inner_loc.reset(first);
|
||||
inner_loc.advance(u8.as_err());
|
||||
throw syntax_error(format_underline(
|
||||
"parse_ml_basic_string: invalid utf8 sequence found",
|
||||
{{source_location(inner_loc), "here"}}),
|
||||
source_location(inner_loc));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user