feat: add bare minimum utf8 seq validity check

This commit is contained in:
ToruNiina 2021-06-30 00:58:50 +09:00
parent 9745c0005f
commit 0aa3773860
2 changed files with 99 additions and 14 deletions

View File

@ -225,13 +225,6 @@ using lex_string = either<lex_ml_basic_string, lex_basic_string,
lex_ml_literal_string, lex_literal_string>;
// ===========================================================================
using lex_comment_start_symbol = character<'#'>;
using lex_non_eol = exclude<either<in_range<0x00, 0x08>, /*0x09 == tab is allowed*/
in_range<0x0A, 0x1F>, character<0x7F>>>;
using lex_comment = sequence<lex_comment_start_symbol,
repeat<lex_non_eol, unlimited>>;
using lex_dot_sep = sequence<maybe<lex_ws>, character<'.'>, maybe<lex_ws>>;
using lex_unquoted_key = repeat<either<lex_alpha, lex_digit,
@ -266,6 +259,35 @@ using lex_array_table = sequence<lex_array_table_open,
maybe<lex_ws>,
lex_array_table_close>;
using lex_utf8_1byte = in_range<0x00, 0x7F>;
using lex_utf8_2byte = sequence<
in_range<static_cast<char>(0xC2), static_cast<char>(0xDF)>,
in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>
>;
using lex_utf8_3byte = sequence<either<
sequence<character<static_cast<char>(0xE0)>, in_range<static_cast<char>(0xA0), static_cast<char>(0xBF)>>,
sequence<in_range <static_cast<char>(0xE1), static_cast<char>(0xEC)>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>,
sequence<character<static_cast<char>(0xED)>, in_range<static_cast<char>(0x80), static_cast<char>(0x9F)>>,
sequence<in_range <static_cast<char>(0xEE), static_cast<char>(0xEF)>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>
>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>;
using lex_utf8_4byte = sequence<either<
sequence<character<static_cast<char>(0xF0)>, in_range<static_cast<char>(0x90), static_cast<char>(0xBF)>>,
sequence<in_range <static_cast<char>(0xF1), static_cast<char>(0xF3)>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>,
sequence<character<static_cast<char>(0xF4)>, in_range<static_cast<char>(0x80), static_cast<char>(0x8F)>>
>, in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>,
in_range<static_cast<char>(0x80), static_cast<char>(0xBF)>>;
using lex_utf8_code = either<
lex_utf8_1byte,
lex_utf8_2byte,
lex_utf8_3byte,
lex_utf8_4byte
>;
// A comment is introduced by '#'.
using lex_comment_start_symbol = character<'#'>;
// Single-byte characters accepted inside a comment: tab (0x09) and 0x20..0x7E.
using lex_non_eol_ascii = either<character<0x09>, in_range<0x20, 0x7E>>;
// One character of a comment body: an accepted single-byte character or a
// 2-, 3- or 4-byte UTF-8 sequence.
using lex_comment_char = either<lex_non_eol_ascii,
                                lex_utf8_2byte,
                                lex_utf8_3byte,
                                lex_utf8_4byte>;
// '#' followed by any number (including zero) of comment characters.
using lex_comment = sequence<lex_comment_start_symbol,
                             repeat<lex_comment_char, unlimited>>;
} // detail
} // toml
#endif // TOML_LEXER_HPP

View File

@ -364,6 +364,17 @@ inline result<std::string, std::string> parse_escape_sequence(location& loc)
return err(msg);
}
// Scans `reg` with `lex_utf8_code` repeated as many times as it matches.
//
// Returns ok(none_t) when the scan succeeds and stops exactly at the end of
// the input. Otherwise returns err(offset), where offset is the distance from
// the beginning of the input to the position at which the scan stopped.
inline result<none_t, std::ptrdiff_t> check_utf8_validity(const std::string& reg)
{
    // "tmp" is passed as the first constructor argument of the scratch
    // location (presumably its source name; it is not used further here).
    location scanner("tmp", reg);
    const auto scanned = repeat<lex_utf8_code, unlimited>::invoke(scanner);

    if(scanned && scanner.iter() == scanner.end())
    {
        return ok(none_t{});
    }
    return err(std::distance(scanner.begin(), scanner.iter()));
}
inline result<std::pair<toml::string, region>, std::string>
parse_ml_basic_string(location& loc)
{
@ -432,7 +443,20 @@ parse_ml_basic_string(location& loc)
source_location(inner_loc));
}
}
return ok(std::make_pair(toml::string(retval), token.unwrap()));
// Validate the raw token text as UTF-8 before accepting the string.
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
{
return ok(std::make_pair(toml::string(retval), token.unwrap()));
}
else
{
// The error value is the offset within the token text at which the UTF-8
// scan stopped. Move inner_loc there so the diagnostic points at that byte
// (assumes `first` marks the start of the token -- defined outside this hunk).
inner_loc.reset(first);
inner_loc.advance(u8.as_err());
throw syntax_error(format_underline(
"parse_ml_basic_string: invalid utf8 sequence found",
{{source_location(inner_loc), "here"}}),
source_location(inner_loc));
}
}
else
{
@ -484,7 +508,20 @@ parse_basic_string(location& loc)
}
quot = lex_quotation_mark::invoke(inner_loc);
}
return ok(std::make_pair(toml::string(retval), token.unwrap()));
// Validate the raw token text as UTF-8 before accepting the string.
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
{
    return ok(std::make_pair(toml::string(retval), token.unwrap()));
}
else
{
    // The error value is the offset within the token text at which the UTF-8
    // scan stopped. Move inner_loc there so the diagnostic points at that byte
    // (assumes `first` marks the start of the token -- defined outside this hunk).
    inner_loc.reset(first);
    inner_loc.advance(u8.as_err());
    // The message names this function; it previously said
    // parse_ml_basic_string because of a copy-paste slip.
    throw syntax_error(format_underline(
        "parse_basic_string: invalid utf8 sequence found",
        {{source_location(inner_loc), "here"}}),
        source_location(inner_loc));
}
}
else
{
@ -545,8 +582,21 @@ parse_ml_literal_string(location& loc)
source_location(inner_loc));
}
}
return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
token.unwrap()));
// Validate the raw token text as UTF-8 before accepting the string.
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
{
    return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
                             token.unwrap()));
}
else
{
    // The error value is the offset within the token text at which the UTF-8
    // scan stopped. Move inner_loc there so the diagnostic points at that byte
    // (assumes `first` marks the start of the token -- defined outside this hunk).
    inner_loc.reset(first);
    inner_loc.advance(u8.as_err());
    // The message names this function; it previously said
    // parse_ml_basic_string because of a copy-paste slip.
    throw syntax_error(format_underline(
        "parse_ml_literal_string: invalid utf8 sequence found",
        {{source_location(inner_loc), "here"}}),
        source_location(inner_loc));
}
}
else
{
@ -584,9 +634,22 @@ parse_literal_string(location& loc)
{{source_location(inner_loc), "should be '"}}),
source_location(inner_loc));
}
return ok(std::make_pair(
toml::string(body.unwrap().str(), toml::string_t::literal),
token.unwrap()));
// Validate the raw token text as UTF-8 before accepting the string.
if(const auto u8 = check_utf8_validity(token.unwrap().str()))
{
    return ok(std::make_pair(
        toml::string(body.unwrap().str(), toml::string_t::literal),
        token.unwrap()));
}
else
{
    // The error value is the offset within the token text at which the UTF-8
    // scan stopped. Move inner_loc there so the diagnostic points at that byte
    // (assumes `first` marks the start of the token -- defined outside this hunk).
    inner_loc.reset(first);
    inner_loc.advance(u8.as_err());
    // The message names this function; it previously said
    // parse_ml_basic_string because of a copy-paste slip.
    throw syntax_error(format_underline(
        "parse_literal_string: invalid utf8 sequence found",
        {{source_location(inner_loc), "here"}}),
        source_location(inner_loc));
}
}
else
{