fix: handle edge-cases with quotes in ml-string

See comments in the code for detail.
This commit is contained in:
ToruNiina 2020-02-04 22:33:30 +09:00
parent d495df93a6
commit 0582e1535b
2 changed files with 99 additions and 13 deletions

View File

@ -154,12 +154,53 @@ using lex_basic_string = sequence<lex_quotation_mark,
repeat<lex_basic_char, unlimited>,
lex_quotation_mark>;
// After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings
// are allowed to be used.
// After this, the following usages are *explicitly* allowed.
// - One or two `"`s are allowed anywhere in a multi-line basic string.
// - Three consecutive `"`s in a multi-line basic string are considered a delimiter.
// - One or two `"`s can appear just after the opening delimiter or just before
//   the closing delimiter.
// ```toml
// str4 = """Here are two quotation marks: "". Simple enough."""
// str5 = """Here are three quotation marks: ""\"."""
// str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
// str7 = """"This," she said, "is just a pointless statement.""""
// ```
// In the current implementation (v3.3.0), it is difficult to parse `str7` in
// the above example. It is difficult to recognize the `"` at the end of the
// string body correctly. It will be misunderstood as a `"""` delimiter followed
// by an additional, invalid `"`. Like this:
// ```console
// what(): [error] toml::parse_table: invalid line format
// --> hoge.toml
// |
// 13 | str7 = """"This," she said, "is just a pointless statement.""""
// | ^- expected newline, but got '"'.
// ```
// As a quick workaround for this problem, `lex_ml_basic_string_delim` was
// split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
// `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
// In the parse_ml_basic_string() function, the trailing `"`s will be attached
// to the string body.
//
// Note: This feature is a "clarification". Therefore this change is considered
// as a spec that has been defined since the time when the multi-line
// basic string was introduced. Although it is a post-v0.5.0 change,
// it is activated regardless of the flag
// `TOML11_USE_UNRELEASED_TOML_FEATURES`.
//
// Exactly three quotation marks: the delimiter of a multi-line basic string.
using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;

// The opening delimiter is exactly `"""`.
using lex_ml_basic_string_open = lex_ml_basic_string_delim;

// The closing delimiter is `"""` optionally followed by one or two more `"`s,
// i.e. 3 to 5 quotation marks in total.
using lex_ml_basic_string_close = sequence<lex_ml_basic_string_delim,
                                           maybe<lex_quotation_mark>,
                                           maybe<lex_quotation_mark>>;
#ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09
in_range<0x0a, 0x1F>, // is tab
character<0x5C>,
character<0x7F>,
character<0x5C>, // backslash
character<0x7F>, // DEL
lex_ml_basic_string_delim>>;
#else // TOML v0.5.0
using lex_ml_basic_unescaped = exclude<either<in_range<0x00,0x1F>,
@ -176,9 +217,9 @@ using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
// Body of a multi-line basic string: an unlimited repetition of ml-basic
// characters, newlines, and escaped newlines.
using lex_ml_basic_body = repeat<
    either<lex_ml_basic_char, lex_newline, lex_ml_basic_escaped_newline>,
    unlimited>;
using lex_ml_basic_string = sequence<lex_ml_basic_string_delim,
using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
lex_ml_basic_body,
lex_ml_basic_string_delim>;
lex_ml_basic_string_close>;
// A single character allowed inside a single-line literal string ('...').
// TOML defines literal-char as %x09 / %x20-26 / %x28-7E / non-ascii, so every
// control character except tab (0x00-0x08, 0x0A-0x1F and DEL 0x7F) and the
// apostrophe (0x27) must be excluded. Note that the second range starts at
// 0x0A, not 0x10: otherwise LF, CR and other control characters would be
// accepted in the middle of a single-line literal string.
using lex_literal_char = exclude<either<in_range<0x00, 0x08>,
                                        in_range<0x0A, 0x1F>,
                                        character<0x7F>,
                                        character<0x27>>>;
@ -187,7 +228,13 @@ using lex_literal_string = sequence<lex_apostrophe,
repeat<lex_literal_char, unlimited>,
lex_apostrophe>;
// the same reason as above.
// Exactly three apostrophes: the delimiter of a multi-line literal string.
using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;

// The opening delimiter is exactly `'''`.
using lex_ml_literal_string_open = lex_ml_literal_string_delim;

// The closing delimiter is `'''` optionally followed by one or two more `'`s,
// i.e. 3 to 5 apostrophes in total.
using lex_ml_literal_string_close = sequence<lex_ml_literal_string_delim,
                                             maybe<lex_apostrophe>,
                                             maybe<lex_apostrophe>>;
using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
in_range<0x10, 0x1F>,
@ -195,9 +242,9 @@ using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
lex_ml_literal_string_delim>>;
// Body of a multi-line literal string: an unlimited repetition of ml-literal
// characters and newlines.
using lex_ml_literal_body = repeat<
    either<lex_ml_literal_char, lex_newline>,
    unlimited>;
using lex_ml_literal_string = sequence<lex_ml_literal_string_delim,
using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
lex_ml_literal_body,
lex_ml_literal_string_delim>;
lex_ml_literal_string_close>;
// Any TOML string: one of the four string flavours.
// NOTE(review): each multi-line form is listed before its single-line
// counterpart; presumably `either` tries alternatives in order, so confirm
// before reordering.
using lex_string = either<
    lex_ml_basic_string,
    lex_basic_string,
    lex_ml_literal_string,
    lex_literal_string>;

View File

@ -375,7 +375,7 @@ parse_ml_basic_string(location<Container>& loc)
std::string retval;
retval.reserve(token.unwrap().size());
auto delim = lex_ml_basic_string_delim::invoke(inner_loc);
auto delim = lex_ml_basic_string_open::invoke(inner_loc);
if(!delim)
{
throw internal_error(format_underline(
@ -410,7 +410,26 @@ parse_ml_basic_string(location<Container>& loc)
{{std::addressof(inner_loc), "not sufficient token"}}),
source_location(std::addressof(inner_loc)));
}
delim = lex_ml_basic_string_delim::invoke(inner_loc);
delim = lex_ml_basic_string_close::invoke(inner_loc);
}
// `lex_ml_basic_string_close` allows 3 to 5 `"`s to allow 1 or 2 `"`s
// at just before the delimiter. Here, we need to attach `"`s at the
// end of the string body, if it exists.
// For detail, see the definition of `lex_ml_basic_string_close`.
assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(),
[](const char c) noexcept {return c == '\"';}));
switch(delim.unwrap().size())
{
case 3: {break;}
case 4: {retval += "\""; break;}
case 5: {retval += "\"\""; break;}
default:
{
throw internal_error(format_underline(
"parse_ml_basic_string: closing delimiter has invalid length",
{{std::addressof(inner_loc), "end of this"}}),
source_location(std::addressof(inner_loc)));
}
}
return ok(std::make_pair(toml::string(retval), token.unwrap()));
}
@ -485,7 +504,7 @@ parse_ml_literal_string(location<Container>& loc)
{
location<std::string> inner_loc(loc.name(), token.unwrap().str());
const auto open = lex_ml_literal_string_delim::invoke(inner_loc);
const auto open = lex_ml_literal_string_open::invoke(inner_loc);
if(!open)
{
throw internal_error(format_underline(
@ -498,7 +517,7 @@ parse_ml_literal_string(location<Container>& loc)
const auto body = lex_ml_literal_body::invoke(inner_loc);
const auto close = lex_ml_literal_string_delim::invoke(inner_loc);
const auto close = lex_ml_literal_string_close::invoke(inner_loc);
if(!close)
{
throw internal_error(format_underline(
@ -506,9 +525,29 @@ parse_ml_literal_string(location<Container>& loc)
{{std::addressof(inner_loc), "should be '''"}}),
source_location(std::addressof(inner_loc)));
}
return ok(std::make_pair(
toml::string(body.unwrap().str(), toml::string_t::literal),
token.unwrap()));
// `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s
// at just before the delimiter. Here, we need to attach `'`s at the
// end of the string body, if it exists.
// For detail, see the definition of `lex_ml_basic_string_close`.
std::string retval = body.unwrap().str();
assert(std::all_of(close.unwrap().first(), close.unwrap().last(),
[](const char c) noexcept {return c == '\'';}));
switch(close.unwrap().size())
{
case 3: {break;}
case 4: {retval += "'"; break;}
case 5: {retval += "''"; break;}
default:
{
throw internal_error(format_underline(
"parse_ml_literal_string: closing delimiter has invalid length",
{{std::addressof(inner_loc), "end of this"}}),
source_location(std::addressof(inner_loc)));
}
}
return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
token.unwrap()));
}
else
{