mirror of
https://github.com/ToruNiina/toml11.git
synced 2025-01-10 09:20:11 +00:00
fix: handle edge-cases with quotes in ml-string
See comments in the code for detail.
This commit is contained in:
parent
d495df93a6
commit
0582e1535b
@ -154,12 +154,53 @@ using lex_basic_string = sequence<lex_quotation_mark,
|
||||
repeat<lex_basic_char, unlimited>,
|
||||
lex_quotation_mark>;
|
||||
|
||||
// After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings
|
||||
// are allowed to be used.
|
||||
// After this, the following strings are *explicitly* allowed.
|
||||
// - One or two `"`s in a multi-line basic string are allowed anywhere.
|
||||
// - Three consecutive `"`s in a multi-line basic string are considered a delimiter.
|
||||
// - One or two `"`s can appear just before or after the delimiter.
|
||||
// ```toml
|
||||
// str4 = """Here are two quotation marks: "". Simple enough."""
|
||||
// str5 = """Here are three quotation marks: ""\"."""
|
||||
// str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
|
||||
// str7 = """"This," she said, "is just a pointless statement.""""
|
||||
// ```
|
||||
// In the current implementation (v3.3.0), it is difficult to parse `str7` in
|
||||
// the above example. It is difficult to recognize `"` at the end of string body
|
||||
// correctly. It will be misunderstood as a `"""` delimiter and an additional,
|
||||
// invalid `"`. Like this:
|
||||
// ```console
|
||||
// what(): [error] toml::parse_table: invalid line format
|
||||
// --> hoge.toml
|
||||
// |
|
||||
// 13 | str7 = """"This," she said, "is just a pointless statement.""""
|
||||
// | ^- expected newline, but got '"'.
|
||||
// ```
|
||||
// As a quick workaround for this problem, `lex_ml_basic_string_delim` was
|
||||
// split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
|
||||
// `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
|
||||
// In parse_ml_basic_string() function, the trailing `"`s will be attached to
|
||||
// the string body.
|
||||
//
|
||||
// Note: This feature is a "clarification". Therefore this change is considered
|
||||
// as a spec that has been defined since the time when the multi-line
|
||||
// basic string was introduced. Although it is a post-v0.5.0 change,
|
||||
// this change will be activated regardless of the flag,
|
||||
// `TOML11_USE_UNRELEASED_TOML_FEATURES`.
|
||||
//
|
||||
using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
|
||||
using lex_ml_basic_string_open = lex_ml_basic_string_delim;
|
||||
using lex_ml_basic_string_close = sequence<
|
||||
repeat<lex_quotation_mark, exactly<3>>,
|
||||
maybe<lex_quotation_mark>, maybe<lex_quotation_mark>
|
||||
>;
|
||||
|
||||
#ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
|
||||
// Unescaped character of a multi-line basic string (unreleased-TOML-features
// variant). The two control ranges deliberately leave a gap at 0x09 so that
// tab is not listed among the excluded characters.
using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 (tab) is skipped
                                              in_range<0x0a, 0x1F>,
                                              character<0x5C>,      // backslash
                                              character<0x7F>,      // DEL
                                              lex_ml_basic_string_delim>>;
|
||||
#else // TOML v0.5.0
|
||||
using lex_ml_basic_unescaped = exclude<either<in_range<0x00,0x1F>,
|
||||
@ -176,9 +217,9 @@ using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
|
||||
// Body of a multi-line basic string: an unlimited repetition of ordinary
// characters, newlines and escaped newlines.
using lex_ml_basic_body = repeat<
    either<lex_ml_basic_char, lex_newline, lex_ml_basic_escaped_newline>,
    unlimited>;
|
||||
// A complete multi-line basic string. The opening delimiter is exactly `"""`,
// while the closing one accepts 3 to 5 quotation marks so that one or two
// `"`s may appear right before the terminating `"""`.
using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
                                     lex_ml_basic_body,
                                     lex_ml_basic_string_close>;
|
||||
|
||||
// Character allowed inside a single-line literal string. Listed as excluded:
// the control characters 0x00-0x08 and 0x0A-0x1F (the gap at 0x09 keeps tab
// out of the excluded set) and the apostrophe (0x27) that ends the string.
// The second range used to read 0x10-0x19, which left LF, CR and 0x1A-0x1F
// out of the excluded set, so a single-line literal string could contain a
// newline.
using lex_literal_char = exclude<either<in_range<0x00, 0x08>,
                                        in_range<0x0A, 0x1F>, character<0x27>>>;
|
||||
@ -187,7 +228,13 @@ using lex_literal_string = sequence<lex_apostrophe,
|
||||
repeat<lex_literal_char, unlimited>,
|
||||
lex_apostrophe>;
|
||||
|
||||
// the same reason as above.
|
||||
using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
|
||||
using lex_ml_literal_string_open = lex_ml_literal_string_delim;
|
||||
using lex_ml_literal_string_close = sequence<
|
||||
repeat<lex_apostrophe, exactly<3>>,
|
||||
maybe<lex_apostrophe>, maybe<lex_apostrophe>
|
||||
>;
|
||||
|
||||
using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
|
||||
in_range<0x10, 0x1F>,
|
||||
@ -195,9 +242,9 @@ using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
|
||||
lex_ml_literal_string_delim>>;
|
||||
// Body of a multi-line literal string: an unlimited repetition of literal
// characters and newlines.
using lex_ml_literal_body = repeat<
    either<lex_ml_literal_char, lex_newline>,
    unlimited>;
|
||||
// A complete multi-line literal string. The opening delimiter is exactly
// `'''`, while the closing one accepts 3 to 5 apostrophes so that one or two
// `'`s may appear right before the terminating `'''`.
using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
                                       lex_ml_literal_body,
                                       lex_ml_literal_string_close>;
|
||||
|
||||
// Any string token. Each multi-line alternative is listed ahead of its
// single-line counterpart.
using lex_string = either<lex_ml_basic_string,   lex_basic_string,
                          lex_ml_literal_string, lex_literal_string>;
|
||||
|
@ -375,7 +375,7 @@ parse_ml_basic_string(location<Container>& loc)
|
||||
std::string retval;
|
||||
retval.reserve(token.unwrap().size());
|
||||
|
||||
auto delim = lex_ml_basic_string_delim::invoke(inner_loc);
|
||||
auto delim = lex_ml_basic_string_open::invoke(inner_loc);
|
||||
if(!delim)
|
||||
{
|
||||
throw internal_error(format_underline(
|
||||
@ -410,7 +410,26 @@ parse_ml_basic_string(location<Container>& loc)
|
||||
{{std::addressof(inner_loc), "not sufficient token"}}),
|
||||
source_location(std::addressof(inner_loc)));
|
||||
}
|
||||
delim = lex_ml_basic_string_delim::invoke(inner_loc);
|
||||
delim = lex_ml_basic_string_close::invoke(inner_loc);
|
||||
}
|
||||
// `lex_ml_basic_string_close` allows 3 to 5 `"`s to allow 1 or 2 `"`s
|
||||
// at just before the delimiter. Here, we need to attach `"`s at the
|
||||
// end of the string body, if it exists.
|
||||
// For detail, see the definition of `lex_ml_basic_string_close`.
|
||||
assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(),
|
||||
[](const char c) noexcept {return c == '\"';}));
|
||||
switch(delim.unwrap().size())
|
||||
{
|
||||
case 3: {break;}
|
||||
case 4: {retval += "\""; break;}
|
||||
case 5: {retval += "\"\""; break;}
|
||||
default:
|
||||
{
|
||||
throw internal_error(format_underline(
|
||||
"parse_ml_basic_string: closing delimiter has invalid length",
|
||||
{{std::addressof(inner_loc), "end of this"}}),
|
||||
source_location(std::addressof(inner_loc)));
|
||||
}
|
||||
}
|
||||
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
||||
}
|
||||
@ -485,7 +504,7 @@ parse_ml_literal_string(location<Container>& loc)
|
||||
{
|
||||
location<std::string> inner_loc(loc.name(), token.unwrap().str());
|
||||
|
||||
const auto open = lex_ml_literal_string_delim::invoke(inner_loc);
|
||||
const auto open = lex_ml_literal_string_open::invoke(inner_loc);
|
||||
if(!open)
|
||||
{
|
||||
throw internal_error(format_underline(
|
||||
@ -498,7 +517,7 @@ parse_ml_literal_string(location<Container>& loc)
|
||||
|
||||
const auto body = lex_ml_literal_body::invoke(inner_loc);
|
||||
|
||||
const auto close = lex_ml_literal_string_delim::invoke(inner_loc);
|
||||
const auto close = lex_ml_literal_string_close::invoke(inner_loc);
|
||||
if(!close)
|
||||
{
|
||||
throw internal_error(format_underline(
|
||||
@ -506,9 +525,29 @@ parse_ml_literal_string(location<Container>& loc)
|
||||
{{std::addressof(inner_loc), "should be '''"}}),
|
||||
source_location(std::addressof(inner_loc)));
|
||||
}
|
||||
return ok(std::make_pair(
|
||||
toml::string(body.unwrap().str(), toml::string_t::literal),
|
||||
token.unwrap()));
|
||||
// `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s
|
||||
// at just before the delimiter. Here, we need to attach `'`s at the
|
||||
// end of the string body, if it exists.
|
||||
// For detail, see the definition of `lex_ml_basic_string_close`.
|
||||
|
||||
std::string retval = body.unwrap().str();
|
||||
assert(std::all_of(close.unwrap().first(), close.unwrap().last(),
|
||||
[](const char c) noexcept {return c == '\'';}));
|
||||
switch(close.unwrap().size())
|
||||
{
|
||||
case 3: {break;}
|
||||
case 4: {retval += "'"; break;}
|
||||
case 5: {retval += "''"; break;}
|
||||
default:
|
||||
{
|
||||
throw internal_error(format_underline(
|
||||
"parse_ml_literal_string: closing delimiter has invalid length",
|
||||
{{std::addressof(inner_loc), "end of this"}}),
|
||||
source_location(std::addressof(inner_loc)));
|
||||
}
|
||||
}
|
||||
return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
|
||||
token.unwrap()));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user