fix: handle edge-cases with quotes in ml-string

See comments in the code for detail.
2025-01-10 09:20:11 +00:00 · 2020-02-04 22:33:30 +09:00 · 2020-02-04 22:33:30 +09:00 · 0582e1535b
commit 0582e1535b
parent d495df93a6
2 changed files with 99 additions and 13 deletions
--- a/toml/lexer.hpp
+++ b/toml/lexer.hpp
@ -154,12 +154,53 @@ using lex_basic_string = sequence<lex_quotation_mark,
                                  repeat<lex_basic_char, unlimited>,
                                  lex_quotation_mark>;

+// Since TOML post-v0.5.0, the spec explicitly clarifies how quotes may be
+// used in multi-line strings.
+// With this clarification, the following are *explicitly* allowed.
+// - One or two `"`s in a multi-line basic string are allowed anywhere.
+// - Three consecutive `"`s in a multi-line basic string are considered a delimiter.
+// - One or two `"`s can appear just before or after the delimiter.
+// ```toml
+// str4 = """Here are two quotation marks: "". Simple enough."""
+// str5 = """Here are three quotation marks: ""\"."""
+// str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
+// str7 = """"This," she said, "is just a pointless statement.""""
+// ```
+// In the current implementation (v3.3.0), it is difficult to parse `str7` in
+// the above example: the `"` at the end of the string body is hard to recognize
+// correctly. The trailing `""""` is misread as a `"""` delimiter followed by an
+// additional, invalid `"`, like this:
+// ```console
+//   what():  [error] toml::parse_table: invalid line format
+//  --> hoge.toml
+//     |
+//  13 | str7 = """"This," she said, "is just a pointless statement.""""
+//     |                                                               ^- expected newline, but got '"'.
+// ```
+// As a quick workaround for this problem, `lex_ml_basic_string_delim` was
+// split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
+// `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
+// In the parse_ml_basic_string() function, the extra `"`s (beyond the final
+// three) are appended to the string body.
+//
+// Note: This feature is a "clarification". Therefore this change is treated
+//       as part of the spec that has been in effect since the multi-line
+//       basic string was introduced. Although it is a post-v0.5.0 change,
+//       it is activated regardless of the flag
+//       `TOML11_USE_UNRELEASED_TOML_FEATURES`.
+//
 using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
+using lex_ml_basic_string_open  = lex_ml_basic_string_delim;
+using lex_ml_basic_string_close = sequence<
+        repeat<lex_quotation_mark, exactly<3>>,
+        maybe<lex_quotation_mark>, maybe<lex_quotation_mark>
+    >;
+
 #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
 using lex_ml_basic_unescaped    = exclude<either<in_range<0x00, 0x08>, // 0x09
                                                 in_range<0x0a, 0x1F>, // is tab
-                                                 character<0x5C>,
-                                                 character<0x7F>,
+                                                 character<0x5C>, // backslash
+                                                 character<0x7F>, // DEL
                                                 lex_ml_basic_string_delim>>;
 #else // TOML v0.5.0
 using lex_ml_basic_unescaped    = exclude<either<in_range<0x00,0x1F>,
@ -176,9 +217,9 @@ using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
 using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline,
                                        lex_ml_basic_escaped_newline>,
                                 unlimited>;
-using lex_ml_basic_string = sequence<lex_ml_basic_string_delim,
+using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
                                     lex_ml_basic_body,
-                                     lex_ml_basic_string_delim>;
+                                     lex_ml_basic_string_close>;

 using lex_literal_char = exclude<either<in_range<0x00, 0x08>,
                                        in_range<0x10, 0x19>, character<0x27>>>;
@ -187,7 +228,13 @@ using lex_literal_string = sequence<lex_apostrophe,
                                    repeat<lex_literal_char, unlimited>,
                                    lex_apostrophe>;

+// Split into open/close for the same reason as `lex_ml_basic_string_close` above.
 using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
+using lex_ml_literal_string_open  = lex_ml_literal_string_delim;
+using lex_ml_literal_string_close = sequence<
+        repeat<lex_apostrophe, exactly<3>>,
+        maybe<lex_apostrophe>, maybe<lex_apostrophe>
+    >;

 using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
                                           in_range<0x10, 0x1F>,
@ -195,9 +242,9 @@ using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
                                           lex_ml_literal_string_delim>>;
 using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>,
                                   unlimited>;
-using lex_ml_literal_string = sequence<lex_ml_literal_string_delim,
+using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
                                       lex_ml_literal_body,
-                                       lex_ml_literal_string_delim>;
+                                       lex_ml_literal_string_close>;

 using lex_string = either<lex_ml_basic_string,   lex_basic_string,
                          lex_ml_literal_string, lex_literal_string>;
--- a/toml/parser.hpp
+++ b/toml/parser.hpp
@ -375,7 +375,7 @@ parse_ml_basic_string(location<Container>& loc)
        std::string retval;
        retval.reserve(token.unwrap().size());

-        auto delim = lex_ml_basic_string_delim::invoke(inner_loc);
+        auto delim = lex_ml_basic_string_open::invoke(inner_loc);
        if(!delim)
        {
            throw internal_error(format_underline(
@ -410,7 +410,26 @@ parse_ml_basic_string(location<Container>& loc)
                    {{std::addressof(inner_loc), "not sufficient token"}}),
                    source_location(std::addressof(inner_loc)));
            }
-            delim = lex_ml_basic_string_delim::invoke(inner_loc);
+            delim = lex_ml_basic_string_close::invoke(inner_loc);
+        }
+        // `lex_ml_basic_string_close` accepts 3 to 5 `"`s so that 1 or 2 `"`s
+        // may appear just before the delimiter. Here, we need to append those
+        // `"`s to the end of the string body, if they exist.
+        // For details, see the definition of `lex_ml_basic_string_close`.
+        assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(),
+                           [](const char c) noexcept {return c == '\"';}));
+        switch(delim.unwrap().size())
+        {
+            case 3: {break;}
+            case 4: {retval += "\"";  break;}
+            case 5: {retval += "\"\""; break;}
+            default:
+            {
+                throw internal_error(format_underline(
+                    "parse_ml_basic_string: closing delimiter has invalid length",
+                    {{std::addressof(inner_loc), "end of this"}}),
+                    source_location(std::addressof(inner_loc)));
+            }
        }
        return ok(std::make_pair(toml::string(retval), token.unwrap()));
    }
@ -485,7 +504,7 @@ parse_ml_literal_string(location<Container>& loc)
    {
        location<std::string> inner_loc(loc.name(), token.unwrap().str());

-        const auto open = lex_ml_literal_string_delim::invoke(inner_loc);
+        const auto open = lex_ml_literal_string_open::invoke(inner_loc);
        if(!open)
        {
            throw internal_error(format_underline(
@ -498,7 +517,7 @@ parse_ml_literal_string(location<Container>& loc)

        const auto body = lex_ml_literal_body::invoke(inner_loc);

-        const auto close = lex_ml_literal_string_delim::invoke(inner_loc);
+        const auto close = lex_ml_literal_string_close::invoke(inner_loc);
        if(!close)
        {
            throw internal_error(format_underline(
@ -506,9 +525,29 @@ parse_ml_literal_string(location<Container>& loc)
                {{std::addressof(inner_loc), "should be '''"}}),
                source_location(std::addressof(inner_loc)));
        }
-        return ok(std::make_pair(
-                  toml::string(body.unwrap().str(), toml::string_t::literal),
-                  token.unwrap()));
+        // `lex_ml_literal_string_close` accepts 3 to 5 `'`s so that 1 or 2 `'`s
+        // may appear just before the delimiter. Here, we need to append those
+        // `'`s to the end of the string body, if they exist.
+        // For details, see the definition of `lex_ml_basic_string_close`.
+
+        std::string retval = body.unwrap().str();
+        assert(std::all_of(close.unwrap().first(), close.unwrap().last(),
+                           [](const char c) noexcept {return c == '\'';}));
+        switch(close.unwrap().size())
+        {
+            case 3: {break;}
+            case 4: {retval += "'";  break;}
+            case 5: {retval += "''"; break;}
+            default:
+            {
+                throw internal_error(format_underline(
+                    "parse_ml_literal_string: closing delimiter has invalid length",
+                    {{std::addressof(inner_loc), "end of this"}}),
+                    source_location(std::addressof(inner_loc)));
+            }
+        }
+        return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
+                                 token.unwrap()));
    }
    else
    {