feat: add bare minimum utf8 seq validity check

2024-11-08 13:50:06 +00:00 · 2021-06-30 00:58:50 +09:00 · 2021-06-30 00:58:50 +09:00 · 0aa3773860
commit 0aa3773860
parent 9745c0005f
2 changed files with 99 additions and 14 deletions
--- a/toml/lexer.hpp
+++ b/toml/lexer.hpp
@ -225,13 +225,6 @@ using lex_string = either<lex_ml_basic_string,   lex_basic_string,
                          lex_ml_literal_string, lex_literal_string>;

 // ===========================================================================
-
-using lex_comment_start_symbol = character<'#'>;
-using lex_non_eol = exclude<either<in_range<0x00, 0x08>, /*0x09 == tab is allowed*/
-                                   in_range<0x0A, 0x1F>, character<0x7F>>>;
-using lex_comment = sequence<lex_comment_start_symbol,
-                             repeat<lex_non_eol, unlimited>>;
-
 using lex_dot_sep = sequence<maybe<lex_ws>, character<'.'>, maybe<lex_ws>>;

 using lex_unquoted_key = repeat<either<lex_alpha, lex_digit,
@ -266,6 +259,35 @@ using lex_array_table       = sequence<lex_array_table_open,
                                       maybe<lex_ws>,
                                       lex_array_table_close>;

+// UTF-8 byte-sequence lexers. For each length, the lead byte and the first
+// continuation byte are range-restricted; remaining bytes are generic tails.
+template<int Low, int Up>
+using lex_utf8_byte_range = in_range<static_cast<char>(Low), static_cast<char>(Up)>;
+template<int Byte>
+using lex_utf8_byte = character<static_cast<char>(Byte)>;
+using lex_utf8_tail = lex_utf8_byte_range<0x80, 0xBF>; // generic continuation byte
+
+using lex_utf8_1byte = in_range<0x00, 0x7F>;
+using lex_utf8_2byte = sequence<lex_utf8_byte_range<0xC2, 0xDF>, lex_utf8_tail>;
+using lex_utf8_3byte = sequence<either<
+        sequence<lex_utf8_byte<0xE0>,             lex_utf8_byte_range<0xA0, 0xBF>>,
+        sequence<lex_utf8_byte_range<0xE1, 0xEC>, lex_utf8_tail>,
+        sequence<lex_utf8_byte<0xED>,             lex_utf8_byte_range<0x80, 0x9F>>,
+        sequence<lex_utf8_byte_range<0xEE, 0xEF>, lex_utf8_tail>
+    >, lex_utf8_tail>;
+using lex_utf8_4byte = sequence<either<
+        sequence<lex_utf8_byte<0xF0>,             lex_utf8_byte_range<0x90, 0xBF>>,
+        sequence<lex_utf8_byte_range<0xF1, 0xF3>, lex_utf8_tail>,
+        sequence<lex_utf8_byte<0xF4>,             lex_utf8_byte_range<0x80, 0x8F>>
+    >, lex_utf8_tail, lex_utf8_tail>;
+// Matches exactly one sequence of any of the four lengths above.
+using lex_utf8_code = either<lex_utf8_1byte, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte>;
+
+using lex_comment_start_symbol = character<'#'>;
+using lex_non_eol_ascii = either<character<0x09>, in_range<0x20, 0x7E>>; // tab (0x09) or 0x20..0x7E
+using lex_comment_body  = repeat<either<lex_non_eol_ascii, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte>, unlimited>;
+using lex_comment = sequence<lex_comment_start_symbol, lex_comment_body>; // '#' followed by any run of allowed characters
+
 } // detail
 } // toml
 #endif // TOML_LEXER_HPP
--- a/toml/parser.hpp
+++ b/toml/parser.hpp
@ -364,6 +364,17 @@ inline result<std::string, std::string> parse_escape_sequence(location& loc)
    return err(msg);
 }

+// Checks that `reg` consists solely of sequences accepted by lex_utf8_code.
+// Returns ok(none_t) when the whole string is consumed; otherwise err(offset),
+// the distance from the start of `reg` to the position where scanning stopped.
+inline result<none_t, std::ptrdiff_t> check_utf8_validity(const std::string& reg)
+{
+    location loc("tmp", reg); // scratch location; "tmp" is only a placeholder name
+    const auto scanned = repeat<lex_utf8_code, unlimited>::invoke(loc);
+    if(scanned && loc.iter() == loc.end()) {return ok(none_t{});}
+    return err(std::distance(loc.begin(), loc.iter()));
+}
+
 inline result<std::pair<toml::string, region>, std::string>
 parse_ml_basic_string(location& loc)
 {
@ -432,7 +443,20 @@ parse_ml_basic_string(location& loc)
                    source_location(inner_loc));
            }
        }
-        return ok(std::make_pair(toml::string(retval), token.unwrap()));
+
+        if(const auto u8 = check_utf8_validity(token.unwrap().str())) // validate the whole matched token
+        {
+            return ok(std::make_pair(toml::string(retval), token.unwrap()));
+        }
+        else
+        {
+            inner_loc.reset(first); // `first` is defined outside this hunk; presumably the token start (confirm)
+            inner_loc.advance(u8.as_err()); // move by the offset reported by check_utf8_validity
+            throw syntax_error(format_underline(
+                "parse_ml_basic_string: invalid utf8 sequence found",
+                {{source_location(inner_loc), "here"}}),
+                source_location(inner_loc));
+        }
    }
    else
    {
@ -484,7 +508,20 @@ parse_basic_string(location& loc)
            }
            quot = lex_quotation_mark::invoke(inner_loc);
        }
-        return ok(std::make_pair(toml::string(retval), token.unwrap()));
+
+        if(const auto u8 = check_utf8_validity(token.unwrap().str())) // validate the whole matched token
+        {
+            return ok(std::make_pair(toml::string(retval), token.unwrap()));
+        }
+        else
+        {
+            inner_loc.reset(first); // `first` is defined outside this hunk; presumably the token start (confirm)
+            inner_loc.advance(u8.as_err()); // move by the offset reported by check_utf8_validity
+            throw syntax_error(format_underline(
+                "parse_basic_string: invalid utf8 sequence found", // prefix names this function, not parse_ml_basic_string
+                {{source_location(inner_loc), "here"}}),
+                source_location(inner_loc));
+        }
    }
    else
    {
@ -545,8 +582,21 @@ parse_ml_literal_string(location& loc)
                    source_location(inner_loc));
            }
        }
-        return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
-                                 token.unwrap()));
+
+        if(const auto u8 = check_utf8_validity(token.unwrap().str())) // validate the whole matched token
+        {
+            return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
+                                     token.unwrap()));
+        }
+        else
+        {
+            inner_loc.reset(first); // `first` is defined outside this hunk; presumably the token start (confirm)
+            inner_loc.advance(u8.as_err()); // move by the offset reported by check_utf8_validity
+            throw syntax_error(format_underline(
+                "parse_ml_literal_string: invalid utf8 sequence found", // prefix names this function, not parse_ml_basic_string
+                {{source_location(inner_loc), "here"}}),
+                source_location(inner_loc));
+        }
    }
    else
    {
@ -584,9 +634,22 @@ parse_literal_string(location& loc)
                {{source_location(inner_loc), "should be '"}}),
                source_location(inner_loc));
        }
-        return ok(std::make_pair(
-                  toml::string(body.unwrap().str(), toml::string_t::literal),
-                  token.unwrap()));
+
+        if(const auto u8 = check_utf8_validity(token.unwrap().str())) // validate the whole matched token
+        {
+            return ok(std::make_pair(
+                      toml::string(body.unwrap().str(), toml::string_t::literal),
+                      token.unwrap()));
+        }
+        else
+        {
+            inner_loc.reset(first); // `first` is defined outside this hunk; presumably the token start (confirm)
+            inner_loc.advance(u8.as_err()); // move by the offset reported by check_utf8_validity
+            throw syntax_error(format_underline(
+                "parse_literal_string: invalid utf8 sequence found", // prefix names this function, not parse_ml_basic_string
+                {{source_location(inner_loc), "here"}}),
+                source_location(inner_loc));
+        }
    }
    else
    {