From 62e8d58d8d3a3020a5bf3ced5c9a96ec29e4781e Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sun, 16 Jun 2019 17:32:29 +0900 Subject: [PATCH 1/3] feat: guess possible format errors --- toml/parser.hpp | 130 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 113 insertions(+), 17 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index a345611..6271d22 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1410,41 +1410,132 @@ parse_inline_table(location& loc) } template -value_t guess_number_type(const location& l) +result guess_number_type(const location& l) { + // This function tries to find some (common) mistakes by checking characters + // that follows the last character of a value. But it is often difficult + // because some non-newline characters can appear after a value. E.g. + // spaces, tabs, commas (in an array or inline table), closing brackets + // (of an array or inline table), comment-sign (#). Since this function + // does not parse further, those characters are always allowed to be there. location loc = l; - if(lex_offset_date_time::invoke(loc)) {return value_t::OffsetDatetime;} + if(lex_offset_date_time::invoke(loc)) {return ok(value_t::OffsetDatetime);} loc.reset(l.iter()); - if(lex_local_date_time::invoke(loc)) {return value_t::LocalDatetime;} + if(lex_local_date_time::invoke(loc)) + { + // bad offset may appear after this. + if(loc.iter() != loc.end() && (*loc.iter() == '+' || *loc.iter() == '-' + || *loc.iter() == 'Z' || *loc.iter() == 'z')) + { + return err(format_underline("[error] bad offset: should be [+-]HH:MM or Z", + {{std::addressof(l), "[+-]HH:MM or Z"}}, + {"OK: +09:00, -05:30", "NG: +9:00, -5:30"})); + } + return ok(value_t::LocalDatetime); + } loc.reset(l.iter()); - if(lex_local_date::invoke(loc)) {return value_t::LocalDate;} + if(lex_local_date::invoke(loc)) + { + // bad time may appear after this. + // A space is allowed as a delimiter between local time. 
But there are + // both cases in which a space becomes valid or invalid. + // - invalid: 2019-06-16 7:00:00 + // - valid : 2019-06-16 07:00:00 + if(loc.iter() != loc.end()) + { + const auto c = *loc.iter(); + if(c == 'T' || c == 't' || ('0' <= c && c <= '9')) + { + return err(format_underline("[error] bad time: should be HH:MM:SS.subsec", + {{std::addressof(l), "HH:MM:SS.subsec"}}, + {"OK: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", + "NG: 1979-05-27T7:32:00, 1979-05-27 7:32"})); + } + if(c == ' ' && std::next(loc.iter()) != loc.end() && + ('0' <= *std::next(loc.iter()) && *std::next(loc.iter())<= '9')) + { + return err(format_underline("[error] bad time: should be HH:MM:SS.subsec", + {{std::addressof(l), "HH:MM:SS.subsec"}}, + {"OK: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", + "NG: 1979-05-27T7:32:00, 1979-05-27 7:32"})); + } + } + return ok(value_t::LocalDate); + } loc.reset(l.iter()); - if(lex_local_time::invoke(loc)) {return value_t::LocalTime;} + if(lex_local_time::invoke(loc)) {return ok(value_t::LocalTime);} loc.reset(l.iter()); - if(lex_float::invoke(loc)) {return value_t::Float;} + if(lex_float::invoke(loc)) + { + if(loc.iter() != loc.end() && *loc.iter() == '_') + { + return err(format_underline("[error] bad float: `_` should be surrounded by digits", + {{std::addressof(l), "here"}}, + {"OK: +1.0, -2e-2, 3.141_592_653_589, inf, nan", + "NG: _1.0, 1.0_, 1_.0, 1.0__0"})); + } + return ok(value_t::Float); + } loc.reset(l.iter()); - return value_t::Integer; + if(lex_integer::invoke(loc)) + { + if(loc.iter() != loc.end()) + { + const auto c = *loc.iter(); + if(c == '_') + { + return err(format_underline("[error] bad integer: `_` should be surrounded by digits", + {{std::addressof(l), "here"}}, + {"OK: -42, 1_000, 1_2_3_4_5, 0xCOFFEE, 0b0010, 0o755", + "NG: 1__000, 0123"})); + } + if('0' <= c && c <= '9') + { + return err(format_underline("[error] bad integer: leading zero", + {{std::addressof(l), "here"}}, + {"OK: -42, 1_000, 1_2_3_4_5, 0xCOFFEE, 
0b0010, 0o755", + "NG: 1__000, 0123"})); + } + if(c == ':' || c == '-') + { + return err(format_underline("[error] bad datetime: invalid format", + {{std::addressof(l), "here"}}, + {"OK: 1979-05-27T07:32:00-07:00, 1979-05-27 07:32:00.999999Z", + "NG: 1979-05-27T7:32:00-7:00, 1979-05-27 7:32-00:30"})); + } + if(c == '.' || c == 'e' || c == 'E') + { + return err(format_underline("[error] bad float: invalid format", + {{std::addressof(l), "here"}}, + {"OK: +1.0, -2e-2, 3.141_592_653_589, inf, nan", + "NG: _1.0, 1.0_, 1_.0, 1.0__0"})); + } + } + return ok(value_t::Integer); + } + return err(format_underline("[error] bad format: unknown value appeared", + {{std::addressof(l), "here"}})); } template -value_t guess_value_type(const location& loc) +result guess_value_type(const location& loc) { switch(*loc.iter()) { - case '"' : {return value_t::String; } - case '\'': {return value_t::String; } - case 't' : {return value_t::Boolean;} - case 'f' : {return value_t::Boolean;} - case '[' : {return value_t::Array; } - case '{' : {return value_t::Table; } - case 'i' : {return value_t::Float; } // inf. - case 'n' : {return value_t::Float; } // nan. + case '"' : {return ok(value_t::String); } + case '\'': {return ok(value_t::String); } + case 't' : {return ok(value_t::Boolean);} + case 'f' : {return ok(value_t::Boolean);} + case '[' : {return ok(value_t::Array); } + case '{' : {return ok(value_t::Table); } + case 'i' : {return ok(value_t::Float); } // inf. + case 'n' : {return ok(value_t::Float); } // nan. 
default : {return guess_number_type(loc);} } } @@ -1459,7 +1550,12 @@ result parse_value(location& loc) {{std::addressof(loc), ""}})); } - switch(guess_value_type(loc)) + const auto type = guess_value_type(loc); + if(!type) + { + return err(type.unwrap_err()); + } + switch(type.unwrap()) { case value_t::Boolean : {return parse_boolean(loc); } case value_t::Integer : {return parse_integer(loc); } From cf1c9371b69d8e311b7a2cd494a74da5ac1aa7a1 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sun, 16 Jun 2019 17:52:42 +0900 Subject: [PATCH 2/3] fix: correct example and positions in err msgs --- toml/parser.hpp | 52 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 6271d22..89e8049 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1430,7 +1430,7 @@ result guess_number_type(const location& l) || *loc.iter() == 'Z' || *loc.iter() == 'z')) { return err(format_underline("[error] bad offset: should be [+-]HH:MM or Z", - {{std::addressof(l), "[+-]HH:MM or Z"}}, + {{std::addressof(loc), "[+-]HH:MM or Z"}}, {"OK: +09:00, -05:30", "NG: +9:00, -5:30"})); } return ok(value_t::LocalDatetime); @@ -1447,18 +1447,26 @@ result guess_number_type(const location& l) if(loc.iter() != loc.end()) { const auto c = *loc.iter(); - if(c == 'T' || c == 't' || ('0' <= c && c <= '9')) + if(c == 'T' || c == 't') { return err(format_underline("[error] bad time: should be HH:MM:SS.subsec", - {{std::addressof(l), "HH:MM:SS.subsec"}}, + {{std::addressof(loc), "HH:MM:SS.subsec"}}, + {"OK: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", + "NG: 1979-05-27T7:32:00, 1979-05-27 7:32"})); + } + if('0' <= c && c <= '9') + { + return err(format_underline("[error] bad time: missing T", + {{std::addressof(loc), "T or space required here"}}, {"OK: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", "NG: 1979-05-27T7:32:00, 1979-05-27 7:32"})); } if(c == ' ' && std::next(loc.iter()) != loc.end() && ('0' <= 
*std::next(loc.iter()) && *std::next(loc.iter())<= '9')) { + loc.advance(); return err(format_underline("[error] bad time: should be HH:MM:SS.subsec", - {{std::addressof(l), "HH:MM:SS.subsec"}}, + {{std::addressof(loc), "HH:MM:SS.subsec"}}, {"OK: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", "NG: 1979-05-27T7:32:00, 1979-05-27 7:32"})); } @@ -1475,9 +1483,9 @@ result guess_number_type(const location& l) if(loc.iter() != loc.end() && *loc.iter() == '_') { return err(format_underline("[error] bad float: `_` should be surrounded by digits", - {{std::addressof(l), "here"}}, + {{std::addressof(loc), "here"}}, {"OK: +1.0, -2e-2, 3.141_592_653_589, inf, nan", - "NG: _1.0, 1.0_, 1_.0, 1.0__0"})); + "NG: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"})); } return ok(value_t::Float); } @@ -1491,36 +1499,52 @@ result guess_number_type(const location& l) if(c == '_') { return err(format_underline("[error] bad integer: `_` should be surrounded by digits", - {{std::addressof(l), "here"}}, - {"OK: -42, 1_000, 1_2_3_4_5, 0xCOFFEE, 0b0010, 0o755", + {{std::addressof(loc), "here"}}, + {"OK: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755", "NG: 1__000, 0123"})); } if('0' <= c && c <= '9') { + // leading zero. point '0' + loc.retrace(); return err(format_underline("[error] bad integer: leading zero", - {{std::addressof(l), "here"}}, - {"OK: -42, 1_000, 1_2_3_4_5, 0xCOFFEE, 0b0010, 0o755", + {{std::addressof(loc), "here"}}, + {"OK: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755", "NG: 1__000, 0123"})); } if(c == ':' || c == '-') { return err(format_underline("[error] bad datetime: invalid format", - {{std::addressof(l), "here"}}, + {{std::addressof(loc), "here"}}, {"OK: 1979-05-27T07:32:00-07:00, 1979-05-27 07:32:00.999999Z", "NG: 1979-05-27T7:32:00-7:00, 1979-05-27 7:32-00:30"})); } if(c == '.' 
|| c == 'e' || c == 'E') { return err(format_underline("[error] bad float: invalid format", - {{std::addressof(l), "here"}}, + {{std::addressof(loc), "here"}}, {"OK: +1.0, -2e-2, 3.141_592_653_589, inf, nan", - "NG: _1.0, 1.0_, 1_.0, 1.0__0"})); + "NG: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"})); } } return ok(value_t::Integer); } + if(loc.iter() != loc.end() && *loc.iter() == '.') + { + return err(format_underline("[error] bad float: invalid format", + {{std::addressof(loc), "integer part required before this"}}, + {"OK: +1.0, -2e-2, 3.141_592_653_589, inf, nan", + "NG: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"})); + } + if(loc.iter() != loc.end() && *loc.iter() == '_') + { + return err(format_underline("[error] bad number: `_` should be surrounded by digits", + {{std::addressof(loc), "`_` is not surrounded by digits"}}, + {"OK: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755", + "NG: 1__000, 0123"})); + } return err(format_underline("[error] bad format: unknown value appeared", - {{std::addressof(l), "here"}})); + {{std::addressof(loc), "here"}})); } template From 00d40140acb28c1740d9af4e441fda55f31f7492 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Mon, 17 Jun 2019 12:59:29 +0900 Subject: [PATCH 3/3] doc: add an example of error message to README --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 5072dc9..6a00be8 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,24 @@ terminate called after throwing an instance of 'toml::syntax_error' | ~~~~~~~ table defined twice ``` +When toml11 encounters a malformed value, it tries to detect what type it is. +Then it shows hints to fix the format. An error message while reading one of +the malformed files in [the language agnostic test suite](https://github.com/BurntSushi/toml-test) +is shown below. 
+ +```console + what(): [error] bad time: should be HH:MM:SS.subsec + --> ./datetime-malformed-no-secs.toml + 1 | no-secs = 1987-07-05T17:45Z + | ^------- HH:MM:SS.subsec + | +Hint: OK: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999 +Hint: NG: 1979-05-27T7:32:00, 1979-05-27 7:32 +``` + +You can find other examples in a job named `output_result` on +[CircleCI](https://circleci.com/gh/ToruNiina/toml11). + Since the error message generation is generally a difficult task, the current status is not ideal. If you encounter a weird error message, please let us know and contribute to improve the quality!