Fix character skipping after a surrogate pair

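In a string, the first character following a surrogate pair is skipped by the lexer, but the rest of the string is parsed as usual. The fix advances the read position by 10 characters (xxxx\uyyyy) instead of 11 after a surrogate pair has been decoded.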
2024-11-22 20:10:06 +00:00 · 2015-11-13 12:49:26 +01:00 · 2015-11-13 12:49:26 +01:00 · ec7a1d8347
commit ec7a1d8347
parent 3948630374
3 changed files with 9 additions and 4 deletions
--- a/src/json.hpp
+++ b/src/json.hpp
@@ -6856,8 +6856,8 @@ basic_json_parser_59:
                                auto codepoint2 = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>
                                                               (i + 7), 4).c_str(), nullptr, 16);
                                result += to_unicode(codepoint, codepoint2);
-                                // skip the next 11 characters (xxxx\uyyyy)
-                                i += 11;
+                                // skip the next 10 characters (xxxx\uyyyy)
+                                i += 10;
                            }
                            else
                            {
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
@@ -6162,8 +6162,8 @@ class basic_json
                                auto codepoint2 = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>
                                                               (i + 7), 4).c_str(), nullptr, 16);
                                result += to_unicode(codepoint, codepoint2);
-                                // skip the next 11 characters (xxxx\uyyyy)
-                                i += 11;
+                                // skip the next 10 characters (xxxx\uyyyy)
+                                i += 10;
                            }
                            else
                            {
--- a/test/unit.cpp
+++ b/test/unit.cpp
@@ -10205,4 +10205,9 @@ TEST_CASE("regression tests")
        j["string"] = bytes;
        CHECK(j["string"] == "\u0007\u0007");
    }
+
+    SECTION("character following a surrogate pair is skipped")
+    {
+        CHECK(json::parse("\"\\ud80c\\udc60abc\"").get<json::string_t>() == u8"\U00013060abc");
+    }
 }