Reland: [skjson] Unescape strings

Reviewed-on: https://skia-review.googlesource.com/c/167240 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Florin Malita <fmalita@chromium.org> Change-Id: Icfa1b335f19423422f2fe6fb592b40f7b72b16eb TBR= Reviewed-on: https://skia-review.googlesource.com/c/168265 Reviewed-by: Florin Malita <fmalita@chromium.org>
2018-11-05 11:44:02 -05:00 · 2018-11-05 11:44:02 -05:00 · 97ea59ada7
commit 97ea59ada7
parent 30e0d7fb4f
2 changed files with 111 additions and 6 deletions
--- a/src/utils/SkJSON.cpp
+++ b/src/utils/SkJSON.cpp
@ -8,8 +8,10 @@
 #include "SkJSON.h"

 #include "SkMalloc.h"
+#include "SkParse.h"
 #include "SkStream.h"
 #include "SkString.h"
+#include "SkUTF.h"

 #include <cmath>
 #include <tuple>
@ -226,7 +228,7 @@ namespace {

 // bit 0 (0x01) - plain ASCII string character
 // bit 1 (0x02) - whitespace
-// bit 2 (0x04) - string terminator (" \0 [control chars] **AND } ]** <- see matchString notes)
+// bit 2 (0x04) - string terminator (" \\ \0 [control chars] **AND } ]** <- see matchString notes)
 // bit 3 (0x08) - 0-9
 // bit 4 (0x10) - 0-9 e E .
 // bit 5 (0x20) - scope terminator (} ])
@ -237,7 +239,7 @@ static constexpr uint8_t g_token_flags[256] = {
    3,   1,   4,   1,   1,   1,   1,   1,     1,   1,   1,   1,   1,   1,   0x11,1, // 2
 0x19,0x19,0x19,0x19,0x19,0x19,0x19,0x19,  0x19,0x19,   1,   1,   1,   1,   1,   1, // 3
    1,   1,   1,   1,   1,   0x11,1,   1,     1,   1,   1,   1,   1,   1,   1,   1, // 4
-    1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   0,0x25,   1,   1, // 5
+    1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   4,0x25,   1,   1, // 5
    1,   1,   1,   1,   1,   0x11,1,   1,     1,   1,   1,   1,   1,   1,   1,   1, // 6
    1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   1,0x25,   1,   1, // 7

@ -286,6 +288,7 @@ public:
    // Binds the parser to |alloc| and pre-reserves capacity for the value stack and the
    // string unescape buffer.
    explicit DOMParser(SkArenaAlloc& alloc)
        : fAlloc(alloc) {
        fValueStack.reserve(kValueStackReserve);
+        fUnescapeBuffer.reserve(kUnescapeBufferReserve);
    }

    const Value parse(const char* p, size_t size) {
@ -460,6 +463,10 @@ private:
    static constexpr size_t kValueStackReserve = 256;
    std::vector<Value>    fValueStack;

+    // String unescape buffer.
+    static constexpr size_t kUnescapeBufferReserve = 512;
+    std::vector<char>     fUnescapeBuffer;
+
    // Tracks the current object/array scope, as an index into fStack:
    //
    //   - for objects: fScopeIndex =  (index of first value in scope)
@ -626,28 +633,98 @@ private:
        return this->error(nullptr, p, "invalid token");
    }

+    // Decodes the JSON escape sequences found in [begin, end) into fUnescapeBuffer.
+    //
+    // Recognized escapes: \" \\ \/ \b \f \n \r \t and \uXXXX (exactly four hex digits,
+    // appended as UTF-8).  Each \uXXXX escape is converted on its own -- consecutive
+    // escapes are not combined.
+    //
+    // Returns a pointer to fUnescapeBuffer on success, or nullptr when the input ends in
+    // the middle of an escape or contains an unrecognized/malformed escape.  The buffer is
+    // cleared at the start of every call, so the result is only valid until the next call.
+    const std::vector<char>* unescapeString(const char* begin, const char* end) {
+        // Maps a single hex digit to its numeric value, or -1 for any other character.
+        const auto hex_digit = [](char c) -> int {
+            if (c >= '0' && c <= '9') { return c - '0'; }
+            if (c >= 'a' && c <= 'f') { return c - 'a' + 10; }
+            if (c >= 'A' && c <= 'F') { return c - 'A' + 10; }
+            return -1;
+        };
+
+        fUnescapeBuffer.clear();
+
+        for (const auto* p = begin; p != end; ++p) {
+            if (*p != '\\') {
+                // Plain character: copy through unchanged.
+                fUnescapeBuffer.push_back(*p);
+                continue;
+            }
+
+            // A backslash must be followed by at least one more character.
+            if (++p == end) {
+                return nullptr;
+            }
+
+            switch (*p) {
+            case  '"': fUnescapeBuffer.push_back( '"'); break;
+            case '\\': fUnescapeBuffer.push_back('\\'); break;
+            case  '/': fUnescapeBuffer.push_back( '/'); break;
+            case  'b': fUnescapeBuffer.push_back('\b'); break;
+            case  'f': fUnescapeBuffer.push_back('\f'); break;
+            case  'n': fUnescapeBuffer.push_back('\n'); break;
+            case  'r': fUnescapeBuffer.push_back('\r'); break;
+            case  't': fUnescapeBuffer.push_back('\t'); break;
+            case  'u': {
+                // Four more characters are required after the 'u'.  Compare distances
+                // instead of forming p + 4, which could point past the end of the input.
+                if (end - p <= 4) {
+                    return nullptr;
+                }
+
+                // Check every digit explicitly so that anything other than exactly four
+                // hex digits (e.g. "\u 41a") is rejected rather than leniently parsed.
+                uint32_t hexed = 0;
+                for (int i = 1; i <= 4; ++i) {
+                    const auto digit = hex_digit(p[i]);
+                    if (digit < 0) {
+                        return nullptr;
+                    }
+                    hexed = (hexed << 4) | static_cast<uint32_t>(digit);
+                }
+
+                char utf8[SkUTF::kMaxBytesInUTF8Sequence];
+                const auto utf8_len = SkUTF::ToUTF8(SkTo<SkUnichar>(hexed), utf8);
+                fUnescapeBuffer.insert(fUnescapeBuffer.end(), utf8, utf8 + utf8_len);
+
+                // Skip the four digits; the loop increment then steps past the last one.
+                p += 4;
+            } break;
+            default: return nullptr;
+            }
+        }
+
+        return &fUnescapeBuffer;
+    }
+
    template <typename MatchFunc>
    const char* matchString(const char* p, const char* p_stop, MatchFunc&& func) {
        SkASSERT(*p == '"');
        const auto* s_begin = p + 1;
-
-        // TODO: unescape
+        bool requires_unescape = false;

        do {
            // Consume string chars.
+            // This is the fast path, and hopefully we only hit it once then quick-exit below.
            for (p = p + 1; !is_eostring(*p); ++p);

            if (*p == '"') {
                // Valid string found.
-                func(s_begin, p - s_begin, p_stop);
+                if (!requires_unescape) {
+                    func(s_begin, p - s_begin, p_stop);
+                } else {
+                    // Slow unescape.  We could avoid this extra copy with some effort,
+                    // but in practice escaped strings should be rare.
+                    const auto* buf = this->unescapeString(s_begin, p);
+                    if (!buf) {
+                        break;
+                    }
+
+                    SkASSERT(!buf->empty());
+                    func(buf->data(), buf->size(), buf->data() + buf->size() - 1);
+                }
                return p + 1;
            }

+            if (*p == '\\') {
+                requires_unescape = true;
+                ++p;
+                continue;
+            }
+
            // End-of-scope chars are special: we use them to tag the end of the input.
            // Thus they cannot be consumed indiscriminately -- we need to check if we hit the
            // end of the input.  To that effect, we treat them as string terminators above,
            // then we catch them here.
-        } while (is_eoscope(*p) && (p != p_stop)); // Safe scope terminator char, keep going.
+            if (is_eoscope(*p)) {
+                continue;
+            }
+
+            // Invalid/unexpected char.
+            break;
+        } while (p != p_stop);

        // Premature end-of-input, or illegal string char.
        return this->error(nullptr, s_begin - 1, "invalid string");
--- a/tests/JSONTest.cpp
+++ b/tests/JSONTest.cpp
@ -59,6 +59,14 @@ DEF_TEST(JSON_Parse, reporter) {
        { "{ \"k\" : null , }"        , nullptr },
        { "{ \"k\" : null \"k\" : 1 }", nullptr },

+        {R"zzz(["\)zzz"      , nullptr},
+        {R"zzz(["\])zzz"     , nullptr},
+        {R"zzz(["\"])zzz"    , nullptr},
+        {R"zzz(["\z"])zzz"   , nullptr},
+        {R"zzz(["\u"])zzz"   , nullptr},
+        {R"zzz(["\u0"])zzz"  , nullptr},
+        {R"zzz(["\u00"])zzz" , nullptr},
+        {R"zzz(["\u000"])zzz", nullptr},

        { "[]"                           , "[]" },
        { " \n\r\t [ \n\r\t ] \n\r\t "   , "[]" },
@ -100,6 +108,26 @@ DEF_TEST(JSON_Parse, reporter) {
           }",
          "{\"k1\":null,\"k2\":0,\"k3\":[true,"
              "{\"kk1\":\"foo\",\"kk2\":\"bar\",\"kk3\":1.28,\"kk4\":[42]},\"boo\",null]}" },
+
+        {R"zzz(["\""])zzz"    , "[\"\"\"]"},
+        {R"zzz(["\\"])zzz"    , "[\"\\\"]"},
+        {R"zzz(["\/"])zzz"    , "[\"/\"]" },
+        {R"zzz(["\b"])zzz"    , "[\"\b\"]"},
+        {R"zzz(["\f"])zzz"    , "[\"\f\"]"},
+        {R"zzz(["\n"])zzz"    , "[\"\n\"]"},
+        {R"zzz(["\r"])zzz"    , "[\"\r\"]"},
+        {R"zzz(["\t"])zzz"    , "[\"\t\"]"},
+        {R"zzz(["\u1234"])zzz", "[\"\u1234\"]"},
+
+        {R"zzz(["foo\"bar"])zzz"    , "[\"foo\"bar\"]"},
+        {R"zzz(["foo\\bar"])zzz"    , "[\"foo\\bar\"]"},
+        {R"zzz(["foo\/bar"])zzz"    , "[\"foo/bar\"]" },
+        {R"zzz(["foo\bbar"])zzz"    , "[\"foo\bbar\"]"},
+        {R"zzz(["foo\fbar"])zzz"    , "[\"foo\fbar\"]"},
+        {R"zzz(["foo\nbar"])zzz"    , "[\"foo\nbar\"]"},
+        {R"zzz(["foo\rbar"])zzz"    , "[\"foo\rbar\"]"},
+        {R"zzz(["foo\tbar"])zzz"    , "[\"foo\tbar\"]"},
+        {R"zzz(["foo\u1234bar"])zzz", "[\"foo\u1234bar\"]"},
    };

    for (const auto& tst : g_tests) {