[stringrefs] Add generalized UTF-8 decoder / validator

Bug: v8:12868 A slight modification to the existing DFA-based UTF-8 decoder to allow decoding surrogates, for use in decoding WTF-8. We'll need to additionally constrain the decoder to disallow surrogate pairs. Change-Id: Ifddbf08d4eeeff8f270df52a68f01769ea790eec Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3652787 Commit-Queue: Andy Wingo <wingo@igalia.com> Reviewed-by: Marja Hölttä <marja@chromium.org> Reviewed-by: Jakob Kummerow <jkummerow@chromium.org> Cr-Commit-Position: refs/heads/main@{#80654}
2022-05-20 08:33:27 +02:00 · 2022-05-20 08:33:27 +02:00 · b48262d719
commit b48262d719
parent 0440123e30
6 changed files with 300 additions and 0 deletions
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -2449,6 +2449,7 @@ filegroup(
            "src/debug/debug-wasm-objects-inl.h",
            "src/runtime/runtime-test-wasm.cc",
            "src/runtime/runtime-wasm.cc",
+            "src/third_party/utf8-decoder/generalized-utf8-decoder.h",
            "src/trap-handler/handler-inside-posix.h",
            "src/trap-handler/handler-inside.cc",
            "src/trap-handler/handler-outside.cc",
--- a/BUILD.gn
+++ b/BUILD.gn
@ -3542,6 +3542,7 @@ v8_header_set("v8_internal_headers") {
      "src/compiler/wasm-loop-peeling.h",
      "src/debug/debug-wasm-objects-inl.h",
      "src/debug/debug-wasm-objects.h",
+      "src/third_party/utf8-decoder/generalized-utf8-decoder.h",
      "src/trap-handler/trap-handler-internal.h",
      "src/trap-handler/trap-handler.h",
      "src/wasm/assembler-buffer-cache.h",
--- a/src/third_party/utf8-decoder/README.v8
+++ b/src/third_party/utf8-decoder/README.v8
@ -16,3 +16,9 @@ Local modifications:
  bit mask for the incoming byte.
 - The caller must now zero out the code point buffer after successful or
  unsuccessful state transitions.
+- Specifically for generalized-utf8-decoder.h: we adapt the original
+  decoder to decode and validate "generalized UTF-8", a variant of UTF-8
+  used in WTF-8 that can encode surrogates.  See
+  https://simonsapin.github.io/wtf-8/#generalized-utf8.  There is one
+  fewer state and so the transition table is smaller by one in both
+  dimensions.
--- a/src/third_party/utf8-decoder/generalized-utf8-decoder.h
+++ b/src/third_party/utf8-decoder/generalized-utf8-decoder.h
@ -0,0 +1,105 @@
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ and the sibling file
+// utf8-decoder.h for details.
+//
+// This file decodes "generalized UTF-8", which is the same as UTF-8 except that
+// it allows surrogates: https://simonsapin.github.io/wtf-8/#generalized-utf8
+
+#include <stdint.h>
+
+#ifndef __GENERALIZED_UTF8_DFA_DECODER_H
+#define __GENERALIZED_UTF8_DFA_DECODER_H
+
+namespace GeneralizedUtf8DfaDecoder {
+
+enum State : uint8_t {  // Row offsets into states[] (11 columns per row).
+  kReject = 0,            // Invalid input; every transition stays here.
+  kAccept = 11,           // Not inside a multi-byte sequence.
+  kTwoByte = 22,          // One continuation byte (80-BF) still expected.
+  kThreeByte = 33,        // Two continuation bytes (80-BF) still expected.
+  kFourByte = 44,         // After F1-F3: three continuation bytes expected.
+  kFourByteLow = 55,      // After F4: next byte must be in 80-8F.
+  kThreeByteHigh = 66,    // After E0: next byte must be in A0-BF.
+  kFourByteMidHigh = 77,  // After F0: next byte must be in 90-BF.
+};
+
+static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
+  // This first table maps each input byte to a transition.
+  //
+  // The transition value takes a state to a new state, but it also determines
+  // the set of bits from the current byte that contribute to the decoded
+  // codepoint:
+  //
+  //   Transition | Bits from current byte that contribute to decoded codepoint
+  //   ------------------------------------------------------------------------
+  //    0, 1      | 0b01111111
+  //    2, 3      | 0b00111111
+  //    4, 5      | 0b00011111
+  //    6, 7      | 0b00001111
+  //    8, 9      | 0b00000111
+  //    10        | 0b00000011
+  //
+  // Given the WTF-8 encoding, we therefore have the following constraints:
+  //   1. The transition value for 1-byte encodings should have the value 0 or 1
+  //      so that we preserve all of the low 7 bits.
+  //   2. Continuation bytes (0x80 to 0xBF) are of the form 0b10xxxxxx, and
+  //      therefore should have transition value between 0 and 3.
+  //   3. Leading bytes for 2-byte encodings are of the form 0b110yyyyy, and
+  //      therefore the transition value can be between 2 and 5.
+  //   4. Leading bytes for 3-byte encodings (0b1110zzzz) need transition value
+  //      between 4 and 7.
+  //   5. Leading bytes for 4-byte encodings (0b11110uuu) need transition value
+  //      between 6 and 9.
+  //   6. We need more states to impose irregular constraints.  Sometimes we can
+  //      use the knowledge that e.g. some high significant bits of the xxxx in
+  //      0b1110xxxx are 0, then we can use a higher transition value.
+  //   7. Transitions to invalid states can use any transition value.
+  static constexpr uint8_t transitions[] = {
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 00-0F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 10-1F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 20-2F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 30-3F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 40-4F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 50-5F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 60-6F
+      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 70-7F
+      1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80-8F
+      2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 90-9F
+      3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // A0-AF
+      3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // B0-BF
+      8,  8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // C0-CF
+      4,  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // D0-DF
+      9,  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  // E0-EF
+      10, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // F0-FF
+  };

+  // This second table maps a state to a new state when adding a transition.
+  //  00-7F
+  //  |   80-8F
+  //  |   |   90-9F
+  //  |   |   |   A0-BF
+  //  |   |   |   |   C2-DF
+  //  |   |   |   |   |   E1-EF
+  //  |   |   |   |   |   |   F1-F3
+  //  |   |   |   |   |   |   |   F4
+  //  |   |   |   |   |   |   |   |   C0, C1, F5-FF
+  //  |   |   |   |   |   |   |   |   |  E0
+  //  |   |   |   |   |   |   |   |   |  |   F0
+  static constexpr uint8_t states[] = {
+      0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // REJECT = 0
+      11, 0,  0,  0,  22, 33, 44, 55, 0, 66, 77,  // ACCEPT = 11
+      0,  11, 11, 11, 0,  0,  0,  0,  0, 0,  0,   // 2-byte = 22
+      0,  22, 22, 22, 0,  0,  0,  0,  0, 0,  0,   // 3-byte = 33
+      0,  33, 33, 33, 0,  0,  0,  0,  0, 0,  0,   // 4-byte = 44
+      0,  33, 0,  0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte low = 55
+      0,  0,  0,  22, 0,  0,  0,  0,  0, 0,  0,   // 3-byte high = 66
+      0,  0,  33, 33, 0,  0,  0,  0,  0, 0,  0,   // 4-byte mid/high = 77
+  };

+  uint8_t type = transitions[byte];  // Column index into states[].
+  *state = static_cast<State>(states[*state + type]);  // Row offset + column.
+  *buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1)));
+}
+
+}  // namespace GeneralizedUtf8DfaDecoder
+
+#endif  // __GENERALIZED_UTF8_DFA_DECODER_H
--- a/test/cctest/BUILD.gn
+++ b/test/cctest/BUILD.gn
@ -416,6 +416,7 @@ v8_source_set("cctest_sources") {
      "wasm/test-wasm-serialization.cc",
      "wasm/test-wasm-shared-engine.cc",
      "wasm/test-wasm-stack.cc",
+      "wasm/test-wasm-strings.cc",
      "wasm/test-wasm-trap-position.cc",
      "wasm/wasm-atomics-utils.h",
      "wasm/wasm-run-utils.cc",
--- a/test/cctest/wasm/test-wasm-strings.cc
+++ b/test/cctest/wasm/test-wasm-strings.cc
@ -0,0 +1,186 @@
+// Copyright 2022 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/strings/unicode.h"
+#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
+#include "src/third_party/utf8-decoder/utf8-decoder.h"
+#include "test/cctest/cctest.h"
+
+namespace v8 {
+namespace internal {
+namespace wasm {
+namespace test_wasm_strings {
+
+struct Utf8Decoder {  // Test wrapper around the strict Utf8DfaDecoder.
+  Utf8DfaDecoder::State state = Utf8DfaDecoder::kAccept;
+  uint32_t codepoint = 0;
+  void Decode(uint8_t byte) {
+    DCHECK(!failure());  // Must not be fed more bytes once rejected.
+    Utf8DfaDecoder::Decode(byte, &state, &codepoint);
+  }
+  bool success() const { return state == Utf8DfaDecoder::kAccept; }
+  bool failure() const { return state == Utf8DfaDecoder::kReject; }
+  bool incomplete() const { return !success() && !failure(); }
+};
+
+struct GeneralizedUtf8Decoder {  // Test wrapper, GeneralizedUtf8DfaDecoder.
+  GeneralizedUtf8DfaDecoder::State state = GeneralizedUtf8DfaDecoder::kAccept;
+  uint32_t codepoint = 0;
+  void Decode(uint8_t byte) {
+    DCHECK(!failure());  // Must not be fed more bytes once rejected.
+    GeneralizedUtf8DfaDecoder::Decode(byte, &state, &codepoint);
+  }
+  bool success() const { return state == GeneralizedUtf8DfaDecoder::kAccept; }
+  bool failure() const { return state == GeneralizedUtf8DfaDecoder::kReject; }
+  bool incomplete() const { return !success() && !failure(); }
+};
+
+struct DecodingOracle {  // Feeds the same bytes to both decoders.
+  Utf8Decoder utf8;
+  GeneralizedUtf8Decoder generalized_utf8;

+  void Decode(uint8_t byte) {
+    utf8.Decode(byte);
+    generalized_utf8.Decode(byte);
+  }

+  void CheckSame() const {  // Same status; same code point when accepting.
+    CHECK_EQ(utf8.success(), generalized_utf8.success());
+    CHECK_EQ(utf8.failure(), generalized_utf8.failure());
+    if (utf8.success()) CHECK(utf8.codepoint == generalized_utf8.codepoint);
+  }

+  bool success() const {
+    CheckSame();  // Each status query first checks the decoders agree.
+    return utf8.success();
+  }
+  bool failure() const {
+    CheckSame();
+    return utf8.failure();
+  }
+  bool incomplete() const {
+    CheckSame();
+    return utf8.incomplete();
+  }
+};
+
+TEST(GeneralizedUTF8Decode) {
+  // Exhaustive check that the generalized UTF-8 decoder matches the strict
+  // UTF-8 decoder, except for surrogates.  Each production should end with
+  // the decoders accepting or rejecting the production.
+  for (uint32_t byte1 = 0; byte1 <= 0xFF; byte1++) {
+    DecodingOracle decoder1;
+    decoder1.Decode(byte1);

+    if (byte1 <= 0x7F) {
+      // First byte in [0x00, 0x7F]: one-byte.
+      CHECK(decoder1.success());
+    } else if (byte1 <= 0xC1) {
+      // First byte in [0x80, 0xC1]: invalid.
+      CHECK(decoder1.failure());
+    } else if (byte1 <= 0xDF) {
+      // First byte in [0xC2, 0xDF]: two-byte.
+      CHECK(decoder1.incomplete());
+      // Second byte completes the sequence.  Only [0x80, 0xBF] is valid.
+      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; byte2++) {
+        DecodingOracle decoder2 = decoder1;
+        decoder2.Decode(byte2);
+        if (0x80 <= byte2 && byte2 <= 0xBF) {
+          CHECK(decoder2.success());
+        } else {
+          CHECK(decoder2.failure());
+        }
+      }
+    } else if (byte1 <= 0xEF) {
+      // First byte in [0xE0, 0xEF]: three-byte sequence.
+      CHECK(decoder1.incomplete());
+      uint32_t min = byte1 == 0xE0 ? 0xA0 : 0x80;
+      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; byte2++) {
+        DecodingOracle decoder2 = decoder1;
+        decoder2.Decode(byte2);
+        if (min <= byte2 && byte2 <= 0xBF) {
+          // Second byte in [min, 0xBF]: continuation.
+          bool is_surrogate = byte1 == 0xED && byte2 >= 0xA0;
+          if (is_surrogate) {
+            // Here's where we expect the two decoders to differ: generalized
+            // UTF-8 will get a surrogate and strict UTF-8 errors.
+            CHECK(decoder2.utf8.failure());
+            CHECK(decoder2.generalized_utf8.incomplete());
+          } else {
+            CHECK(decoder2.incomplete());
+          }

+          // Third byte completes the sequence.  Only [0x80, 0xBF] is valid.
+          for (uint32_t byte3 = 0x00; byte3 <= 0xFF; byte3++) {
+            DecodingOracle decoder3 = decoder2;
+            if (is_surrogate) {
+              decoder3.generalized_utf8.Decode(byte3);
+              if (0x80 <= byte3 && byte3 <= 0xBF) {
+                CHECK(decoder3.generalized_utf8.success());
+                uint32_t codepoint = decoder3.generalized_utf8.codepoint;
+                CHECK(unibrow::Utf16::IsLeadSurrogate(codepoint) ||
+                      unibrow::Utf16::IsTrailSurrogate(codepoint));
+              } else {
+                CHECK(decoder3.generalized_utf8.failure());
+              }
+            } else {
+              decoder3.Decode(byte3);
+              if (0x80 <= byte3 && byte3 <= 0xBF) {
+                CHECK(decoder3.success());
+              } else {
+                CHECK(decoder3.failure());
+              }
+            }
+          }
+        } else {
+          // Second byte not in range: failure.
+          CHECK(decoder2.failure());
+        }
+      }
+    } else if (byte1 <= 0xF4) {
+      // First byte in [0xF0, 0xF4]: four-byte sequence.
+      CHECK(decoder1.incomplete());
+      uint32_t min = byte1 == 0xF0 ? 0x90 : 0x80;
+      uint32_t max = byte1 == 0xF4 ? 0x8F : 0xBF;
+      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; byte2++) {
+        DecodingOracle decoder2 = decoder1;
+        decoder2.Decode(byte2);
+        if (min <= byte2 && byte2 <= max) {
+          // Second byte in [min, max]: continuation.
+          CHECK(decoder2.incomplete());
+          for (uint32_t byte3 = 0x00; byte3 <= 0xFF; byte3++) {
+            DecodingOracle decoder3 = decoder2;
+            decoder3.Decode(byte3);
+            if (0x80 <= byte3 && byte3 <= 0xBF) {
+              // Third byte in [0x80, 0xBF]: continuation.
+              CHECK(decoder3.incomplete());
+              for (uint32_t byte4 = 0x00; byte4 <= 0xFF; byte4++) {
+                DecodingOracle decoder4 = decoder3;
+                decoder4.Decode(byte4);
+                // Fourth byte completes the sequence.
+                if (0x80 <= byte4 && byte4 <= 0xBF) {
+                  CHECK(decoder4.success());
+                } else {
+                  CHECK(decoder4.failure());
+                }
+              }
+            } else {
+              CHECK(decoder3.failure());
+            }
+          }
+        } else {
+          CHECK(decoder2.failure());
+        }
+      }
+    } else {
+      // First byte in [0xF5, 0xFF]: failure.
+      CHECK(decoder1.failure());
+    }
+  }
+}
+
+}  // namespace test_wasm_strings
+}  // namespace wasm
+}  // namespace internal
+}  // namespace v8