[stringrefs] Fix WTF-8 rejection of surrogate pairs

Quite embarassingly, the test that the WTF-8 decoder rejects surrogate pairs was broken: the trailing surrogate was invalid. (The range of the second byte for leading surrogates is [A0,AF], and for trailing is [B0,BF]). Of course the actual functionality was broken, because the code that detected surrogate pairs called IsSurrogatePair with swapped arguments. Bug: v8:12868 Change-Id: Icab5e2e4e200afb3d34f478ab4f98b739ada5645 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3723497 Reviewed-by: Jakob Kummerow <jkummerow@chromium.org> Commit-Queue: Andy Wingo <wingo@igalia.com> Cr-Commit-Position: refs/heads/main@{#81376}
2022-06-27 08:20:00 +02:00 · 2022-06-27 08:20:00 +02:00 · 0257b0a3d4
commit 0257b0a3d4
parent f07faef9c7
3 changed files with 85 additions and 77 deletions
--- a/src/strings/unicode-decoder.cc
+++ b/src/strings/unicode-decoder.cc
@ -14,16 +14,59 @@
 namespace v8 {
 namespace internal {

+namespace {
+template <class Decoder>
+struct DecoderTraits;
+
+template <>
+struct DecoderTraits<Utf8Decoder> {
+  static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
+    // The DfaDecoder will only ever decode Unicode scalar values, and all
+    // sequences of USVs are valid.
+    DCHECK(!unibrow::Utf16::IsLeadSurrogate(trail));
+    DCHECK(!unibrow::Utf16::IsTrailSurrogate(trail));
+    return false;
+  }
+  static const bool kAllowIncompleteSequences = true;
+  using DfaDecoder = Utf8DfaDecoder;
+};
+
+#if V8_ENABLE_WEBASSEMBLY
+template <>
+struct DecoderTraits<Wtf8Decoder> {
+  static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
+    return unibrow::Utf16::IsSurrogatePair(lead, trail);
+  }
+  static const bool kAllowIncompleteSequences = false;
+  using DfaDecoder = GeneralizedUtf8DfaDecoder;
+};
+
+template <>
+struct DecoderTraits<StrictUtf8Decoder> {
+  static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
+    // The DfaDecoder will only ever decode Unicode scalar values, and all
+    // sequences of USVs are valid.
+    DCHECK(!unibrow::Utf16::IsLeadSurrogate(trail));
+    DCHECK(!unibrow::Utf16::IsTrailSurrogate(trail));
+    return false;
+  }
+  static const bool kAllowIncompleteSequences = false;
+  using DfaDecoder = Utf8DfaDecoder;
+};
+#endif  // V8_ENABLE_WEBASSEMBLY
+}  // namespace
+
 template <class Decoder>
 Utf8DecoderBase<Decoder>::Utf8DecoderBase(
    const base::Vector<const uint8_t>& data)
    : encoding_(Encoding::kAscii),
      non_ascii_start_(NonAsciiStart(data.begin(), data.length())),
      utf16_length_(non_ascii_start_) {
+  using Traits = DecoderTraits<Decoder>;
  if (non_ascii_start_ == data.length()) return;

  bool is_one_byte = true;
-  auto state = Decoder::DfaDecoder::kAccept;
+  auto state = Traits::DfaDecoder::kAccept;
  uint32_t current = 0;
  uint32_t previous = 0;
  const uint8_t* cursor = data.begin() + non_ascii_start_;
@ -31,11 +74,11 @@ Utf8DecoderBase<Decoder>::Utf8DecoderBase(

  while (cursor < end) {
    auto previous_state = state;
-    Decoder::DfaDecoder::Decode(*cursor, &state, &current);
-    if (state < Decoder::DfaDecoder::kAccept) {
-      DCHECK_EQ(state, Decoder::DfaDecoder::kReject);
-      if (Decoder::kAllowIncompleteSequences) {
-        state = Decoder::DfaDecoder::kAccept;
+    Traits::DfaDecoder::Decode(*cursor, &state, &current);
+    if (state < Traits::DfaDecoder::kAccept) {
+      DCHECK_EQ(state, Traits::DfaDecoder::kReject);
+      if (Traits::kAllowIncompleteSequences) {
+        state = Traits::DfaDecoder::kAccept;
        static_assert(unibrow::Utf8::kBadChar > unibrow::Latin1::kMaxChar);
        is_one_byte = false;
        utf16_length_++;
@ -43,13 +86,13 @@ Utf8DecoderBase<Decoder>::Utf8DecoderBase(
        current = 0;
        // If we were trying to continue a multibyte sequence, try this byte
        // again.
-        if (previous_state != Decoder::DfaDecoder::kAccept) continue;
+        if (previous_state != Traits::DfaDecoder::kAccept) continue;
      } else {
        encoding_ = Encoding::kInvalid;
        return;
      }
-    } else if (state == Decoder::DfaDecoder::kAccept) {
-      if (Decoder::InvalidCodePointSequence(current, previous)) {
+    } else if (state == Traits::DfaDecoder::kAccept) {
+      if (Traits::IsInvalidSurrogatePair(previous, current)) {
        encoding_ = Encoding::kInvalid;
        return;
      }
@ -62,9 +105,9 @@ Utf8DecoderBase<Decoder>::Utf8DecoderBase(
    cursor++;
  }

-  if (state == Decoder::DfaDecoder::kAccept) {
+  if (state == Traits::DfaDecoder::kAccept) {
    encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
-  } else if (Decoder::kAllowIncompleteSequences) {
+  } else if (Traits::kAllowIncompleteSequences) {
    static_assert(unibrow::Utf8::kBadChar > unibrow::Latin1::kMaxChar);
    encoding_ = Encoding::kUtf16;
    utf16_length_++;
@ -77,28 +120,29 @@ template <class Decoder>
 template <typename Char>
 void Utf8DecoderBase<Decoder>::Decode(Char* out,
                                      const base::Vector<const uint8_t>& data) {
+  using Traits = DecoderTraits<Decoder>;
  DCHECK(!is_invalid());
  CopyChars(out, data.begin(), non_ascii_start_);

  out += non_ascii_start_;

-  auto state = Decoder::DfaDecoder::kAccept;
+  auto state = Traits::DfaDecoder::kAccept;
  uint32_t current = 0;
  const uint8_t* cursor = data.begin() + non_ascii_start_;
  const uint8_t* end = data.begin() + data.length();

  while (cursor < end) {
    auto previous_state = state;
-    Decoder::DfaDecoder::Decode(*cursor, &state, &current);
-    if (Decoder::kAllowIncompleteSequences &&
-        state < Decoder::DfaDecoder::kAccept) {
-      state = Decoder::DfaDecoder::kAccept;
+    Traits::DfaDecoder::Decode(*cursor, &state, &current);
+    if (Traits::kAllowIncompleteSequences &&
+        state < Traits::DfaDecoder::kAccept) {
+      state = Traits::DfaDecoder::kAccept;
      *(out++) = static_cast<Char>(unibrow::Utf8::kBadChar);
      current = 0;
      // If we were trying to continue a multibyte sequence, try this byte
      // again.
-      if (previous_state != Decoder::DfaDecoder::kAccept) continue;
-    } else if (state == Decoder::DfaDecoder::kAccept) {
+      if (previous_state != Traits::DfaDecoder::kAccept) continue;
+    } else if (state == Traits::DfaDecoder::kAccept) {
      if (sizeof(Char) == 1 ||
          current <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
        *(out++) = static_cast<Char>(current);
@ -111,42 +155,30 @@ void Utf8DecoderBase<Decoder>::Decode(Char* out,
    cursor++;
  }

-  if (Decoder::kAllowIncompleteSequences &&
-      state != Decoder::DfaDecoder::kAccept) {
+  if (Traits::kAllowIncompleteSequences &&
+      state != Traits::DfaDecoder::kAccept) {
    *out = static_cast<Char>(unibrow::Utf8::kBadChar);
  } else {
-    DCHECK_EQ(state, Decoder::DfaDecoder::kAccept);
+    DCHECK_EQ(state, Traits::DfaDecoder::kAccept);
  }
 }

-template V8_EXPORT_PRIVATE Utf8DecoderBase<Utf8Decoder>::Utf8DecoderBase(
-    const base::Vector<const uint8_t>& data);
+#define DEFINE_UNICODE_DECODER(Decoder)                                 \
+  template V8_EXPORT_PRIVATE Utf8DecoderBase<Decoder>::Utf8DecoderBase( \
+      const base::Vector<const uint8_t>& data);                         \
+  template V8_EXPORT_PRIVATE void Utf8DecoderBase<Decoder>::Decode(     \
+      uint8_t* out, const base::Vector<const uint8_t>& data);           \
+  template V8_EXPORT_PRIVATE void Utf8DecoderBase<Decoder>::Decode(     \
+      uint16_t* out, const base::Vector<const uint8_t>& data)

-template V8_EXPORT_PRIVATE void Utf8DecoderBase<Utf8Decoder>::Decode(
-    uint8_t* out, const base::Vector<const uint8_t>& data);
-
-template V8_EXPORT_PRIVATE void Utf8DecoderBase<Utf8Decoder>::Decode(
-    uint16_t* out, const base::Vector<const uint8_t>& data);
+DEFINE_UNICODE_DECODER(Utf8Decoder);

 #if V8_ENABLE_WEBASSEMBLY
-template Utf8DecoderBase<Wtf8Decoder>::Utf8DecoderBase(
-    const base::Vector<const uint8_t>& data);
-
-template void Utf8DecoderBase<Wtf8Decoder>::Decode(
-    uint8_t* out, const base::Vector<const uint8_t>& data);
-
-template void Utf8DecoderBase<Wtf8Decoder>::Decode(
-    uint16_t* out, const base::Vector<const uint8_t>& data);
-
-template Utf8DecoderBase<StrictUtf8Decoder>::Utf8DecoderBase(
-    const base::Vector<const uint8_t>& data);
-
-template void Utf8DecoderBase<StrictUtf8Decoder>::Decode(
-    uint8_t* out, const base::Vector<const uint8_t>& data);
-
-template void Utf8DecoderBase<StrictUtf8Decoder>::Decode(
-    uint16_t* out, const base::Vector<const uint8_t>& data);
+DEFINE_UNICODE_DECODER(Wtf8Decoder);
+DEFINE_UNICODE_DECODER(StrictUtf8Decoder);
 #endif  // V8_ENABLE_WEBASSEMBLY

+#undef DEFINE_UNICODE_DECODER
+
 }  // namespace internal
 }  // namespace v8
--- a/src/strings/unicode-decoder.h
+++ b/src/strings/unicode-decoder.h
@ -8,10 +8,6 @@
 #include "src/base/vector.h"
 #include "src/strings/unicode.h"

-#if V8_ENABLE_WEBASSEMBLY
-#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
-#endif
-
 namespace v8 {
 namespace internal {

@ -84,16 +80,6 @@ class Utf8DecoderBase {
 class V8_EXPORT_PRIVATE Utf8Decoder final
    : public Utf8DecoderBase<Utf8Decoder> {
 public:
-  static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
-    // The DfaDecoder will only ever decode Unicode scalar values, and all
-    // sequences of USVs are valid.
-    DCHECK(!unibrow::Utf16::IsLeadSurrogate(current));
-    DCHECK(!unibrow::Utf16::IsTrailSurrogate(current));
-    return false;
-  }
-  static const bool kAllowIncompleteSequences = true;
-  using DfaDecoder = Utf8DfaDecoder;
-
  explicit Utf8Decoder(const base::Vector<const uint8_t>& data)
      : Utf8DecoderBase(data) {}

@ -111,12 +97,6 @@ class V8_EXPORT_PRIVATE Utf8Decoder final
 // isolated surrogates.
 class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
 public:
-  static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
-    return unibrow::Utf16::IsSurrogatePair(current, previous);
-  }
-  static const bool kAllowIncompleteSequences = false;
-  using DfaDecoder = GeneralizedUtf8DfaDecoder;
-
  explicit Wtf8Decoder(const base::Vector<const uint8_t>& data)
      : Utf8DecoderBase(data) {}

@ -127,16 +107,6 @@ class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
 // with U+FFFD, we have a separate Encoding::kInvalid state.
 class StrictUtf8Decoder : public Utf8DecoderBase<StrictUtf8Decoder> {
 public:
-  static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
-    // The DfaDecoder will only ever decode Unicode scalar values, and all
-    // sequence of USVs are valid.
-    DCHECK(!unibrow::Utf16::IsLeadSurrogate(current));
-    DCHECK(!unibrow::Utf16::IsTrailSurrogate(current));
-    return false;
-  }
-  static const bool kAllowIncompleteSequences = false;
-  using DfaDecoder = Utf8DfaDecoder;
-
  explicit StrictUtf8Decoder(const base::Vector<const uint8_t>& data)
      : Utf8DecoderBase(data) {}

--- a/test/mjsunit/wasm/stringrefs-exec.js
+++ b/test/mjsunit/wasm/stringrefs-exec.js
@ -51,7 +51,12 @@ let interestingStrings = ['',
                          'two \ucccc byte',
                          'surrogate \ud800\udc000 pair',
                          'isolated \ud800 leading',
-                          'isolated \udc00 trailing'];
+                          'isolated \udc00 trailing',
+                          '\ud800 isolated leading at beginning',
+                          '\udc00 isolated trailing at beginning',
+                          'isolated leading at end \ud800',
+                          'isolated trailing at end \udc00',
+                          'swapped surrogate \udc00\ud800 pair'];

 function IsSurrogate(codepoint) {
  return 0xD800 <= codepoint && codepoint <= 0xDFFF
@ -87,7 +92,8 @@ function makeWtf8TestDataSegment() {
  for (let bytes of ['trailing high byte \xa9',
                     'interstitial high \xa9 byte',
                     'invalid \xc0 byte',
-                     'surrogate \xed\xa0\x80\xed\xd0\x80 pair']) {
+                     'invalid three-byte \xed\xd0\x80',
+                     'surrogate \xed\xa0\x80\xed\xb0\x80 pair']) {
    invalid[bytes] = { offset: data.length, length: bytes.length };
    for (let i = 0; i < bytes.length; i++) {
      data.push(bytes.charCodeAt(i));