Fix bad-char handling in UTF-8 streaming character streams. Also add a regression test.

R=jochen@chromium.org BUG=chromium:651333, v8:4947 Review-Url: https://codereview.chromium.org/2391273002 Cr-Commit-Position: refs/heads/master@{#40004}
2016-10-05 10:18:36 -07:00 · 2016-10-05 10:18:36 -07:00 · 138127a608
commit 138127a608
parent 186e7db8dd
4 changed files with 85 additions and 18 deletions
--- a/src/parsing/scanner-character-streams.cc
+++ b/src/parsing/scanner-character-streams.cc
@ -286,6 +286,20 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
  uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
  DCHECK_EQ(cursor, buffer_end_);

+  // If the current chunk is the last (empty) chunk we'll have to process
+  // any left-over, partial characters.
+  if (chunk.length == 0) {
+    unibrow::uchar t =
+        unibrow::Utf8::ValueOfIncrementalFinish(&current_.pos.incomplete_char);
+    if (t != unibrow::Utf8::kBufferEmpty) {
+      DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
+      *cursor = static_cast<uc16>(t);
+      buffer_end_++;
+      current_.pos.chars++;
+    }
+    return;
+  }
+
  static const unibrow::uchar kUtf8Bom = 0xfeff;

  unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
@ -421,7 +435,7 @@ size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
    if (current_.chunk_no == chunks_.size()) {
      out_of_data = !FetchChunk();
    }
-    if (!out_of_data) FillBufferFromCurrentChunk();
+    FillBufferFromCurrentChunk();
  }

  DCHECK_EQ(current_.pos.chars - position, buffer_end_ - buffer_cursor_);
--- a/src/unicode.cc
+++ b/src/unicode.cc
@ -333,25 +333,54 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
      *buffer = 0;
      return kBadChar;
    }
-  } else {
-    // We're inside of a character, as described by buffer.
-    if (IsContinuationCharacter(next)) {
-      // How many bytes (excluding this one) do we still expect?
-      uint8_t count = (*buffer >> 24) - 1;
-      // Update the value.
-      uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
-      if (count) {
-        *buffer = count << 24 | value;
-        return kIncomplete;
-      } else {
-        *buffer = 0;
-        return value;
-      }
+  } else if (*buffer <= 0xff) {
+    // We have one unprocessed byte left (from the last else case in this if
+    // statement).
+    uchar previous = *buffer;
+    *buffer = 0;
+    uchar t = ValueOfIncremental(previous, buffer);
+    if (t == kIncomplete) {
+      // If we have an incomplete character, process both the previous and the
+      // next byte at once.
+      return ValueOfIncremental(next, buffer);
    } else {
-      // Within a character, but not a continuation character? Bad char.
-      *buffer = 0;
-      return kBadChar;
+      // Otherwise, process the previous byte and save the next byte for next
+      // time.
+      DCHECK_EQ(0, *buffer);
+      *buffer = next;
+      return t;
    }
+  } else if (IsContinuationCharacter(next)) {
+    // We're inside of a character, as described by buffer.
+
+    // How many bytes (excluding this one) do we still expect?
+    uint8_t count = (*buffer >> 24) - 1;
+    // Update the value.
+    uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
+    if (count) {
+      *buffer = count << 24 | value;
+      return kIncomplete;
+    } else {
+      *buffer = 0;
+      return value;
+    }
+  } else {
+    // Within a character, but not a continuation character? Then the
+    // previous char was a bad char. But we need to save the current
+    // one.
+    *buffer = next;
+    return kBadChar;
+  }
+}
+
+uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
+  DCHECK_NOT_NULL(buffer);
+  if (*buffer == 0) return kBufferEmpty;  // Nothing is buffered.
+  // Flush left-overs; a char still incomplete at the end maps to kBadChar.
+  uchar flushed = ValueOfIncremental(0, buffer);
+  if (flushed == kIncomplete) {
+    return kBadChar;
  }
+  return flushed;
 }

--- a/src/unicode.h
+++ b/src/unicode.h
@ -161,6 +161,7 @@ class Utf8 {
  typedef uint32_t Utf8IncrementalBuffer;
  static uchar ValueOfIncremental(byte next_byte,
                                  Utf8IncrementalBuffer* buffer);
+  static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer);

  // Excludes non-characters from the set of valid code points.
  static inline bool IsValidCharacter(uchar c);
--- a/test/cctest/parsing/test-scanner-streams.cc
+++ b/test/cctest/parsing/test-scanner-streams.cc
@ -423,3 +423,26 @@ TEST(CharacterStreams) {
  TestCharacterStreams(buffer, arraysize(buffer) - 1);
  TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
 }
+
+// Regression test for crbug.com/651333: reading invalid utf-8 from a stream.
+TEST(Regress651333) {
+  // "Anad" with the n encoded as the lone byte 0xf1 (Latin1 n-with-tilde).
+  const uint8_t bytes[] =
+      "A\xf1"
+      "ad";
+  // The incorrectly coded Latin1 char is expected to come out as kBadChar
+  // ( == Unicode replacement char == code point 65533).
+  const uint16_t unicode[] = {65, 65533, 97, 100};
+
+  // Try every sub-string 0..N of bytes, so that the error condition is hit
+  // both in and at chunk boundaries.
+  for (size_t len = 0; len < arraysize(bytes); ++len) {
+    ChunkSource chunks(bytes, len, false);
+    std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
+        &chunks, v8::ScriptCompiler::StreamedSource::UTF8));
+    for (size_t pos = 0; pos != len; ++pos) {
+      CHECK_EQ(unicode[pos], stream->Advance());
+    }
+    CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
+  }
+}