Handle a UTF-8 BOM at the beginning of a UTF-8 stream.

(This should make it possible to drop the BOM handling in the Blink bindings.) R=marja@chromium.org BUG=v8:4947 Review-Url: https://codereview.chromium.org/2354973002 Cr-Commit-Position: refs/heads/master@{#39579}
2016-09-21 01:39:48 -07:00 · 2016-09-21 01:39:48 -07:00 · a2b8b6e7db
commit a2b8b6e7db
parent a7455beba1
2 changed files with 68 additions and 1 deletions
--- a/src/parsing/scanner-character-streams.cc
+++ b/src/parsing/scanner-character-streams.cc
@ -286,6 +286,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
  uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
  DCHECK_EQ(cursor, buffer_end_);

+  static const unibrow::uchar kUtf8Bom = 0xfeff;
+
  unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
      current_.pos.incomplete_char;
  size_t it;
@ -294,7 +296,11 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
    unibrow::uchar t =
        unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
    if (t == unibrow::Utf8::kIncomplete) continue;
-    if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    if (V8_LIKELY(t < kUtf8Bom)) {
+      *(cursor++) = static_cast<uc16>(t);  // By far the most frequent case.
+    } else if (t == kUtf8Bom && current_.pos.bytes + it == 2) {
+      // BOM detected at beginning of the stream. Don't copy it.
+    } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
      *(cursor++) = static_cast<uc16>(t);
    } else {
      *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
--- a/test/cctest/parsing/test-scanner-streams.cc
+++ b/test/cctest/parsing/test-scanner-streams.cc
@ -109,6 +109,67 @@ TEST(Utf8StreamAsciiOnly) {
  } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
 }

+TEST(Utf8StreamBOM) {
+  // Build the input: a UTF-8 BOM (bytes EF BB BF) followed by unicode_utf8.
+  char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
+  strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));
+
+  const char* chunks[] = {data, "\0"};
+  ChunkSource chunk_source(chunks);
+  std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
+      v8::internal::ScannerStream::For(
+          &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
+
+  // The stream must yield unicode_ucs2 and then end; no BOM may come first.
+  for (size_t i = 0; unicode_ucs2[i]; i++) {
+    CHECK_EQ(unicode_ucs2[i], stream->Advance());
+  }
+  CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());
+
+  // Seeking must land on the same characters, i.e. the BOM is not counted.
+  stream->Seek(0);
+  CHECK_EQ(unicode_ucs2[0], stream->Advance());
+
+  stream->Seek(5);
+  CHECK_EQ(unicode_ucs2[5], stream->Advance());
+}
+
+TEST(Utf8SplitBOM) {
+  // First case: the 3-byte BOM (EF BB BF) is split 2+1 across two chunks.
+  char partial_bom[] = "\xef\xbb";
+  char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
+  strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));
+
+  {
+    const char* chunks[] = {partial_bom, data, "\0"};
+    ChunkSource chunk_source(chunks);
+    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
+        v8::internal::ScannerStream::For(
+            &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
+
+    // The decoded characters must match unicode_ucs2, with no BOM in front.
+    for (size_t i = 0; unicode_ucs2[i]; i++) {
+      CHECK_EQ(unicode_ucs2[i], stream->Advance());
+    }
+  }
+
+  // Second case: one BOM byte per chunk; the last one still leads |data|.
+  char bom_byte_1[] = "\xef";
+  char bom_byte_2[] = "\xbb";
+  {
+    const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
+    ChunkSource chunk_source(chunks);
+    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
+        v8::internal::ScannerStream::For(
+            &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
+
+    // Again the decoded characters must match unicode_ucs2, without a BOM.
+    for (size_t i = 0; unicode_ucs2[i]; i++) {
+      CHECK_EQ(unicode_ucs2[i], stream->Advance());
+    }
+  }
+}
+
 TEST(Utf8ChunkBoundaries) {
  // Test utf-8 parsing at chunk boundaries.