[scanner] Drop lonely byte support as it's unused by blink anyway.
The embedder should ultimately be responsible for handling this since they anyway give us a copy of the data. They can easily make sure that the chunks we get do not have lonely bytes.

Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng
Change-Id: Ie862107bbbdd00c4d904fbb457a206c2fd52e5d0
Reviewed-on: https://chromium-review.googlesource.com/1127044
Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
Reviewed-by: Marja Hölttä <marja@chromium.org>
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54262}
This commit is contained in:
parent
bfeb78a763
commit
c7ad1ddd44
@ -1480,6 +1480,10 @@ class V8_EXPORT ScriptCompiler {
|
||||
* more than two data chunks. The embedder can avoid this problem by always
|
||||
* returning at least 2 bytes of data.
|
||||
*
|
||||
* When streaming UTF-16 data, V8 does not handle characters split between
|
||||
* two data chunks. The embedder has to make sure that chunks have an even
|
||||
* length.
|
||||
*
|
||||
* If the embedder wants to cancel the streaming, they should make the next
|
||||
* GetMoreData call return 0. V8 will interpret it as end of data (and most
|
||||
* probably, parsing will fail). The streaming task will return as soon as
|
||||
|
@ -38,9 +38,8 @@ struct Range {
|
||||
const Char* end;
|
||||
|
||||
size_t length() { return static_cast<size_t>(end - start); }
|
||||
bool empty() const { return start == end; }
|
||||
bool unaligned_start() const {
|
||||
return reinterpret_cast<intptr_t>(start) % 2 == 1;
|
||||
return reinterpret_cast<intptr_t>(start) % sizeof(Char) == 1;
|
||||
}
|
||||
};
|
||||
|
||||
@ -95,89 +94,47 @@ class ChunkedStream {
|
||||
|
||||
Range<Char> GetDataAt(size_t pos) {
|
||||
Chunk chunk = FindChunk(pos);
|
||||
size_t buffer_end = chunk.length();
|
||||
size_t buffer_end = chunk.length;
|
||||
size_t buffer_pos = Min(buffer_end, pos - chunk.position);
|
||||
return {&chunk.data()[buffer_pos], &chunk.data()[buffer_end]};
|
||||
return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
|
||||
}
|
||||
|
||||
~ChunkedStream() {
|
||||
for (size_t i = 0; i < chunks_.size(); i++) {
|
||||
delete[] chunks_[i].raw_data;
|
||||
delete[] chunks_[i].data;
|
||||
}
|
||||
}
|
||||
|
||||
static const bool kCanAccessHeap = false;
|
||||
|
||||
private:
|
||||
// A single chunk of Chars. There may be lonely bytes at the start and end
|
||||
// in case sizeof(Char) > 1. They just need to be ignored since additional
|
||||
// chunks are added by FetchChunk that contain the full character.
|
||||
// TODO(verwaest): Make sure that those characters are added by blink instead
|
||||
// so we can get rid of this complexity here.
|
||||
struct Chunk {
|
||||
// A raw chunk of Chars possibly including a lonely start and/or a lonely
|
||||
// end byte.
|
||||
const uint8_t* const raw_data;
|
||||
// The logical position of data() (possibly skipping a lonely start byte).
|
||||
const Char* const data;
|
||||
// The logical position of data.
|
||||
const size_t position;
|
||||
// The length of the raw_data.
|
||||
const size_t raw_length : sizeof(size_t) * 8 - 1;
|
||||
// Tells us whether the first byte of raw_data is a lonely start byte and
|
||||
// should be skipped because it's combined with a lonely end byte from the
|
||||
// previous chunk.
|
||||
const bool lonely_start : 1;
|
||||
|
||||
size_t end_position() const { return position + length(); }
|
||||
|
||||
// The chunk includes a lonely end byte if the chunk is 2-byte but has an
|
||||
// uneven number of chars (possibly ignoring a lonely start byte that is
|
||||
// merged with the lonely end byte of the previous chunk).
|
||||
bool lonely_end() const {
|
||||
return (raw_length - lonely_start) % sizeof(Char) == 1;
|
||||
}
|
||||
|
||||
uint8_t lonely_end_byte() const {
|
||||
DCHECK(lonely_end());
|
||||
return raw_data[raw_length - 1];
|
||||
}
|
||||
|
||||
size_t length() const {
|
||||
return (raw_length - lonely_start) >> (sizeof(Char) - 1);
|
||||
}
|
||||
|
||||
bool has_chars() const { return raw_length - lonely_start > 0; }
|
||||
|
||||
const Char* data() const {
|
||||
return reinterpret_cast<const Char*>(raw_data + lonely_start);
|
||||
}
|
||||
const size_t length;
|
||||
size_t end_position() const { return position + length; }
|
||||
};
|
||||
|
||||
Chunk FindChunk(size_t position) {
|
||||
if (chunks_.empty()) FetchFirstChunk();
|
||||
if (chunks_.empty()) FetchChunk(size_t{0});
|
||||
|
||||
// Walk forwards while the position is in front of the current chunk.
|
||||
if (chunks_.back().position <= position) {
|
||||
while (position >= chunks_.back().end_position() &&
|
||||
chunks_.back().has_chars()) {
|
||||
FetchChunk();
|
||||
}
|
||||
// Return if the final chunk's starting position is before the position.
|
||||
if (chunks_.back().position <= position) return chunks_.back();
|
||||
// Otherwise walk backwards to find the intermediate chunk added to
|
||||
// support lonely bytes.
|
||||
// TODO(verwaest): Remove once we don't need to support lonely bytes here
|
||||
// anymore.
|
||||
// Walk forwards while the position is in front of the current chunk.
|
||||
while (position >= chunks_.back().end_position() &&
|
||||
chunks_.back().length > 0) {
|
||||
FetchChunk(chunks_.back().end_position());
|
||||
}
|
||||
|
||||
// Walk backwards.
|
||||
for (auto reverse_it = chunks_.rbegin() + 1; reverse_it != chunks_.rend();
|
||||
for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
|
||||
++reverse_it) {
|
||||
if (reverse_it->position <= position) return *reverse_it;
|
||||
}
|
||||
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void FetchFirstChunk() {
|
||||
void FetchChunk(size_t position) {
|
||||
const uint8_t* data = nullptr;
|
||||
size_t length;
|
||||
{
|
||||
@ -185,35 +142,10 @@ class ChunkedStream {
|
||||
RuntimeCallCounterId::kGetMoreDataCallback);
|
||||
length = source_->GetMoreData(&data);
|
||||
}
|
||||
chunks_.push_back({data, 0, length, false});
|
||||
}
|
||||
|
||||
void FetchChunk() {
|
||||
DCHECK(!chunks_.empty());
|
||||
|
||||
const uint8_t* data = nullptr;
|
||||
size_t length;
|
||||
{
|
||||
RuntimeCallTimerScope scope(stats_,
|
||||
RuntimeCallCounterId::kGetMoreDataCallback);
|
||||
length = source_->GetMoreData(&data);
|
||||
}
|
||||
|
||||
const Chunk& last_chunk = chunks_.back();
|
||||
bool lonely_start = last_chunk.lonely_end();
|
||||
DCHECK(last_chunk.has_chars());
|
||||
|
||||
size_t position = last_chunk.end_position();
|
||||
|
||||
if (lonely_start) {
|
||||
uint8_t* intermediate = NewArray<uint8_t>(2);
|
||||
intermediate[0] = last_chunk.lonely_end_byte();
|
||||
intermediate[1] = length == 0 ? 0 : data[0];
|
||||
chunks_.push_back({intermediate, position, 2, false});
|
||||
position += 1;
|
||||
}
|
||||
|
||||
chunks_.push_back({data, position, length, lonely_start});
|
||||
// Incoming data has to be aligned to Char size.
|
||||
DCHECK_EQ(0, length % sizeof(Char));
|
||||
chunks_.push_back(
|
||||
{reinterpret_cast<const Char*>(data), position, length / sizeof(Char)});
|
||||
}
|
||||
|
||||
std::vector<struct Chunk> chunks_;
|
||||
@ -240,7 +172,7 @@ class BufferedCharacterStream : public Utf16CharacterStream {
|
||||
buffer_cursor_ = buffer_start_;
|
||||
|
||||
Range<Char> range = byte_stream_.GetDataAt(position);
|
||||
if (range.empty()) {
|
||||
if (range.length() == 0) {
|
||||
buffer_end_ = buffer_start_;
|
||||
return false;
|
||||
}
|
||||
@ -261,10 +193,8 @@ class BufferedCharacterStream : public Utf16CharacterStream {
|
||||
ByteStream<Char> byte_stream_;
|
||||
};
|
||||
|
||||
// Provides a (partially) unbuffered utf-16 view on the bytes from the
|
||||
// underlying ByteStream. It is only partially unbuffered when running on MIPS
|
||||
// due to lonely start bytes making chunks unaligned. In that case, unaligned
|
||||
// chars in a chunk (due to lonely start) are locally buffered.
|
||||
// Provides an unbuffered utf-16 view on the bytes from the underlying
|
||||
// ByteStream.
|
||||
template <template <typename T> class ByteStream>
|
||||
class UnbufferedCharacterStream : public Utf16CharacterStream {
|
||||
public:
|
||||
@ -282,20 +212,9 @@ class UnbufferedCharacterStream : public Utf16CharacterStream {
|
||||
buffer_start_ = range.start;
|
||||
buffer_end_ = range.end;
|
||||
buffer_cursor_ = buffer_start_;
|
||||
if (range.empty()) return false;
|
||||
if (range.length() == 0) return false;
|
||||
|
||||
// TODO(verwaest): Make sure that this cannot happen by dealing with lonely
|
||||
// bytes on the blink side.
|
||||
#if V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64
|
||||
// Buffer anyway in case the chunk is unaligned due to a lonely start.
|
||||
if (range.unaligned_start()) {
|
||||
size_t length = Min(kBufferSize, range.length());
|
||||
i::CopyCharsUnsigned(buffer_, buffer_start_, length);
|
||||
buffer_start_ = &buffer_[0];
|
||||
buffer_cursor_ = buffer_start_;
|
||||
buffer_end_ = &buffer_[length];
|
||||
}
|
||||
#endif
|
||||
DCHECK(!range.unaligned_start());
|
||||
DCHECK_LE(buffer_start_, buffer_end_);
|
||||
return true;
|
||||
}
|
||||
@ -303,10 +222,6 @@ class UnbufferedCharacterStream : public Utf16CharacterStream {
|
||||
bool can_access_heap() override { return false; }
|
||||
|
||||
private:
|
||||
#if V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64
|
||||
static const size_t kBufferSize = 512;
|
||||
uc16 buffer_[kBufferSize];
|
||||
#endif
|
||||
ByteStream<uint16_t> byte_stream_;
|
||||
};
|
||||
|
||||
|
@ -28,12 +28,14 @@ class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
|
||||
chunks += strlen(chunks) + 1;
|
||||
} while (chunks_.back().len > 0);
|
||||
}
|
||||
ChunkSource(const uint8_t* data, size_t len, bool extra_chunky)
|
||||
ChunkSource(const uint8_t* data, size_t char_size, size_t len,
|
||||
bool extra_chunky)
|
||||
: current_(0) {
|
||||
// If extra_chunky, we'll use increasingly large chunk sizes.
|
||||
// If not, we'll have a single chunk of full length.
|
||||
size_t chunk_size = extra_chunky ? 1 : len;
|
||||
for (size_t i = 0; i < len; i += chunk_size, chunk_size++) {
|
||||
// If extra_chunky, we'll use increasingly large chunk sizes. If not, we'll
|
||||
// have a single chunk of full length. Make sure that chunks are always
|
||||
// aligned to char-size though.
|
||||
size_t chunk_size = extra_chunky ? char_size : len;
|
||||
for (size_t i = 0; i < len; i += chunk_size, chunk_size += char_size) {
|
||||
chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
|
||||
}
|
||||
chunks_.push_back({nullptr, 0});
|
||||
@ -371,7 +373,7 @@ void TestCharacterStreams(const char* one_byte_source, unsigned length,
|
||||
const uint8_t* data = one_byte_vector.begin();
|
||||
const uint8_t* data_end = one_byte_vector.end();
|
||||
|
||||
ChunkSource single_chunk(data, data_end - data, false);
|
||||
ChunkSource single_chunk(data, 1, data_end - data, false);
|
||||
std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
|
||||
i::ScannerStream::For(&single_chunk,
|
||||
v8::ScriptCompiler::StreamedSource::ONE_BYTE,
|
||||
@ -379,7 +381,7 @@ void TestCharacterStreams(const char* one_byte_source, unsigned length,
|
||||
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
|
||||
length, start, end);
|
||||
|
||||
ChunkSource many_chunks(data, data_end - data, true);
|
||||
ChunkSource many_chunks(data, 1, data_end - data, true);
|
||||
one_byte_streaming_stream.reset(i::ScannerStream::For(
|
||||
&many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE, nullptr));
|
||||
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
|
||||
@ -390,14 +392,14 @@ void TestCharacterStreams(const char* one_byte_source, unsigned length,
|
||||
{
|
||||
const uint8_t* data = one_byte_vector.begin();
|
||||
const uint8_t* data_end = one_byte_vector.end();
|
||||
ChunkSource chunks(data, data_end - data, false);
|
||||
ChunkSource chunks(data, 1, data_end - data, false);
|
||||
std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
|
||||
i::ScannerStream::For(&chunks, v8::ScriptCompiler::StreamedSource::UTF8,
|
||||
nullptr));
|
||||
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
|
||||
start, end);
|
||||
|
||||
ChunkSource many_chunks(data, data_end - data, true);
|
||||
ChunkSource many_chunks(data, 1, data_end - data, true);
|
||||
utf8_streaming_stream.reset(i::ScannerStream::For(
|
||||
&many_chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
|
||||
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
|
||||
@ -410,14 +412,14 @@ void TestCharacterStreams(const char* one_byte_source, unsigned length,
|
||||
reinterpret_cast<const uint8_t*>(two_byte_vector.begin());
|
||||
const uint8_t* data_end =
|
||||
reinterpret_cast<const uint8_t*>(two_byte_vector.end());
|
||||
ChunkSource chunks(data, data_end - data, false);
|
||||
ChunkSource chunks(data, 2, data_end - data, false);
|
||||
std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
|
||||
i::ScannerStream::For(
|
||||
&chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
|
||||
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
|
||||
length, start, end);
|
||||
|
||||
ChunkSource many_chunks(data, data_end - data, true);
|
||||
ChunkSource many_chunks(data, 2, data_end - data, true);
|
||||
two_byte_streaming_stream.reset(i::ScannerStream::For(
|
||||
&many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
|
||||
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
|
||||
@ -459,7 +461,7 @@ TEST(Regress651333) {
|
||||
// Read len bytes from bytes, and compare against the expected unicode
|
||||
// characters. Expect kBadChar ( == Unicode replacement char == code point
|
||||
// 65533) instead of the incorrectly coded Latin1 char.
|
||||
ChunkSource chunks(bytes, len, false);
|
||||
ChunkSource chunks(bytes, 1, len, false);
|
||||
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
|
||||
&chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
|
Loading…
Reference in New Issue
Block a user