[scanner] Separate ascii-in-utf8 length computation from decoding the chars

This way we walk the input string twice, but we reduce the number of branches per ASCII char in the long-ASCII-sequence case from 2 per char to roughly 1 + 2 / sizeof(intptr_t). Let's land and see what the bots say.

Change-Id: I574971c7df896237f3382be634a9bedc920fc827
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1649356
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62046}
2019-06-07 09:30:38 +02:00 · 2019-06-07 09:30:38 +02:00 · a64ccef757
commit a64ccef757
parent faaf4a8ab3
1 changed files with 4 additions and 7 deletions
--- a/src/parsing/scanner-character-streams.cc
+++ b/src/parsing/scanner-character-streams.cc
@@ -607,13 +607,10 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
    size_t max_buffer = max_buffer_end - output_cursor;
    int max_length = static_cast<int>(Min(remaining, max_buffer));
    DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
-    const uint8_t* read_end = cursor + max_length;
-    for (; cursor < read_end; cursor++) {
-      uint8_t c = *cursor;
-      DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
-      if (c > unibrow::Utf8::kMaxOneByteChar) break;
-      *(output_cursor++) = c;
-    }
+    int ascii_length = NonAsciiStart(cursor, max_length);
+    CopyChars(output_cursor, cursor, ascii_length);
+    cursor += ascii_length;
+    output_cursor += ascii_length;
  }

  current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);