[scanner] Separate ascii-in-utf8 length computation from decoding the chars

This way we walk the input string twice, but we reduce the number of branches
per ASCII char in the long-ASCII-sequence case from 2 per char to roughly
1 + 2 / sizeof(intptr_t). Let's land and see what the bots say.

Change-Id: I574971c7df896237f3382be634a9bedc920fc827
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1649356
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62046}
This commit is contained in:
Toon Verwaest 2019-06-07 09:30:38 +02:00 committed by Commit Bot
parent faaf4a8ab3
commit a64ccef757

View File

@@ -607,13 +607,10 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
size_t max_buffer = max_buffer_end - output_cursor;
int max_length = static_cast<int>(Min(remaining, max_buffer));
DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
const uint8_t* read_end = cursor + max_length;
for (; cursor < read_end; cursor++) {
uint8_t c = *cursor;
DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
if (c > unibrow::Utf8::kMaxOneByteChar) break;
*(output_cursor++) = c;
}
int ascii_length = NonAsciiStart(cursor, max_length);
CopyChars(output_cursor, cursor, ascii_length);
cursor += ascii_length;
output_cursor += ascii_length;
}
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);