Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if it is,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good at triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
// Copyright 2016 the V8 project authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file.
|
|
|
|
|
|
|
|
#include "src/factory.h" // for i::Factory::NewExternalStringFrom*Byte
|
2017-02-07 14:05:02 +00:00
|
|
|
#include "src/feedback-vector-inl.h" // for include "src/factory.h"
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if it is,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good at triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
#include "src/objects-inl.h"
|
|
|
|
#include "src/parsing/scanner-character-streams.h"
|
|
|
|
#include "src/parsing/scanner.h"
|
|
|
|
#include "test/cctest/cctest.h"
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// Implement ExternalSourceStream based on const char**.
|
|
|
|
// This will take each string as one chunk. The last chunk must be empty.
|
|
|
|
class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
|
|
|
|
public:
|
|
|
|
explicit ChunkSource(const char** chunks) : current_(0) {
|
|
|
|
do {
|
|
|
|
chunks_.push_back(
|
|
|
|
{reinterpret_cast<const uint8_t*>(*chunks), strlen(*chunks)});
|
|
|
|
chunks++;
|
|
|
|
} while (chunks_.back().len > 0);
|
|
|
|
}
|
2017-05-09 16:04:10 +00:00
|
|
|
explicit ChunkSource(const char* chunks) : current_(0) {
|
|
|
|
do {
|
|
|
|
chunks_.push_back(
|
|
|
|
{reinterpret_cast<const uint8_t*>(chunks), strlen(chunks)});
|
|
|
|
chunks += strlen(chunks) + 1;
|
|
|
|
} while (chunks_.back().len > 0);
|
|
|
|
}
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
ChunkSource(const uint8_t* data, size_t len, bool extra_chunky)
|
|
|
|
: current_(0) {
|
|
|
|
// If extra_chunky, we'll use increasingly large chunk sizes.
|
|
|
|
// If not, we'll have a single chunk of full length.
|
|
|
|
size_t chunk_size = extra_chunky ? 1 : len;
|
2016-11-03 12:31:51 +00:00
|
|
|
for (size_t i = 0; i < len; i += chunk_size, chunk_size++) {
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
|
|
|
|
}
|
|
|
|
chunks_.push_back({nullptr, 0});
|
|
|
|
}
|
2017-04-28 08:54:52 +00:00
|
|
|
ChunkSource(const uint8_t* data, size_t len, size_t chunk_size)
|
|
|
|
: current_(0) {
|
|
|
|
for (size_t i = 0; i < len; i += chunk_size) {
|
|
|
|
chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
|
|
|
|
}
|
|
|
|
chunks_.push_back({nullptr, 0});
|
|
|
|
}
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
~ChunkSource() {}
|
|
|
|
bool SetBookmark() override { return false; }
|
|
|
|
void ResetToBookmark() override {}
|
|
|
|
size_t GetMoreData(const uint8_t** src) override {
|
|
|
|
DCHECK_LT(current_, chunks_.size());
|
|
|
|
Chunk& next = chunks_[current_++];
|
|
|
|
uint8_t* chunk = new uint8_t[next.len];
|
|
|
|
i::MemMove(chunk, next.ptr, next.len);
|
|
|
|
*src = chunk;
|
|
|
|
return next.len;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
struct Chunk {
|
|
|
|
const uint8_t* ptr;
|
|
|
|
size_t len;
|
|
|
|
};
|
|
|
|
std::vector<Chunk> chunks_;
|
|
|
|
size_t current_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class TestExternalResource : public v8::String::ExternalStringResource {
|
|
|
|
public:
|
|
|
|
explicit TestExternalResource(uint16_t* data, int length)
|
|
|
|
: data_(data), length_(static_cast<size_t>(length)) {}
|
|
|
|
|
|
|
|
~TestExternalResource() {}
|
|
|
|
|
|
|
|
const uint16_t* data() const { return data_; }
|
|
|
|
size_t length() const { return length_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
uint16_t* data_;
|
|
|
|
size_t length_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class TestExternalOneByteResource
|
|
|
|
: public v8::String::ExternalOneByteStringResource {
|
|
|
|
public:
|
|
|
|
TestExternalOneByteResource(const char* data, size_t length)
|
|
|
|
: data_(data), length_(length) {}
|
|
|
|
|
|
|
|
const char* data() const { return data_; }
|
|
|
|
size_t length() const { return length_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
const char* data_;
|
|
|
|
size_t length_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Reference input that exercises every UTF-8 sequence length (1 to 4 bytes).
// The pieces are kept as separate literals so that a hex escape is never
// followed directly by a character that could extend it.
const char unicode_utf8[] =
    "abc"               // three ASCII characters (1 byte each)
    "\xc3\xa4"          // 2 bytes: a-umlaut, U+00E4 (228)
    "\xe2\xa8\xa0"      // 3 bytes: math symbol, U+2A20 (10784)
    "\xf0\x9f\x92\xa9"  // 4 bytes: U+1F4A9 (128169), which needs the
                        // UTF-16 surrogate pair 0xD83D 0xDCA9
    "def";              // three more ASCII characters

// The same text as UTF-16 code units, terminated by a 0 sentinel.
const uint16_t unicode_ucs2[] = {
    'a',    'b',    'c',  // ASCII
    0x00E4,               // 228
    0x2A20,               // 10784
    0xD83D, 0xDCA9,       // surrogate pair 55357, 56489
    'd',    'e',    'f',  // ASCII
    0};
|
|
|
|
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
// Smoke test: a chunked, ASCII-only UTF-8 stream can be read to the end.
TEST(Utf8StreamAsciiOnly) {
  using v8::internal::Utf16CharacterStream;

  const char* chunks[] = {"abc", "def", "ghi", ""};
  ChunkSource chunk_source(chunks);
  std::unique_ptr<Utf16CharacterStream> stream(
      v8::internal::ScannerStream::For(
          &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));

  // Drain the stream. The values read are not inspected; the test only
  // requires that reading up to the end-of-input marker does not die.
  while (stream->Advance() != Utf16CharacterStream::kEndOfInput) {
  }
}
|
|
|
|
|
2016-09-21 08:39:48 +00:00
|
|
|
// A leading UTF-8 byte order mark must not show up in the decoded stream, and
// seeking must address positions in the BOM-less character sequence.
TEST(Utf8StreamBOM) {
  using v8::internal::Utf16CharacterStream;

  // Input buffer: the 3-byte BOM followed by the reference string (the copy
  // includes the reference string's terminating NUL).
  char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
  strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));

  const char* chunks[] = {data, "\0"};
  ChunkSource chunk_source(chunks);
  std::unique_ptr<Utf16CharacterStream> stream(
      v8::internal::ScannerStream::For(
          &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));

  // The stream must produce exactly the reference code units, then the
  // end-of-input marker; the BOM itself must not appear.
  for (const uint16_t* expected = unicode_ucs2; *expected; ++expected) {
    CHECK_EQ(*expected, stream->Advance());
  }
  CHECK_EQ(Utf16CharacterStream::kEndOfInput, stream->Advance());

  // Seeking to the start and into the middle yields the matching code units.
  stream->Seek(0);
  CHECK_EQ(unicode_ucs2[0], stream->Advance());

  stream->Seek(5);
  CHECK_EQ(unicode_ucs2[5], stream->Advance());

  // Repeat the mid-stream seek from the very end of the input, so that the
  // stream has to seek 'backwards'.
  while (stream->Advance() != Utf16CharacterStream::kEndOfInput) {
    // Nothing to do; we are only moving to the end of the input.
  }
  stream->Seek(5);
  CHECK_EQ(unicode_ucs2[5], stream->Advance());
}
|
|
|
|
|
|
|
|
// The BOM must also be skipped when its three bytes arrive in different
// chunks.
TEST(Utf8SplitBOM) {
  // First two BOM bytes in one chunk; the third BOM byte is glued to the
  // front of the reference string in |data|.
  char partial_bom[] = "\xef\xbb";
  char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
  strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));

  // Streams |chunks| as UTF-8 and checks that the decoded code units match
  // the reference sequence, i.e. that the BOM was not tripped over.
  auto expect_reference_text = [](const char** chunks) {
    ChunkSource chunk_source(chunks);
    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
        v8::internal::ScannerStream::For(
            &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
    for (size_t i = 0; unicode_ucs2[i]; i++) {
      CHECK_EQ(unicode_ucs2[i], stream->Advance());
    }
  };

  // BOM split as 2 bytes + 1 byte.
  {
    const char* chunks[] = {partial_bom, data, "\0"};
    expect_reference_text(chunks);
  }

  // BOM split into single-byte chunks.
  char bom_byte_1[] = "\xef";
  char bom_byte_2[] = "\xbb";
  {
    const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
    expect_reference_text(chunks);
  }
}
|
|
|
|
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if it is,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good at triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
// Test utf-8 parsing at chunk boundaries.
TEST(Utf8ChunkBoundaries) {
  // Split the test string at each byte and pass it to the stream. This way,
  // we'll have a split at each possible boundary, including in the middle of
  // multi-byte sequences.
  size_t len = strlen(unicode_utf8);
  // Room for the string plus the three NUL bytes that delimit the chunks.
  char buffer[arraysize(unicode_utf8) + 3];
  for (size_t i = 1; i < len; i++) {
    // Copy source string into buffer, splitting it at i.
    // Resulting layout: bytes 0..i-1, NUL, bytes i..len-1, NUL, NUL.
    // Then add three chunks, 0..i-1, i..strlen-1, empty.
    strncpy(buffer, unicode_utf8, i);
    strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
    buffer[i] = '\0';
    buffer[len + 1] = '\0';
    buffer[len + 2] = '\0';
    const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};

    ChunkSource chunk_source(chunks);
    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
        v8::internal::ScannerStream::For(
            &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));

    // Wherever the split fell, the decoded output must be the reference
    // sequence. The index is named j so it does not shadow the split index i.
    for (size_t j = 0; unicode_ucs2[j]; j++) {
      CHECK_EQ(unicode_ucs2[j], stream->Advance());
    }
    CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
             stream->Advance());
  }
}
|
|
|
|
|
|
|
|
// For every interior byte of the test string, serve that byte as a chunk of
// its own (between a head chunk and a tail chunk) and check the decoding.
TEST(Utf8SingleByteChunks) {
  const size_t len = strlen(unicode_utf8);
  // Room for the string plus the four NUL bytes that delimit the chunks.
  char buffer[arraysize(unicode_utf8) + 4];
  for (size_t i = 1; i < len - 1; i++) {
    // Buffer layout for this iteration:
    //   bytes 0..i-1, NUL, byte i, NUL, bytes i+1..len-1, NUL, NUL
    const size_t single_pos = i + 1;  // where the lone byte is stored
    const size_t tail_pos = i + 3;    // where the remainder starts
    const size_t empty_pos = len + 3; // the final, empty chunk

    // Head chunk.
    strncpy(buffer, unicode_utf8, i);
    buffer[i] = '\0';

    // Single-byte chunk.
    buffer[single_pos] = unicode_utf8[i];
    buffer[single_pos + 1] = '\0';

    // Tail chunk, followed by the empty chunk that ends the input.
    strncpy(buffer + tail_pos, unicode_utf8 + single_pos, len - single_pos);
    buffer[len + 2] = '\0';
    buffer[empty_pos] = '\0';

    const char* chunks[] = {buffer, buffer + single_pos, buffer + tail_pos,
                            buffer + empty_pos};

    ChunkSource chunk_source(chunks);
    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
        v8::internal::ScannerStream::For(
            &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));

    // The chunking must not change the decoded output.
    for (const uint16_t* expected = unicode_ucs2; *expected; ++expected) {
      CHECK_EQ(*expected, stream->Advance());
    }
    CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
             stream->Advance());
  }
}
|
|
|
|
|
|
|
|
// CHECK_EQ variant that casts both operands to int before comparing, so that
// operands of different integer types can be passed to the same check.
#define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
|
|
|
|
|
|
|
|
void TestCharacterStream(const char* reference, i::Utf16CharacterStream* stream,
|
|
|
|
unsigned length, unsigned start, unsigned end) {
|
|
|
|
// Read streams one char at a time
|
|
|
|
unsigned i;
|
|
|
|
for (i = start; i < end; i++) {
|
|
|
|
CHECK_EQU(i, stream->pos());
|
|
|
|
CHECK_EQU(reference[i], stream->Advance());
|
|
|
|
}
|
|
|
|
CHECK_EQU(end, stream->pos());
|
2017-06-08 15:48:19 +00:00
|
|
|
CHECK_EQU(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
|
|
|
|
CHECK_EQU(end + 1, stream->pos());
|
|
|
|
stream->Back();
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
|
|
|
|
// Pushback, re-read, pushback again.
|
|
|
|
while (i > end / 4) {
|
|
|
|
int32_t c0 = reference[i - 1];
|
|
|
|
CHECK_EQU(i, stream->pos());
|
2016-09-20 09:43:41 +00:00
|
|
|
stream->Back();
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
i--;
|
|
|
|
CHECK_EQU(i, stream->pos());
|
|
|
|
int32_t c1 = stream->Advance();
|
|
|
|
i++;
|
|
|
|
CHECK_EQU(i, stream->pos());
|
|
|
|
CHECK_EQ(c0, c1);
|
2016-09-20 09:43:41 +00:00
|
|
|
stream->Back();
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
i--;
|
|
|
|
CHECK_EQU(i, stream->pos());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Seek + read streams one char at a time.
|
|
|
|
unsigned halfway = end / 2;
|
2016-09-20 09:43:41 +00:00
|
|
|
stream->Seek(stream->pos() + halfway - i);
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good at triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
for (i = halfway; i < end; i++) {
|
|
|
|
CHECK_EQU(i, stream->pos());
|
|
|
|
CHECK_EQU(reference[i], stream->Advance());
|
|
|
|
}
|
|
|
|
CHECK_EQU(i, stream->pos());
|
|
|
|
CHECK_LT(stream->Advance(), 0);
|
|
|
|
|
|
|
|
// Seek back, then seek beyond end of stream.
|
|
|
|
stream->Seek(start);
|
|
|
|
if (start < length) {
|
|
|
|
CHECK_EQU(stream->Advance(), reference[start]);
|
|
|
|
} else {
|
|
|
|
CHECK_LT(stream->Advance(), 0);
|
|
|
|
}
|
|
|
|
stream->Seek(length + 5);
|
|
|
|
CHECK_LT(stream->Advance(), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef CHECK_EQU
|
|
|
|
|
|
|
|
void TestCharacterStreams(const char* one_byte_source, unsigned length,
|
|
|
|
unsigned start = 0, unsigned end = 0) {
|
|
|
|
if (end == 0) end = length;
|
|
|
|
|
|
|
|
i::Isolate* isolate = CcTest::i_isolate();
|
|
|
|
i::Factory* factory = isolate->factory();
|
|
|
|
|
|
|
|
// 2-byte external string
|
|
|
|
std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
|
|
|
|
i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
|
|
|
|
static_cast<int>(length));
|
|
|
|
{
|
|
|
|
for (unsigned i = 0; i < length; i++) {
|
|
|
|
uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
|
|
|
|
}
|
|
|
|
TestExternalResource resource(uc16_buffer.get(), length);
|
|
|
|
i::Handle<i::String> uc16_string(
|
|
|
|
factory->NewExternalStringFromTwoByte(&resource).ToHandleChecked());
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
|
|
|
|
i::ScannerStream::For(uc16_string, start, end));
|
|
|
|
TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
// 1-byte external string
|
2017-03-17 17:52:50 +00:00
|
|
|
i::Vector<const uint8_t> one_byte_vector =
|
|
|
|
i::OneByteVector(one_byte_source, static_cast<int>(length));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
i::Handle<i::String> one_byte_string =
|
2017-03-17 17:52:50 +00:00
|
|
|
factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
{
|
|
|
|
TestExternalOneByteResource one_byte_resource(one_byte_source, length);
|
|
|
|
i::Handle<i::String> ext_one_byte_string(
|
|
|
|
factory->NewExternalStringFromOneByte(&one_byte_resource)
|
|
|
|
.ToHandleChecked());
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
|
|
|
|
i::ScannerStream::For(ext_one_byte_string, start, end));
|
|
|
|
TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start,
|
|
|
|
end);
|
|
|
|
}
|
|
|
|
|
|
|
|
// 1-byte generic i::String
|
|
|
|
{
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> string_stream(
|
|
|
|
i::ScannerStream::For(one_byte_string, start, end));
|
|
|
|
TestCharacterStream(one_byte_source, string_stream.get(), length, start,
|
|
|
|
end);
|
|
|
|
}
|
|
|
|
|
|
|
|
// 2-byte generic i::String
|
|
|
|
{
|
|
|
|
i::Handle<i::String> two_byte_string =
|
|
|
|
factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
|
|
|
|
i::ScannerStream::For(two_byte_string, start, end));
|
|
|
|
TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length,
|
|
|
|
start, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Streaming has no notion of start/end, so let's skip streaming tests for
|
|
|
|
// these cases.
|
|
|
|
if (start != 0 || end != length) return;
|
|
|
|
|
|
|
|
// 1-byte streaming stream, single + many chunks.
|
|
|
|
{
|
2017-03-17 17:52:50 +00:00
|
|
|
const uint8_t* data = one_byte_vector.begin();
|
|
|
|
const uint8_t* data_end = one_byte_vector.end();
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
|
|
|
|
ChunkSource single_chunk(data, data_end - data, false);
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
|
|
|
|
i::ScannerStream::For(&single_chunk,
|
2016-12-05 15:46:58 +00:00
|
|
|
v8::ScriptCompiler::StreamedSource::ONE_BYTE,
|
|
|
|
nullptr));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
|
|
|
|
length, start, end);
|
|
|
|
|
|
|
|
ChunkSource many_chunks(data, data_end - data, true);
|
|
|
|
one_byte_streaming_stream.reset(i::ScannerStream::For(
|
2016-12-05 15:46:58 +00:00
|
|
|
&many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE, nullptr));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
|
|
|
|
length, start, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
// UTF-8 streaming stream, single + many chunks.
|
|
|
|
{
|
2017-03-17 17:52:50 +00:00
|
|
|
const uint8_t* data = one_byte_vector.begin();
|
|
|
|
const uint8_t* data_end = one_byte_vector.end();
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
ChunkSource chunks(data, data_end - data, false);
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
|
2016-12-05 15:46:58 +00:00
|
|
|
i::ScannerStream::For(&chunks, v8::ScriptCompiler::StreamedSource::UTF8,
|
|
|
|
nullptr));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
|
|
|
|
start, end);
|
|
|
|
|
|
|
|
ChunkSource many_chunks(data, data_end - data, true);
|
|
|
|
utf8_streaming_stream.reset(i::ScannerStream::For(
|
2016-12-05 15:46:58 +00:00
|
|
|
&many_chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
|
|
|
|
start, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
// 2-byte streaming stream, single + many chunks.
|
|
|
|
{
|
|
|
|
const uint8_t* data =
|
|
|
|
reinterpret_cast<const uint8_t*>(two_byte_vector.begin());
|
|
|
|
const uint8_t* data_end =
|
|
|
|
reinterpret_cast<const uint8_t*>(two_byte_vector.end());
|
|
|
|
ChunkSource chunks(data, data_end - data, false);
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
|
2016-12-05 15:46:58 +00:00
|
|
|
i::ScannerStream::For(
|
|
|
|
&chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
|
|
|
|
length, start, end);
|
|
|
|
|
|
|
|
ChunkSource many_chunks(data, data_end - data, true);
|
|
|
|
two_byte_streaming_stream.reset(i::ScannerStream::For(
|
2016-12-05 15:46:58 +00:00
|
|
|
&many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
|
Rework scanner-character-streams.
- Smaller, more consistent streams API (Advance, Back, pos, Seek)
- Remove implementations from the header, in favor of creation functions.
Observe:
- Performance:
- All Utf16CharacterStream methods have an inlinable V8_LIKELY w/ a
body of only a few instructions. I expect most calls to end up there.
- There used to be performance problems w/ bookmarking, particularly
with copying too much data on SetBookmark w/ UTF-8 streaming streams.
All those copies are gone.
- The old streaming streams implementation used to copy data even for
2-byte input. It no longer does.
- The only remaining 'slow' method is the Seek(.) slow case for utf-8
streaming streams. I don't expect this to be called a lot; and even if,
I expect it to be offset by the gains in the (vastly more frequent)
calls to the other methods or the 'fast path'.
- If it still bothers us, there are several ways to speed it up.
- API & code cleanliness:
- I want to remove the 'old' API in a follow-up CL, which should mostly
delete code, or replace it 1:1.
- In a 2nd follow-up I want to delete much of the UTF-8 handling in Blink
for streaming streams.
- The "bookmark" is now always implemented (and mostly very fast), so we
should be able to use it for more things.
- Testing & correctness:
- The unit tests now cover all stream implementations,
and are pretty good and triggering all the edge cases.
- Vastly more DCHECKs of the invariants.
BUG=v8:4947
Review-Url: https://codereview.chromium.org/2314663002
Cr-Commit-Position: refs/heads/master@{#39464}
2016-09-16 08:29:41 +00:00
|
|
|
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
|
|
|
|
length, start, end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Drives TestCharacterStreams with a handful of small inputs (plain text,
// text containing NUL / control characters, a lone NUL, the empty string) and
// with a 4k buffer, the latter both in full and with start=576, end=3298.
TEST(CharacterStreams) {
  v8::Isolate* const v8_isolate = CcTest::isolate();
  v8::HandleScope scope(v8_isolate);
  v8::Local<v8::Context> ctx = v8::Context::New(v8_isolate);
  v8::Context::Scope enter_ctx(ctx);

  TestCharacterStreams("abcdefghi", 9);
  TestCharacterStreams("abc\0\n\r\x7f", 7);
  TestCharacterStreams("\0", 1);
  TestCharacterStreams("", 0);

  // 4k of 7-bit characters followed by a terminating NUL.
  const unsigned kPayloadLength = 4096;
  char big_input[kPayloadLength + 1];
  for (unsigned k = 0; k < kPayloadLength; ++k) {
    big_input[k] = static_cast<char>(k & 0x7F);
  }
  big_input[kPayloadLength] = '\0';

  TestCharacterStreams(big_input, kPayloadLength);
  TestCharacterStreams(big_input, kPayloadLength, 576, 3298);
}
|
2016-10-05 17:18:36 +00:00
|
|
|
|
2017-04-28 08:54:52 +00:00
|
|
|
// Reads a UTF-8 stream whose input holds three byte-order marks (EF BB BF)
// followed by 'a', and expects 0xFEFF, 0xFEFF, 'a' and then end of input,
// i.e. one fewer 0xFEFF than there are BOMs in the input.
// NOTE(review): the embedded NULs presumably delimit chunks for this
// ChunkSource constructor - confirm against its definition.
TEST(Uft8MultipleBOMChunks) {
  const char* bom_chunks = "\xef\xbb\xbf\0\xef\xbb\xbf\0\xef\xbb\xbf\0a\0";
  const uint16_t expected[] = {0xFEFF, 0xFEFF, 97};

  ChunkSource source(bom_chunks);
  std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
      &source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));

  for (uint16_t code_unit : expected) {
    CHECK_EQ(code_unit, stream->Advance());
  }
  CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
|
|
|
|
|
2016-10-05 17:18:36 +00:00
|
|
|
// Regression test for crbug.com/651333. Read invalid utf-8.
|
|
|
|
TEST(Regress651333) {
|
|
|
|
const uint8_t bytes[] =
|
|
|
|
"A\xf1"
|
|
|
|
"ad"; // Anad, with n == n-with-tilde.
|
|
|
|
const uint16_t unicode[] = {65, 65533, 97, 100};
|
|
|
|
|
|
|
|
// Run the test for all sub-strings 0..N of bytes, to make sure we hit the
|
|
|
|
// error condition in and at chunk boundaries.
|
|
|
|
for (size_t len = 0; len < arraysize(bytes); len++) {
|
|
|
|
// Read len bytes from bytes, and compare against the expected unicode
|
|
|
|
// characters. Expect kBadChar ( == Unicode replacement char == code point
|
|
|
|
// 65533) instead of the incorrectly coded Latin1 char.
|
|
|
|
ChunkSource chunks(bytes, len, false);
|
|
|
|
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
|
2016-12-05 15:46:58 +00:00
|
|
|
&chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
|
2016-10-05 17:18:36 +00:00
|
|
|
for (size_t i = 0; i < len; i++) {
|
|
|
|
CHECK_EQ(unicode[i], stream->Advance());
|
|
|
|
}
|
|
|
|
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
|
|
|
|
}
|
|
|
|
}
|
2017-05-09 16:04:10 +00:00
|
|
|
|
|
|
|
// Multi-byte UTF-8 sequences that are split across chunk boundaries must
// decode to the same code units as unsplit input, both on a first read and
// when the stream is read again after Seek(0).
TEST(Regress6377) {
  const char* inputs[] = {
      // 4-byte sequence split 2 + 2, then 'a'.
      "\xf0\x90\0"
      "\x80\x80"
      "a\0",

      // 3-byte sequence split 2 + 1, then 'a'.
      "\xe0\xbf\0"
      "\xbf"
      "a\0",

      // 2-byte sequence split 1 + 1, then 'a'.
      "\xc3\0"
      "\xbf"
      "a\0",

      // 4-byte sequence split 3 + 1, then 'a', then a 2-byte sequence whose
      // lead byte ends the second chunk and whose tail is the third chunk.
      "\xf0\x90\x80\0"
      "\x80"
      "a\xc3\0"
      "\xbf\0",
  };
  const std::vector<std::vector<uint16_t>> expected = {
      {0xd800, 0xdc00, 97},
      {0xfff, 97},
      {0xff, 97},
      {0xd800, 0xdc00, 97, 0xff},
  };
  CHECK_EQ(expected.size(), arraysize(inputs));

  for (size_t c = 0; c < expected.size(); ++c) {
    ChunkSource source(inputs[c]);
    std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
        &source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
    const std::vector<uint16_t>& want = expected[c];

    // Pass 0 reads the fresh stream; pass 1 rewinds to the start and reads
    // everything a second time.
    for (int pass = 0; pass < 2; ++pass) {
      if (pass == 1) stream->Seek(0);
      for (uint16_t code_unit : want) {
        CHECK_EQ(code_unit, stream->Advance());
      }
      CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
    }
  }
}
|
2017-04-28 08:54:52 +00:00
|
|
|
|
|
|
|
// Reads a large BOM-prefixed UTF-8 input through a chunked source, then seeks
// to positions within 8 characters on either side of every multiple of
// kBufferCharacterSize and checks that the rest of the stream still matches.
TEST(Regress724166) {
  // Chunk size has to be a multiple of kBufferCharacterSize.
  constexpr size_t kBufferCharacterSize = 512;
  constexpr size_t kChunkSize = kBufferCharacterSize * 8;
  constexpr size_t kChunks = 4;
  const uint8_t kBom[] = {0xEF, 0xBB, 0xBF};

  // Each chunk-sized slice carries the same repeating pattern of the values
  // 1..0x7e, so no byte is NUL and none has the high bit set.
  uint8_t buffer[kChunkSize * kChunks];
  for (size_t k = 0; k < arraysize(buffer); ++k) {
    buffer[k] = ((k % kChunkSize) % 0x7e) + 1;
  }
  // Overwrite the first bytes with a BOM.
  for (size_t k = 0; k < arraysize(kBom); ++k) {
    buffer[k] = kBom[k];
  }

  // NOTE(review): ChunkSource's definition is not visible here; confirm that
  // its third parameter is meant to receive a chunk size.
  ChunkSource source(buffer, arraysize(buffer), kChunkSize);
  std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
      &source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));

  // Stream position p is expected to hold buffer[p + BOM length]: the checks
  // below assume the leading BOM is not delivered as a character.
  const size_t char_count = arraysize(buffer) - arraysize(kBom);
  auto expect_tail_from = [&](size_t from) {
    for (size_t pos = from; pos < char_count; ++pos) {
      CHECK_EQ(static_cast<i::uc32>(buffer[pos + arraysize(kBom)]),
               stream->Advance());
    }
  };

  // Straight read of the whole stream.
  expect_tail_from(0);
  CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());

  // Seek to every multiple of kBufferCharacterSize, shifted by -8..7, and
  // re-read from there to the end.
  for (int delta = -8; delta < 8; ++delta) {
    for (size_t target = kBufferCharacterSize + delta;
         target < arraysize(buffer); target += kBufferCharacterSize) {
      stream->Seek(target);
      expect_tail_from(target);
    }
  }
}
|