v8/test/cctest/parsing/test-scanner-streams.cc
Leszek Swirski 602de389dc [parser] Allow cloning chunked streams
This allows streamed sources to also trigger parallel compile tasks. The
chunk vectors are shared via std::shared_ptr.

Clone chunked streams are initialised with a null source, and are not
allowed to fetch any more data. Similarly, the original stream is not
allowed to fetch data if it has been cloned (since the vector is shared
and would mutate if we added more data to it).

This is ok for the purposes of cloning for parallel compile tasks, as we
fully parse before cloning for the task.

Change-Id: Ic268e4956e0894acb63111bf0aaf32eaad426066
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3310917
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Cr-Commit-Position: refs/heads/main@{#78209}
2021-12-02 13:13:33 +00:00

930 lines
34 KiB
C++

// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/base/strings.h"
#include "src/heap/factory-inl.h"
#include "src/objects/objects-inl.h"
#include "src/parsing/scanner-character-streams.h"
#include "src/parsing/scanner.h"
#include "test/cctest/cctest.h"
namespace {
// Implement ExternalSourceStream based on const char**.
// This will take each string as one chunk. The last chunk must be empty.
class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
public:
template <typename Char>
explicit ChunkSource(const Char** chunks) : current_(0) {
do {
chunks_.push_back({reinterpret_cast<const uint8_t*>(*chunks),
(sizeof(Char) / sizeof(uint8_t)) *
std::char_traits<Char>::length(*chunks)});
chunks++;
} while (chunks_.back().len > 0);
}
explicit ChunkSource(const char* chunks) : current_(0) {
do {
chunks_.push_back(
{reinterpret_cast<const uint8_t*>(chunks), strlen(chunks)});
chunks += strlen(chunks) + 1;
} while (chunks_.back().len > 0);
}
ChunkSource(const uint8_t* data, size_t char_size, size_t len,
bool extra_chunky)
: current_(0) {
// If extra_chunky, we'll use increasingly large chunk sizes. If not, we'll
// have a single chunk of full length. Make sure that chunks are always
// aligned to char-size though.
size_t chunk_size = extra_chunky ? char_size : len;
for (size_t i = 0; i < len; i += chunk_size, chunk_size += char_size) {
chunks_.push_back({data + i, std::min(chunk_size, len - i)});
}
chunks_.push_back({nullptr, 0});
}
~ChunkSource() override = default;
size_t GetMoreData(const uint8_t** src) override {
DCHECK_LT(current_, chunks_.size());
Chunk& next = chunks_[current_++];
uint8_t* chunk = new uint8_t[next.len];
if (next.len > 0) {
i::MemMove(chunk, next.ptr, next.len);
}
*src = chunk;
return next.len;
}
private:
struct Chunk {
const uint8_t* ptr;
size_t len;
};
std::vector<Chunk> chunks_;
size_t current_;
};
// Checks that Lock() / Unlock() pairs are balanced. Not thread-safe.
class LockChecker {
public:
LockChecker() : lock_depth_(0) {}
~LockChecker() { CHECK_EQ(0, lock_depth_); }
void Lock() const { lock_depth_++; }
void Unlock() const {
CHECK_GT(lock_depth_, 0);
lock_depth_--;
}
bool IsLocked() const { return lock_depth_ > 0; }
int LockDepth() const { return lock_depth_; }
protected:
mutable int lock_depth_;
};
class TestExternalResource : public v8::String::ExternalStringResource,
public LockChecker {
public:
explicit TestExternalResource(uint16_t* data, int length)
: LockChecker(), data_(data), length_(static_cast<size_t>(length)) {}
const uint16_t* data() const override {
CHECK(IsLocked());
return data_;
}
size_t length() const override { return length_; }
bool IsCacheable() const override { return false; }
void Lock() const override { LockChecker::Lock(); }
void Unlock() const override { LockChecker::Unlock(); }
private:
uint16_t* data_;
size_t length_;
};
class TestExternalOneByteResource
: public v8::String::ExternalOneByteStringResource,
public LockChecker {
public:
TestExternalOneByteResource(const char* data, size_t length)
: data_(data), length_(length) {}
const char* data() const override {
CHECK(IsLocked());
return data_;
}
size_t length() const override { return length_; }
bool IsCacheable() const override { return false; }
void Lock() const override { LockChecker::Lock(); }
void Unlock() const override { LockChecker::Unlock(); }
private:
const char* data_;
size_t length_;
};
// A test string with all lengths of utf-8 encodings.
const char unicode_utf8[] =
"abc" // 3x ascii
"\xc3\xa4" // a Umlaut, code point 228
"\xe2\xa8\xa0" // >> (math symbol), code point 10784
"\xf0\x9f\x92\xa9" // best character, code point 128169,
// as utf-16 surrogates: 55357 56489
"def"; // 3x ascii again.
const uint16_t unicode_ucs2[] = {97, 98, 99, 228, 10784, 55357,
56489, 100, 101, 102, 0};
i::Handle<i::String> NewExternalTwoByteStringFromResource(
i::Isolate* isolate, TestExternalResource* resource) {
i::Factory* factory = isolate->factory();
// String creation accesses the resource.
resource->Lock();
i::Handle<i::String> uc16_string(
factory->NewExternalStringFromTwoByte(resource).ToHandleChecked());
resource->Unlock();
return uc16_string;
}
} // anonymous namespace
TEST(Utf8StreamAsciiOnly) {
const char* chunks[] = {"abc", "def", "ghi", ""};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
// Read the data without dying.
v8::base::uc32 c;
do {
c = stream->Advance();
} while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
}
TEST(Utf8StreamMaxNonSurrogateCharCode) {
const char* chunks[] = {"\uffff\uffff", ""};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
// Read the correct character.
uint16_t max = unibrow::Utf16::kMaxNonSurrogateCharCode;
CHECK_EQ(max, static_cast<uint32_t>(stream->Advance()));
CHECK_EQ(max, static_cast<uint32_t>(stream->Advance()));
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
TEST(Utf8StreamBOM) {
// Construct test string w/ UTF-8 BOM (byte order mark)
char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));
const char* chunks[] = {data, "\0"};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
// Read the data without tripping over the BOM.
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());
// Make sure seek works.
stream->Seek(0);
CHECK_EQ(unicode_ucs2[0], stream->Advance());
stream->Seek(5);
CHECK_EQ(unicode_ucs2[5], stream->Advance());
// Try again, but make sure we have to seek 'backwards'.
while (v8::internal::Utf16CharacterStream::kEndOfInput != stream->Advance()) {
// Do nothing. We merely advance the stream to the end of its input.
}
stream->Seek(5);
CHECK_EQ(unicode_ucs2[5], stream->Advance());
}
TEST(Utf8SplitBOM) {
// Construct chunks with a BOM split into two chunks.
char partial_bom[] = "\xef\xbb";
char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));
{
const char* chunks[] = {partial_bom, data, "\0"};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
// Read the data without tripping over the BOM.
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
}
// And now with single-byte BOM chunks.
char bom_byte_1[] = "\xef";
char bom_byte_2[] = "\xbb";
{
const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
// Read the data without tripping over the BOM.
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
}
}
TEST(Utf8SplitMultiBOM) {
// Construct chunks with a split BOM followed by another split BOM.
const char* chunks[] = {"\xef\xbb", "\xbf\xef\xbb", "\xbf", ""};
ChunkSource chunk_source(chunks);
std::unique_ptr<i::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
// Read the data, ensuring we get exactly one of the two BOMs back.
CHECK_EQ(0xFEFF, stream->Advance());
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
TEST(Utf8AdvanceUntil) {
// Test utf-8 advancing until a certain char.
const char line_term = '\n';
const size_t kLen = arraysize(unicode_utf8);
char data[kLen + 1];
strncpy(data, unicode_utf8, kLen);
data[kLen - 1] = line_term;
data[kLen] = '\0';
{
const char* chunks[] = {data, "\0"};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
int32_t res = stream->AdvanceUntil(
[](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
CHECK_EQ(line_term, res);
}
}
TEST(AdvanceMatchAdvanceUntil) {
// Test if single advance and advanceUntil behave the same
char data[] = {'a', 'b', '\n', 'c', '\0'};
{
const char* chunks[] = {data, "\0"};
ChunkSource chunk_source_a(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance(
v8::internal::ScannerStream::For(
&chunk_source_a, v8::ScriptCompiler::StreamedSource::UTF8));
ChunkSource chunk_source_au(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance_until(
v8::internal::ScannerStream::For(
&chunk_source_au, v8::ScriptCompiler::StreamedSource::UTF8));
int32_t au_c0_ = stream_advance_until->AdvanceUntil(
[](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
int32_t a_c0_ = '0';
while (!unibrow::IsLineTerminator(a_c0_)) {
a_c0_ = stream_advance->Advance();
}
// Check both advances methods have the same output
CHECK_EQ(a_c0_, au_c0_);
// Check if both set the cursor to the correct position by advancing both
// streams by one character.
a_c0_ = stream_advance->Advance();
au_c0_ = stream_advance_until->Advance();
CHECK_EQ(a_c0_, au_c0_);
}
}
TEST(Utf8AdvanceUntilOverChunkBoundaries) {
// Test utf-8 advancing until a certain char, crossing chunk boundaries.
// Split the test string at each byte and pass it to the stream. This way,
// we'll have a split at each possible boundary.
size_t len = strlen(unicode_utf8);
char buffer[arraysize(unicode_utf8) + 4];
for (size_t i = 1; i < len; i++) {
// Copy source string into buffer, splitting it at i.
// Then add three chunks, 0..i-1, i..strlen-1, empty.
memcpy(buffer, unicode_utf8, i);
memcpy(buffer + i + 1, unicode_utf8 + i, len - i);
buffer[i] = '\0';
buffer[len + 1] = '\n';
buffer[len + 2] = '\0';
buffer[len + 3] = '\0';
const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
int32_t res = stream->AdvanceUntil(
[](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
CHECK_EQ(buffer[len + 1], res);
}
}
TEST(Utf8ChunkBoundaries) {
// Test utf-8 parsing at chunk boundaries.
// Split the test string at each byte and pass it to the stream. This way,
// we'll have a split at each possible boundary.
size_t len = strlen(unicode_utf8);
char buffer[arraysize(unicode_utf8) + 3];
for (size_t i = 1; i < len; i++) {
// Copy source string into buffer, splitting it at i.
// Then add three chunks, 0..i-1, i..strlen-1, empty.
memcpy(buffer, unicode_utf8, i);
memcpy(buffer + i + 1, unicode_utf8 + i, len - i);
buffer[i] = '\0';
buffer[len + 1] = '\0';
buffer[len + 2] = '\0';
const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
for (size_t j = 0; unicode_ucs2[j]; j++) {
CHECK_EQ(unicode_ucs2[j], stream->Advance());
}
CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
stream->Advance());
}
}
TEST(Utf8SingleByteChunks) {
// Have each byte as a single-byte chunk.
size_t len = strlen(unicode_utf8);
char buffer[arraysize(unicode_utf8) + 4];
for (size_t i = 1; i < len - 1; i++) {
// Copy source string into buffer, make a single-byte chunk at i.
strncpy(buffer, unicode_utf8, i);
strncpy(buffer + i + 3, unicode_utf8 + i + 1, len - i - 1);
buffer[i] = '\0';
buffer[i + 1] = unicode_utf8[i];
buffer[i + 2] = '\0';
buffer[len + 2] = '\0';
buffer[len + 3] = '\0';
const char* chunks[] = {buffer, buffer + i + 1, buffer + i + 3,
buffer + len + 3};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
for (size_t j = 0; unicode_ucs2[j]; j++) {
CHECK_EQ(unicode_ucs2[j], stream->Advance());
}
CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
stream->Advance());
}
}
#define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
void TestCharacterStream(const char* reference, i::Utf16CharacterStream* stream,
unsigned length, unsigned start, unsigned end) {
// Read streams one char at a time
unsigned i;
for (i = start; i < end; i++) {
CHECK_EQU(i, stream->pos());
CHECK_EQU(reference[i], stream->Advance());
}
CHECK_EQU(end, stream->pos());
CHECK_EQU(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
CHECK_EQU(end + 1, stream->pos());
stream->Back();
// Pushback, re-read, pushback again.
while (i > end / 4) {
int32_t c0 = reference[i - 1];
CHECK_EQU(i, stream->pos());
stream->Back();
i--;
CHECK_EQU(i, stream->pos());
int32_t c1 = stream->Advance();
i++;
CHECK_EQU(i, stream->pos());
CHECK_EQ(c0, c1);
stream->Back();
i--;
CHECK_EQU(i, stream->pos());
}
// Seek + read streams one char at a time.
unsigned halfway = end / 2;
stream->Seek(stream->pos() + halfway - i);
for (i = halfway; i < end; i++) {
CHECK_EQU(i, stream->pos());
CHECK_EQU(reference[i], stream->Advance());
}
CHECK_EQU(i, stream->pos());
CHECK(i::Scanner::IsInvalid(stream->Advance()));
// Seek back, then seek beyond end of stream.
stream->Seek(start);
if (start < length) {
CHECK_EQU(stream->Advance(), reference[start]);
} else {
CHECK(i::Scanner::IsInvalid(stream->Advance()));
}
stream->Seek(length + 5);
CHECK(i::Scanner::IsInvalid(stream->Advance()));
}
void TestCloneCharacterStream(const char* reference,
i::Utf16CharacterStream* stream,
unsigned length) {
// Test original stream through to the end.
TestCharacterStream(reference, stream, length, 0, length);
// Clone the stream after it completes.
std::unique_ptr<i::Utf16CharacterStream> clone = stream->Clone();
// Test that the clone through to the end.
TestCharacterStream(reference, clone.get(), length, 0, length);
// Rewind original stream to a third.
stream->Seek(length / 3);
// Rewind clone stream to two thirds.
clone->Seek(2 * length / 3);
// Test seeking clone didn't affect original stream.
TestCharacterStream(reference, stream, length, length / 3, length);
// Test seeking original stream didn't affect clone.
TestCharacterStream(reference, clone.get(), length, 2 * length / 3, length);
}
#undef CHECK_EQU
void TestCharacterStreams(const char* one_byte_source, unsigned length,
unsigned start = 0, unsigned end = 0) {
if (end == 0) end = length;
i::Isolate* isolate = CcTest::i_isolate();
i::Factory* factory = isolate->factory();
// 2-byte external string
std::unique_ptr<v8::base::uc16[]> uc16_buffer(new v8::base::uc16[length]);
v8::base::Vector<const v8::base::uc16> two_byte_vector(
uc16_buffer.get(), static_cast<int>(length));
{
for (unsigned i = 0; i < length; i++) {
uc16_buffer[i] = static_cast<v8::base::uc16>(one_byte_source[i]);
}
TestExternalResource resource(uc16_buffer.get(), length);
i::Handle<i::String> uc16_string(
NewExternalTwoByteStringFromResource(isolate, &resource));
std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
i::ScannerStream::For(isolate, uc16_string, start, end));
TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end);
// This avoids the GC from trying to free a stack allocated resource.
if (uc16_string->IsExternalString())
i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
->SetResource(isolate, nullptr);
}
// 1-byte external string
v8::base::Vector<const uint8_t> one_byte_vector =
v8::base::OneByteVector(one_byte_source, static_cast<int>(length));
i::Handle<i::String> one_byte_string =
factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
{
TestExternalOneByteResource one_byte_resource(one_byte_source, length);
i::Handle<i::String> ext_one_byte_string(
factory->NewExternalStringFromOneByte(&one_byte_resource)
.ToHandleChecked());
std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
i::ScannerStream::For(isolate, ext_one_byte_string, start, end));
TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start,
end);
// This avoids the GC from trying to free a stack allocated resource.
if (ext_one_byte_string->IsExternalString())
i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
->SetResource(isolate, nullptr);
}
// 1-byte generic i::String
{
std::unique_ptr<i::Utf16CharacterStream> string_stream(
i::ScannerStream::For(isolate, one_byte_string, start, end));
TestCharacterStream(one_byte_source, string_stream.get(), length, start,
end);
}
// 2-byte generic i::String
{
i::Handle<i::String> two_byte_string =
factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
i::ScannerStream::For(isolate, two_byte_string, start, end));
TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length,
start, end);
}
// Streaming has no notion of start/end, so let's skip streaming tests for
// these cases.
if (start != 0 || end != length) return;
// 1-byte streaming stream, single + many chunks.
{
const uint8_t* data = one_byte_vector.begin();
const uint8_t* data_end = one_byte_vector.end();
ChunkSource single_chunk(data, 1, data_end - data, false);
std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
i::ScannerStream::For(&single_chunk,
v8::ScriptCompiler::StreamedSource::ONE_BYTE));
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
length, start, end);
ChunkSource many_chunks(data, 1, data_end - data, true);
one_byte_streaming_stream.reset(i::ScannerStream::For(
&many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE));
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
length, start, end);
}
// UTF-8 streaming stream, single + many chunks.
{
const uint8_t* data = one_byte_vector.begin();
const uint8_t* data_end = one_byte_vector.end();
ChunkSource chunks(data, 1, data_end - data, false);
std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
i::ScannerStream::For(&chunks,
v8::ScriptCompiler::StreamedSource::UTF8));
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
start, end);
ChunkSource many_chunks(data, 1, data_end - data, true);
utf8_streaming_stream.reset(i::ScannerStream::For(
&many_chunks, v8::ScriptCompiler::StreamedSource::UTF8));
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
start, end);
}
// 2-byte streaming stream, single + many chunks.
{
const uint8_t* data =
reinterpret_cast<const uint8_t*>(two_byte_vector.begin());
const uint8_t* data_end =
reinterpret_cast<const uint8_t*>(two_byte_vector.end());
ChunkSource chunks(data, 2, data_end - data, false);
std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
i::ScannerStream::For(&chunks,
v8::ScriptCompiler::StreamedSource::TWO_BYTE));
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
length, start, end);
ChunkSource many_chunks(data, 2, data_end - data, true);
two_byte_streaming_stream.reset(i::ScannerStream::For(
&many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE));
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
length, start, end);
}
}
TEST(CharacterStreams) {
v8::Isolate* isolate = CcTest::isolate();
v8::HandleScope handles(isolate);
v8::Local<v8::Context> context = v8::Context::New(isolate);
v8::Context::Scope context_scope(context);
TestCharacterStreams("abcdefghi", 9);
TestCharacterStreams("abc\0\n\r\x7f", 7);
TestCharacterStreams("\0", 1);
TestCharacterStreams("", 0);
// 4k large buffer.
char buffer[4096 + 1];
for (unsigned i = 0; i < arraysize(buffer); i++) {
buffer[i] = static_cast<char>(i & 0x7F);
}
buffer[arraysize(buffer) - 1] = '\0';
TestCharacterStreams(buffer, arraysize(buffer) - 1);
TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
}
// Regression test for crbug.com/651333. Read invalid utf-8.
TEST(Regress651333) {
const uint8_t bytes[] =
"A\xf1"
"ad"; // Anad, with n == n-with-tilde.
const uint16_t unicode[] = {65, 65533, 97, 100};
// Run the test for all sub-strings 0..N of bytes, to make sure we hit the
// error condition in and at chunk boundaries.
for (size_t len = 0; len < arraysize(bytes); len++) {
// Read len bytes from bytes, and compare against the expected unicode
// characters. Expect kBadChar ( == Unicode replacement char == code point
// 65533) instead of the incorrectly coded Latin1 char.
ChunkSource chunks(bytes, 1, len, false);
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
&chunks, v8::ScriptCompiler::StreamedSource::UTF8));
for (size_t i = 0; i < len; i++) {
CHECK_EQ(unicode[i], stream->Advance());
}
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
}
void TestChunkStreamAgainstReference(
const char* cases[],
const std::vector<std::vector<uint16_t>>& unicode_expected) {
for (size_t c = 0; c < unicode_expected.size(); ++c) {
ChunkSource chunk_source(cases[c]);
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
for (size_t i = 0; i < unicode_expected[c].size(); i++) {
CHECK_EQ(unicode_expected[c][i], stream->Advance());
}
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
stream->Seek(0);
for (size_t i = 0; i < unicode_expected[c].size(); i++) {
CHECK_EQ(unicode_expected[c][i], stream->Advance());
}
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
}
TEST(Regress6377) {
const char* cases[] = {
"\xf0\x90\0" // first chunk - start of 4-byte seq
"\x80\x80" // second chunk - end of 4-byte seq
"a\0", // and an 'a'
"\xe0\xbf\0" // first chunk - start of 3-byte seq
"\xbf" // second chunk - one-byte end of 3-byte seq
"a\0", // and an 'a'
"\xc3\0" // first chunk - start of 2-byte seq
"\xbf" // second chunk - end of 2-byte seq
"a\0", // and an 'a'
"\xf0\x90\x80\0" // first chunk - start of 4-byte seq
"\x80" // second chunk - one-byte end of 4-byte seq
"a\xc3\0" // and an 'a' + start of 2-byte seq
"\xbf\0", // third chunk - end of 2-byte seq
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0xD800, 0xDC00, 97}, {0xFFF, 97}, {0xFF, 97}, {0xD800, 0xDC00, 97, 0xFF},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestChunkStreamAgainstReference(cases, unicode_expected);
}
TEST(Regress6836) {
const char* cases[] = {
// 0xC2 is a lead byte, but there's no continuation. The bug occurs when
// this happens near the chunk end.
"X\xc2Y\0",
// Last chunk ends with a 2-byte char lead.
"X\xc2\0",
// Last chunk ends with a 3-byte char lead and only one continuation
// character.
"X\xe0\xbf\0",
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0x58, 0xFFFD, 0x59}, {0x58, 0xFFFD}, {0x58, 0xFFFD},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestChunkStreamAgainstReference(cases, unicode_expected);
}
TEST(TestOverlongAndInvalidSequences) {
const char* cases[] = {
// Overlong 2-byte sequence.
"X\xc0\xbfY\0",
// Another overlong 2-byte sequence.
"X\xc1\xbfY\0",
// Overlong 3-byte sequence.
"X\xe0\x9f\xbfY\0",
// Overlong 4-byte sequence.
"X\xf0\x89\xbf\xbfY\0",
// Invalid 3-byte sequence (reserved for surrogates).
"X\xed\xa0\x80Y\0",
// Invalid 4-bytes sequence (value out of range).
"X\xf4\x90\x80\x80Y\0",
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0x58, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestChunkStreamAgainstReference(cases, unicode_expected);
}
TEST(RelocatingCharacterStream) {
// This test relies on the invariant that the scavenger will move objects
if (i::FLAG_single_generation) return;
ManualGCScope manual_gc_scope;
CcTest::InitializeVM();
i::Isolate* i_isolate = CcTest::i_isolate();
v8::HandleScope scope(CcTest::isolate());
const char* string = "abcd";
int length = static_cast<int>(strlen(string));
std::unique_ptr<v8::base::uc16[]> uc16_buffer(new v8::base::uc16[length]);
for (int i = 0; i < length; i++) {
uc16_buffer[i] = string[i];
}
v8::base::Vector<const v8::base::uc16> two_byte_vector(uc16_buffer.get(),
length);
i::Handle<i::String> two_byte_string =
i_isolate->factory()
->NewStringFromTwoByte(two_byte_vector, i::AllocationType::kYoung)
.ToHandleChecked();
std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
i::ScannerStream::For(i_isolate, two_byte_string, 0, length));
CHECK_EQ('a', two_byte_string_stream->Advance());
CHECK_EQ('b', two_byte_string_stream->Advance());
CHECK_EQ(size_t{2}, two_byte_string_stream->pos());
i::String raw = *two_byte_string;
i_isolate->heap()->CollectGarbage(i::NEW_SPACE,
i::GarbageCollectionReason::kUnknown);
// GC moved the string.
CHECK_NE(raw, *two_byte_string);
CHECK_EQ('c', two_byte_string_stream->Advance());
CHECK_EQ('d', two_byte_string_stream->Advance());
}
TEST(RelocatingUnbufferedCharacterStream) {
// This test relies on the invariant that the scavenger will move objects
if (i::FLAG_single_generation) return;
ManualGCScope manual_gc_scope;
CcTest::InitializeVM();
i::Isolate* i_isolate = CcTest::i_isolate();
v8::HandleScope scope(CcTest::isolate());
const char16_t* string = u"abc\u2603";
int length = static_cast<int>(std::char_traits<char16_t>::length(string));
std::unique_ptr<v8::base::uc16[]> uc16_buffer(new v8::base::uc16[length]);
for (int i = 0; i < length; i++) {
uc16_buffer[i] = string[i];
}
v8::base::Vector<const v8::base::uc16> two_byte_vector(uc16_buffer.get(),
length);
i::Handle<i::String> two_byte_string =
i_isolate->factory()
->NewStringFromTwoByte(two_byte_vector, i::AllocationType::kYoung)
.ToHandleChecked();
std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
i::ScannerStream::For(i_isolate, two_byte_string, 0, length));
// Seek to offset 2 so that the buffer_pos_ is not zero initially.
two_byte_string_stream->Seek(2);
CHECK_EQ('c', two_byte_string_stream->Advance());
CHECK_EQ(size_t{3}, two_byte_string_stream->pos());
i::String raw = *two_byte_string;
i_isolate->heap()->CollectGarbage(i::NEW_SPACE,
i::GarbageCollectionReason::kUnknown);
// GC moved the string and buffer was updated to the correct location.
CHECK_NE(raw, *two_byte_string);
// Check that we correctly moved based on buffer_pos_, not based on a position
// of zero.
CHECK_EQ(u'\u2603', two_byte_string_stream->Advance());
CHECK_EQ(size_t{4}, two_byte_string_stream->pos());
}
TEST(CloneCharacterStreams) {
v8::HandleScope handles(CcTest::isolate());
v8::Local<v8::Context> context = v8::Context::New(CcTest::isolate());
v8::Context::Scope context_scope(context);
i::Isolate* isolate = CcTest::i_isolate();
i::Factory* factory = isolate->factory();
const char* one_byte_source = "abcdefghi";
unsigned length = static_cast<unsigned>(strlen(one_byte_source));
// Check that cloning a character stream does not update
// 2-byte external string
std::unique_ptr<v8::base::uc16[]> uc16_buffer(new v8::base::uc16[length]);
v8::base::Vector<const v8::base::uc16> two_byte_vector(
uc16_buffer.get(), static_cast<int>(length));
{
for (unsigned i = 0; i < length; i++) {
uc16_buffer[i] = static_cast<v8::base::uc16>(one_byte_source[i]);
}
TestExternalResource resource(uc16_buffer.get(), length);
i::Handle<i::String> uc16_string(
NewExternalTwoByteStringFromResource(isolate, &resource));
std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
i::ScannerStream::For(isolate, uc16_string, 0, length));
CHECK(resource.IsLocked());
CHECK_EQ(1, resource.LockDepth());
std::unique_ptr<i::Utf16CharacterStream> cloned = uc16_stream->Clone();
CHECK_EQ(2, resource.LockDepth());
uc16_stream = std::move(cloned);
CHECK_EQ(1, resource.LockDepth());
TestCloneCharacterStream(one_byte_source, uc16_stream.get(), length);
// This avoids the GC from trying to free a stack allocated resource.
if (uc16_string->IsExternalString())
i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
->SetResource(isolate, nullptr);
}
// 1-byte external string
v8::base::Vector<const uint8_t> one_byte_vector =
v8::base::OneByteVector(one_byte_source, static_cast<int>(length));
i::Handle<i::String> one_byte_string =
factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
{
TestExternalOneByteResource one_byte_resource(one_byte_source, length);
i::Handle<i::String> ext_one_byte_string(
factory->NewExternalStringFromOneByte(&one_byte_resource)
.ToHandleChecked());
std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
i::ScannerStream::For(isolate, ext_one_byte_string, 0, length));
TestCloneCharacterStream(one_byte_source, one_byte_stream.get(), length);
// This avoids the GC from trying to free a stack allocated resource.
if (ext_one_byte_string->IsExternalString())
i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
->SetResource(isolate, nullptr);
}
// Relocatable streams are't clonable.
{
std::unique_ptr<i::Utf16CharacterStream> string_stream(
i::ScannerStream::For(isolate, one_byte_string, 0, length));
CHECK(!string_stream->can_be_cloned());
i::Handle<i::String> two_byte_string =
factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
i::ScannerStream::For(isolate, two_byte_string, 0, length));
CHECK(!two_byte_string_stream->can_be_cloned());
}
// Chunk sources are cloneable.
{
const char* chunks[] = {"1234", "5678", ""};
ChunkSource chunk_source(chunks);
std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
i::ScannerStream::For(&chunk_source,
v8::ScriptCompiler::StreamedSource::ONE_BYTE));
TestCloneCharacterStream("12345678", one_byte_streaming_stream.get(), 8);
}
{
const char* chunks[] = {"1234", "5678", ""};
ChunkSource chunk_source(chunks);
std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
i::ScannerStream::For(&chunk_source,
v8::ScriptCompiler::StreamedSource::UTF8));
CHECK(utf8_streaming_stream->can_be_cloned());
TestCloneCharacterStream("12345678", utf8_streaming_stream.get(), 8);
}
{
const char16_t* chunks[] = {u"1234", u"5678", u""};
ChunkSource chunk_source(chunks);
std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
i::ScannerStream::For(&chunk_source,
v8::ScriptCompiler::StreamedSource::TWO_BYTE));
CHECK(two_byte_streaming_stream->can_be_cloned());
TestCloneCharacterStream("12345678", two_byte_streaming_stream.get(), 8);
}
}