v8/test/cctest/parsing/test-scanner-streams.cc
Toon Verwaest 928e7b2973 [scanner] Decode utf8 as chunks come in to utf16, allowing unbuffered streaming
Change-Id: Iaad8bc94e9222d309749491df9a500544b5b37da
Reviewed-on: https://chromium-review.googlesource.com/1158687
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Marja Hölttä <marja@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54877}
2018-08-02 19:00:09 +00:00

702 lines
25 KiB
C++

// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/heap/factory-inl.h"
#include "src/objects-inl.h"
#include "src/parsing/scanner-character-streams.h"
#include "src/parsing/scanner.h"
#include "test/cctest/cctest.h"
namespace {
// Implement ExternalSourceStream based on const char**.
// This will take each string as one chunk. The last chunk must be empty.
template <typename Char = char>
class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
public:
explicit ChunkSource(const Char** chunks) : current_(0) {
do {
chunks_.push_back({*chunks, string_length(*chunks)});
chunks++;
} while (chunks_.back().len > 0);
}
explicit ChunkSource(const Char* chunks) : current_(0) {
do {
size_t length = string_length(chunks);
chunks_.push_back({chunks, length});
chunks += length + 1;
} while (chunks_.back().len > 0);
}
ChunkSource(const Char* data, size_t len, bool extra_chunky) : current_(0) {
// If extra_chunky, we'll use increasingly large chunk sizes. If not, we'll
// have a single chunk of full length. Make sure that chunks are always
// aligned to char-size though.
size_t chunk_size = extra_chunky ? 1 : len;
for (size_t i = 0; i < len; i += chunk_size, chunk_size += 1) {
chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
}
chunks_.push_back({nullptr, 0});
}
~ChunkSource() {}
bool SetBookmark() override { return false; }
void ResetToBookmark() override {}
size_t GetMoreData(const uint8_t** src) override {
DCHECK_LT(current_, chunks_.size());
Chunk& next = chunks_[current_++];
const uint8_t* chunk = reinterpret_cast<const uint8_t*>(next.ptr);
size_t bytelength = next.len * sizeof(Char);
uint8_t* copy = new uint8_t[bytelength];
i::MemMove(copy, chunk, bytelength);
*src = copy;
return bytelength;
}
private:
static size_t string_length(const Char* string) {
size_t length = 0;
while (*(string++) != 0) length++;
return length;
}
struct Chunk {
const Char* ptr;
size_t len;
};
std::vector<Chunk> chunks_;
size_t current_;
};
class TestExternalResource : public v8::String::ExternalStringResource {
public:
explicit TestExternalResource(uint16_t* data, int length)
: data_(data), length_(static_cast<size_t>(length)) {}
~TestExternalResource() {}
const uint16_t* data() const { return data_; }
size_t length() const { return length_; }
private:
uint16_t* data_;
size_t length_;
};
class TestExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
TestExternalOneByteResource(const char* data, size_t length)
: data_(data), length_(length) {}
const char* data() const { return data_; }
size_t length() const { return length_; }
private:
const char* data_;
size_t length_;
};
// A test string with all lengths of utf-8 encodings.
const char unicode_utf8[] =
"abc" // 3x ascii
"\xc3\xa4" // a Umlaut, code point 228
"\xe2\xa8\xa0" // >> (math symbol), code point 10784
"\xf0\x9f\x92\xa9" // best character, code point 128169,
// as utf-16 surrogates: 55357 56489
"def"; // 3x ascii again.
const uint16_t unicode_ucs2[] = {97, 98, 99, 228, 10784, 55357,
56489, 100, 101, 102, 0};
} // anonymous namespace
TEST(Utf8StreamAsciiOnly) {
const char* chunks[] = {"abc", "def", "ghi", ""};
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(i::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
// Read the data without dying.
v8::internal::uc32 c;
do {
c = stream->Advance();
} while (c != i::ScannerStream::kEndOfInput);
}
TEST(Utf8StreamBOM) {
// Construct test string w/ UTF-8 BOM (byte order mark)
char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));
const char* chunks[] = {data, "\0"};
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
// Read the data without tripping over the BOM.
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
// Make sure seek works.
stream->Seek(0);
CHECK_EQ(unicode_ucs2[0], stream->Advance());
stream->Seek(5);
CHECK_EQ(unicode_ucs2[5], stream->Advance());
// Try again, but make sure we have to seek 'backwards'.
while (i::ScannerStream::kEndOfInput != stream->Advance()) {
// Do nothing. We merely advance the stream to the end of its input.
}
stream->Seek(5);
CHECK_EQ(unicode_ucs2[5], stream->Advance());
}
TEST(Utf8SplitBOM) {
// Construct chunks with a BOM split into two chunks.
char partial_bom[] = "\xef\xbb";
char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));
{
const char* chunks[] = {partial_bom, data, "\0"};
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
// Read the data without tripping over the BOM.
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
}
// And now with single-byte BOM chunks.
char bom_byte_1[] = "\xef";
char bom_byte_2[] = "\xbb";
{
const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
// Read the data without tripping over the BOM.
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
}
}
TEST(Utf8SplitMultiBOM) {
// Construct chunks with a split BOM followed by another split BOM.
const char* chunks = "\xef\xbb\0\xbf\xef\xbb\0\xbf\0\0";
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
// Read the data, ensuring we get exactly one of the two BOMs back.
CHECK_EQ(0xFEFF, stream->Advance());
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
}
TEST(Ucs2AdvanceUntil) {
// Test utf-8 advancing until a certain char.
const char line_term = '\n';
const size_t kLen = arraysize(unicode_ucs2) - 1;
uint16_t data[kLen + 1];
memcpy(data, unicode_ucs2, kLen * 2);
data[kLen - 1] = line_term;
data[kLen] = '\0';
{
uint16_t end[] = {0};
const uint16_t* chunks[] = {data, end};
ChunkSource<uint16_t> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
int32_t res = static_cast<i::CharacterStream<uint16_t>*>(stream.get())
->AdvanceUntil([](int32_t c0) {
return unibrow::IsLineTerminator(c0);
});
CHECK_EQ(line_term, res);
}
}
TEST(AdvanceMatchAdvanceUntil) {
// Test if single advance and advanceUntil behave the same
char data[] = {'a', 'b', '\n', 'c', '\0'};
{
const char* chunks[] = {data, "\0"};
ChunkSource<char> chunk_source_a(chunks);
std::unique_ptr<v8::internal::ScannerStream> stream_advance(
v8::internal::ScannerStream::For(
&chunk_source_a, v8::ScriptCompiler::StreamedSource::ONE_BYTE,
nullptr));
ChunkSource<char> chunk_source_au(chunks);
std::unique_ptr<v8::internal::ScannerStream> stream_advance_until(
v8::internal::ScannerStream::For(
&chunk_source_au, v8::ScriptCompiler::StreamedSource::ONE_BYTE,
nullptr));
int32_t au_c0 =
static_cast<i::CharacterStream<uint16_t>*>(stream_advance_until.get())
->AdvanceUntil(
[](int32_t c0) { return unibrow::IsLineTerminator(c0); });
int32_t a_c0 = '0';
while (!unibrow::IsLineTerminator(a_c0)) {
a_c0 = stream_advance->Advance();
}
// Check both advances methods have the same output
CHECK_EQ(a_c0, au_c0);
// Check if both set the cursor to the correct position by advancing both
// streams by one character.
a_c0 = stream_advance->Advance();
au_c0 = stream_advance_until->Advance();
CHECK_EQ(a_c0, au_c0);
}
}
TEST(Ucs2AdvanceUntilOverChunkBoundaries) {
// Test utf-8 advancing until a certain char, crossing chunk boundaries.
// Split the test string at each byte and pass it to the stream. This way,
// we'll have a split at each possible boundary.
const size_t kLen = arraysize(unicode_ucs2) - 1;
uint16_t buffer[kLen + 4];
for (size_t i = 1; i < kLen; i++) {
// Copy source string into buffer, splitting it at i.
// Then add three chunks, 0..i-1, i..strlen-1, empty.
memcpy(buffer, unicode_ucs2, i * 2);
memcpy(buffer + i + 1, unicode_ucs2 + i, (kLen - i) * 2);
buffer[i] = '\0';
buffer[kLen + 1] = '\n';
buffer[kLen + 2] = '\0';
buffer[kLen + 3] = '\0';
const uint16_t* chunks[] = {buffer, buffer + i + 1, buffer + kLen + 2};
ChunkSource<uint16_t> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(i::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
int32_t res = static_cast<i::CharacterStream<uint16_t>*>(stream.get())
->AdvanceUntil([](int32_t c0) {
return unibrow::IsLineTerminator(c0);
});
CHECK_EQ(buffer[kLen + 1], res);
}
}
TEST(Ucs2ChunkBoundaries) {
// Test utf-8 parsing at chunk boundaries.
// Split the test string at each byte and pass it to the stream. This way,
// we'll have a split at each possible boundary.
size_t len = strlen(unicode_utf8);
char buffer[arraysize(unicode_utf8) + 3];
for (size_t i = 1; i < len; i++) {
// Copy source string into buffer, splitting it at i.
// Then add three chunks, 0..i-1, i..strlen-1, empty.
strncpy(buffer, unicode_utf8, i);
strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
buffer[i] = '\0';
buffer[len + 1] = '\0';
buffer[len + 2] = '\0';
const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
for (size_t i = 0; unicode_ucs2[i]; i++) {
CHECK_EQ(unicode_ucs2[i], stream->Advance());
}
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
}
}
TEST(Utf8SingleByteChunks) {
// Have each byte as a single-byte chunk.
size_t len = strlen(unicode_utf8);
char buffer[arraysize(unicode_utf8) + 4];
for (size_t i = 1; i < len - 1; i++) {
// Copy source string into buffer, make a single-byte chunk at i.
strncpy(buffer, unicode_utf8, i);
strncpy(buffer + i + 3, unicode_utf8 + i + 1, len - i - 1);
buffer[i] = '\0';
buffer[i + 1] = unicode_utf8[i];
buffer[i + 2] = '\0';
buffer[len + 2] = '\0';
buffer[len + 3] = '\0';
const char* chunks[] = {buffer, buffer + i + 1, buffer + i + 3,
buffer + len + 3};
ChunkSource<char> chunk_source(chunks);
std::unique_ptr<i::ScannerStream> stream(v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
for (size_t j = 0; unicode_ucs2[j]; j++) {
CHECK_EQ(unicode_ucs2[j], stream->Advance());
}
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
}
}
#define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
void TestCharacterStream(const char* reference, i::ScannerStream* stream,
unsigned length, unsigned start, unsigned end) {
// Read streams one char at a time
unsigned i;
for (i = start; i < end; i++) {
CHECK_EQU(i, stream->pos());
CHECK_EQU(reference[i], stream->Advance());
}
CHECK_EQU(end, stream->pos());
CHECK_EQU(i::ScannerStream::kEndOfInput, stream->Advance());
CHECK_EQU(end + 1, stream->pos());
stream->Back();
// Pushback, re-read, pushback again.
while (i > end / 4) {
int32_t c0 = reference[i - 1];
CHECK_EQU(i, stream->pos());
stream->Back();
i--;
CHECK_EQU(i, stream->pos());
int32_t c1 = stream->Advance();
i++;
CHECK_EQU(i, stream->pos());
CHECK_EQ(c0, c1);
stream->Back();
i--;
CHECK_EQU(i, stream->pos());
}
// Seek + read streams one char at a time.
unsigned halfway = end / 2;
stream->Seek(stream->pos() + halfway - i);
for (i = halfway; i < end; i++) {
CHECK_EQU(i, stream->pos());
CHECK_EQU(reference[i], stream->Advance());
}
CHECK_EQU(i, stream->pos());
CHECK_LT(stream->Advance(), 0);
// Seek back, then seek beyond end of stream.
stream->Seek(start);
if (start < length) {
CHECK_EQU(stream->Advance(), reference[start]);
} else {
CHECK_LT(stream->Advance(), 0);
}
stream->Seek(length + 5);
CHECK_LT(stream->Advance(), 0);
}
#undef CHECK_EQU
void TestCharacterStreams(const char* one_byte_source, unsigned length,
unsigned start = 0, unsigned end = 0) {
if (end == 0) end = length;
i::Isolate* isolate = CcTest::i_isolate();
i::Factory* factory = isolate->factory();
// 2-byte external string
std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
static_cast<int>(length));
{
for (unsigned i = 0; i < length; i++) {
uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
}
TestExternalResource resource(uc16_buffer.get(), length);
i::Handle<i::String> uc16_string(
factory->NewExternalStringFromTwoByte(&resource).ToHandleChecked());
std::unique_ptr<i::ScannerStream> uc16_stream(
i::ScannerStream::For(isolate, uc16_string, start, end));
TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end);
// This avoids the GC from trying to free a stack allocated resource.
if (uc16_string->IsExternalString())
i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
->SetResource(isolate, nullptr);
}
// 1-byte external string
i::Vector<const uint8_t> one_byte_vector =
i::OneByteVector(one_byte_source, static_cast<int>(length));
i::Handle<i::String> one_byte_string =
factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
{
TestExternalOneByteResource one_byte_resource(one_byte_source, length);
i::Handle<i::String> ext_one_byte_string(
factory->NewExternalStringFromOneByte(&one_byte_resource)
.ToHandleChecked());
std::unique_ptr<i::ScannerStream> one_byte_stream(
i::ScannerStream::For(isolate, ext_one_byte_string, start, end));
TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start,
end);
// This avoids the GC from trying to free a stack allocated resource.
if (ext_one_byte_string->IsExternalString())
i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
->SetResource(isolate, nullptr);
}
// 1-byte generic i::String
{
std::unique_ptr<i::ScannerStream> string_stream(
i::ScannerStream::For(isolate, one_byte_string, start, end));
TestCharacterStream(one_byte_source, string_stream.get(), length, start,
end);
}
// 2-byte generic i::String
{
i::Handle<i::String> two_byte_string =
factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
std::unique_ptr<i::ScannerStream> two_byte_string_stream(
i::ScannerStream::For(isolate, two_byte_string, start, end));
TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length,
start, end);
}
// Streaming has no notion of start/end, so let's skip streaming tests for
// these cases.
if (start != 0 || end != length) return;
// 1-byte streaming stream, single + many chunks.
{
const char* data = reinterpret_cast<const char*>(one_byte_vector.begin());
const char* data_end = reinterpret_cast<const char*>(one_byte_vector.end());
ChunkSource<char> single_chunk(data, data_end - data, false);
std::unique_ptr<i::ScannerStream> one_byte_streaming_stream(
i::ScannerStream::For(&single_chunk,
v8::ScriptCompiler::StreamedSource::ONE_BYTE,
nullptr));
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
length, start, end);
ChunkSource<char> many_chunks(data, data_end - data, true);
one_byte_streaming_stream.reset(i::ScannerStream::For(
&many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE, nullptr));
TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
length, start, end);
}
// UTF-8 streaming stream, single + many chunks.
{
const char* data = reinterpret_cast<const char*>(one_byte_vector.begin());
const char* data_end = reinterpret_cast<const char*>(one_byte_vector.end());
ChunkSource<char> chunks(data, data_end - data, false);
std::unique_ptr<i::ScannerStream> utf8_streaming_stream(
i::ScannerStream::For(&chunks, v8::ScriptCompiler::StreamedSource::UTF8,
nullptr));
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
start, end);
ChunkSource<char> many_chunks(data, data_end - data, true);
utf8_streaming_stream.reset(i::ScannerStream::For(
&many_chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
start, end);
}
// 2-byte streaming stream, single + many chunks.
{
const uint16_t* data =
reinterpret_cast<const uint16_t*>(two_byte_vector.begin());
const uint16_t* data_end =
reinterpret_cast<const uint16_t*>(two_byte_vector.end());
ChunkSource<uint16_t> chunks(data, data_end - data, false);
std::unique_ptr<i::ScannerStream> two_byte_streaming_stream(
i::ScannerStream::For(
&chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
length, start, end);
ChunkSource<uint16_t> many_chunks(data, data_end - data, true);
two_byte_streaming_stream.reset(i::ScannerStream::For(
&many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
length, start, end);
}
}
TEST(CharacterStreams) {
v8::Isolate* isolate = CcTest::isolate();
v8::HandleScope handles(isolate);
v8::Local<v8::Context> context = v8::Context::New(isolate);
v8::Context::Scope context_scope(context);
TestCharacterStreams("abcdefghi", 9);
TestCharacterStreams("abc\0\n\r\x7f", 7);
TestCharacterStreams("\0", 1);
TestCharacterStreams("", 0);
// 4k large buffer.
char buffer[4096 + 1];
for (unsigned i = 0; i < arraysize(buffer); i++) {
buffer[i] = static_cast<char>(i & 0x7F);
}
buffer[arraysize(buffer) - 1] = '\0';
TestCharacterStreams(buffer, arraysize(buffer) - 1);
TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
}
// Regression test for crbug.com/651333. Read invalid utf-8.
TEST(Regress651333) {
const char bytes[] =
"A\xf1"
"ad"; // Anad, with n == n-with-tilde.
const uint16_t unicode[] = {65, 65533, 97, 100};
// Run the test for all sub-strings 0..N of bytes, to make sure we hit the
// error condition in and at chunk boundaries.
for (size_t len = 0; len < arraysize(bytes); len++) {
// Read len bytes from bytes, and compare against the expected unicode
// characters. Expect kBadChar ( == Unicode replacement char == code point
// 65533) instead of the incorrectly coded Latin1 char.
ChunkSource<char> chunks(bytes, len, false);
std::unique_ptr<i::ScannerStream> stream(i::ScannerStream::For(
&chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
for (size_t i = 0; i < len; i++) {
CHECK_EQ(unicode[i], stream->Advance());
}
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
}
}
void TestChunkStreamAgainstReference(
const char* cases[],
const std::vector<std::vector<uint16_t>>& unicode_expected) {
for (size_t c = 0; c < unicode_expected.size(); ++c) {
ChunkSource<char> chunk_source(cases[c]);
std::unique_ptr<i::ScannerStream> stream(i::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
for (size_t i = 0; i < unicode_expected[c].size(); i++) {
CHECK_EQ(unicode_expected[c][i], stream->Advance());
}
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
stream->Seek(0);
for (size_t i = 0; i < unicode_expected[c].size(); i++) {
CHECK_EQ(unicode_expected[c][i], stream->Advance());
}
CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
}
}
TEST(Regress6377) {
const char* cases[] = {
"\xf0\x90\0" // first chunk - start of 4-byte seq
"\x80\x80" // second chunk - end of 4-byte seq
"a\0", // and an 'a'
"\xe0\xbf\0" // first chunk - start of 3-byte seq
"\xbf" // second chunk - one-byte end of 3-byte seq
"a\0", // and an 'a'
"\xc3\0" // first chunk - start of 2-byte seq
"\xbf" // second chunk - end of 2-byte seq
"a\0", // and an 'a'
"\xf0\x90\x80\0" // first chunk - start of 4-byte seq
"\x80" // second chunk - one-byte end of 4-byte seq
"a\xc3\0" // and an 'a' + start of 2-byte seq
"\xbf\0", // third chunk - end of 2-byte seq
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0xD800, 0xDC00, 97}, {0xFFF, 97}, {0xFF, 97}, {0xD800, 0xDC00, 97, 0xFF},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestChunkStreamAgainstReference(cases, unicode_expected);
}
TEST(Regress6836) {
const char* cases[] = {
// 0xC2 is a lead byte, but there's no continuation. The bug occurs when
// this happens near the chunk end.
"X\xc2Y\0",
// Last chunk ends with a 2-byte char lead.
"X\xc2\0",
// Last chunk ends with a 3-byte char lead and only one continuation
// character.
"X\xe0\xbf\0",
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0x58, 0xFFFD, 0x59}, {0x58, 0xFFFD}, {0x58, 0xFFFD},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestChunkStreamAgainstReference(cases, unicode_expected);
}
TEST(TestOverlongAndInvalidSequences) {
const char* cases[] = {
// Overlong 2-byte sequence.
"X\xc0\xbfY\0",
// Another overlong 2-byte sequence.
"X\xc1\xbfY\0",
// Overlong 3-byte sequence.
"X\xe0\x9f\xbfY\0",
// Overlong 4-byte sequence.
"X\xf0\x89\xbf\xbfY\0",
// Invalid 3-byte sequence (reserved for surrogates).
"X\xed\xa0\x80Y\0",
// Invalid 4-bytes sequence (value out of range).
"X\xf4\x90\x80\x80Y\0",
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0x58, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
{0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestChunkStreamAgainstReference(cases, unicode_expected);
}
TEST(RelocatingCharacterStream) {
ManualGCScope manual_gc_scope;
CcTest::InitializeVM();
i::Isolate* i_isolate = CcTest::i_isolate();
v8::HandleScope scope(CcTest::isolate());
const char* string = "abcd";
int length = static_cast<int>(strlen(string));
std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
for (int i = 0; i < length; i++) {
uc16_buffer[i] = string[i];
}
i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(), length);
i::Handle<i::String> two_byte_string =
i_isolate->factory()
->NewStringFromTwoByte(two_byte_vector, i::NOT_TENURED)
.ToHandleChecked();
std::unique_ptr<i::ScannerStream> two_byte_string_stream(
i::ScannerStream::For(i_isolate, two_byte_string, 0, length));
CHECK_EQ('a', two_byte_string_stream->Advance());
CHECK_EQ('b', two_byte_string_stream->Advance());
CHECK_EQ(size_t{2}, two_byte_string_stream->pos());
i::String* raw = *two_byte_string;
i_isolate->heap()->CollectGarbage(i::NEW_SPACE,
i::GarbageCollectionReason::kUnknown);
// GC moved the string.
CHECK_NE(raw, *two_byte_string);
CHECK_EQ('c', two_byte_string_stream->Advance());
CHECK_EQ('d', two_byte_string_stream->Advance());
}