[scanner] UTF-8 handling fix (errors near chunk end).
The bug occurred when we detected an erroneous character late and put the last character of a chunk into the "incomplete char" buffer; that character was then not correctly retrieved when seeking.

BUG=v8:6836
Change-Id: I8ca946dfdb39244c5ca0bdcebe047047010b3a07
Reviewed-on: https://chromium-review.googlesource.com/670729
Commit-Queue: Marja Hölttä <marja@chromium.org>
Reviewed-by: Camillo Bruni <cbruni@chromium.org>
Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org>
Cr-Commit-Position: refs/heads/master@{#48066}
This commit is contained in:
parent
5114f14cc1
commit
68310c9f69
@ -443,7 +443,9 @@ size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
|
||||
|
||||
SearchPosition(position);
|
||||
bool out_of_data = current_.chunk_no != chunks_.size() &&
|
||||
chunks_[current_.chunk_no].length == 0;
|
||||
chunks_[current_.chunk_no].length == 0 &&
|
||||
current_.pos.incomplete_char == 0;
|
||||
|
||||
if (out_of_data) return 0;
|
||||
|
||||
// Fill the buffer, until we have at least one char (or are out of data).
|
||||
|
@ -461,6 +461,25 @@ TEST(Regress651333) {
|
||||
}
|
||||
}
|
||||
|
||||
void TestChunkStreamAgainstReference(
|
||||
const char* cases[],
|
||||
const std::vector<std::vector<uint16_t>>& unicode_expected) {
|
||||
for (size_t c = 0; c < unicode_expected.size(); ++c) {
|
||||
ChunkSource chunk_source(cases[c]);
|
||||
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
|
||||
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
|
||||
for (size_t i = 0; i < unicode_expected[c].size(); i++) {
|
||||
CHECK_EQ(unicode_expected[c][i], stream->Advance());
|
||||
}
|
||||
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
|
||||
stream->Seek(0);
|
||||
for (size_t i = 0; i < unicode_expected[c].size(); i++) {
|
||||
CHECK_EQ(unicode_expected[c][i], stream->Advance());
|
||||
}
|
||||
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
|
||||
}
|
||||
}
|
||||
|
||||
TEST(Regress6377) {
|
||||
const char* cases[] = {
|
||||
"\xf0\x90\0" // first chunk - start of 4-byte seq
|
||||
@ -480,22 +499,27 @@ TEST(Regress6377) {
|
||||
"a\xc3\0" // and an 'a' + start of 2-byte seq
|
||||
"\xbf\0", // third chunk - end of 2-byte seq
|
||||
};
|
||||
const std::vector<std::vector<uint16_t>> unicode = {
|
||||
const std::vector<std::vector<uint16_t>> unicode_expected = {
|
||||
{0xd800, 0xdc00, 97}, {0xfff, 97}, {0xff, 97}, {0xd800, 0xdc00, 97, 0xff},
|
||||
};
|
||||
CHECK_EQ(unicode.size(), sizeof(cases) / sizeof(cases[0]));
|
||||
for (size_t c = 0; c < unicode.size(); ++c) {
|
||||
ChunkSource chunk_source(cases[c]);
|
||||
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
|
||||
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
|
||||
for (size_t i = 0; i < unicode[c].size(); i++) {
|
||||
CHECK_EQ(unicode[c][i], stream->Advance());
|
||||
}
|
||||
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
|
||||
stream->Seek(0);
|
||||
for (size_t i = 0; i < unicode[c].size(); i++) {
|
||||
CHECK_EQ(unicode[c][i], stream->Advance());
|
||||
}
|
||||
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
|
||||
}
|
||||
CHECK_EQ(unicode_expected.size(), arraysize(cases));
|
||||
TestChunkStreamAgainstReference(cases, unicode_expected);
|
||||
}
|
||||
|
||||
TEST(Regress6836) {
|
||||
const char* cases[] = {
|
||||
// 0xc2 is a lead byte, but there's no continuation. The bug occurs when
|
||||
// this happens near the chunk end.
|
||||
"X\xc2Y\0",
|
||||
// Last chunk ends with a 2-byte char lead.
|
||||
"X\xc2\0",
|
||||
// Last chunk ends with a 3-byte char lead and only one continuation
|
||||
// character.
|
||||
"X\xe0\xbf\0",
|
||||
};
|
||||
const std::vector<std::vector<uint16_t>> unicode_expected = {
|
||||
{0x58, 0xfffd, 0x59}, {0x58, 0xfffd}, {0x58, 0xfffd},
|
||||
};
|
||||
CHECK_EQ(unicode_expected.size(), arraysize(cases));
|
||||
TestChunkStreamAgainstReference(cases, unicode_expected);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user