Script streaming: more UTF-8 handling fixes (again).
1) Since we fill the output buffer both from the chunks and the conversion buffer, it's possible that we run out of space and call CopyCharsHelper with 0 length. The underlying functions don't handle that gracefully, so check for it there. 2) There was a bug where we tried to copy too many bytes from the beginning of the data chunk into the conversion buffer. Continuation bytes in UTF-8 are of the form 0b10XXXXXX. A byte of the form 0b11XXXXXX is the first byte of a new UTF-8 character, so we should stop copying there instead of appending it to the split character. These two together (or maybe in combination with surrogates) are a probable reason for crbug.com/420932. 3) The test data was off; \uc481 is \xec\x92\x81. BUG=420932 LOG=N R=yangguo@chromium.org Review URL: https://codereview.chromium.org/662003003 git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@24725 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
7cfd0ed4d6
commit
394af55a8c
@ -18,6 +18,10 @@ namespace {
|
||||
unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
|
||||
unsigned* src_pos, unsigned src_length,
|
||||
ScriptCompiler::StreamedSource::Encoding encoding) {
|
||||
// It's possible that this will be called with length 0, but don't assume that
|
||||
// the functions this calls handle it gracefully.
|
||||
if (length == 0) return 0;
|
||||
|
||||
if (encoding == ScriptCompiler::StreamedSource::UTF8) {
|
||||
return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
|
||||
dest, length, src, src_pos, src_length);
|
||||
@ -381,15 +385,22 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
|
||||
|
||||
void ExternalStreamingStream::HandleUtf8SplitCharacters(
|
||||
unsigned* data_in_buffer) {
|
||||
// Note the following property of UTF-8 which makes this function possible:
|
||||
// Given any byte, we can always read its local environment (in both
|
||||
// directions) to find out the (possibly multi-byte) character it belongs
|
||||
// to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a
|
||||
// multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or
|
||||
// 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.
|
||||
|
||||
// First check if we have leftover data from the last chunk.
|
||||
unibrow::uchar c;
|
||||
if (utf8_split_char_buffer_length_ > 0) {
|
||||
// Move the bytes which are part of the split character (which started in
|
||||
// the previous chunk) into utf8_split_char_buffer_.
|
||||
// the previous chunk) into utf8_split_char_buffer_. Note that the
|
||||
// continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
|
||||
while (current_data_offset_ < current_data_length_ &&
|
||||
utf8_split_char_buffer_length_ < 4 &&
|
||||
(c = current_data_[current_data_offset_]) >
|
||||
unibrow::Utf8::kMaxOneByteChar) {
|
||||
(c = current_data_[current_data_offset_]) >> 6 == 2) {
|
||||
utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
|
||||
++utf8_split_char_buffer_length_;
|
||||
++current_data_offset_;
|
||||
|
@ -23636,14 +23636,14 @@ TEST(StreamingScriptWithParseError) {
|
||||
|
||||
|
||||
TEST(StreamingUtf8Script) {
|
||||
// We'd want to write \uc481 instead of \xeb\x91\x80, but Windows compilers
|
||||
// We'd want to write \uc481 instead of \xec\x92\x81, but Windows compilers
|
||||
// don't like it.
|
||||
const char* chunk1 =
|
||||
"function foo() {\n"
|
||||
" // This function will contain an UTF-8 character which is not in\n"
|
||||
" // ASCII.\n"
|
||||
" var foob\xeb\x91\x80r = 13;\n"
|
||||
" return foob\xeb\x91\x80r;\n"
|
||||
" var foob\xec\x92\x81r = 13;\n"
|
||||
" return foob\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
const char* chunks[] = {chunk1, "foo(); ", NULL};
|
||||
RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
|
||||
@ -23654,7 +23654,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersSanityCheck) {
|
||||
// A sanity check to prove that the approach of splitting UTF-8
|
||||
// characters is correct. Here is an UTF-8 character which will take three
|
||||
// bytes.
|
||||
const char* reference = "\xeb\x91\x80";
|
||||
const char* reference = "\xec\x92\x81";
|
||||
CHECK(3u == strlen(reference)); // NOLINT - no CHECK_EQ for unsigned.
|
||||
|
||||
char chunk1[] =
|
||||
@ -23664,7 +23664,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersSanityCheck) {
|
||||
" var foob";
|
||||
char chunk2[] =
|
||||
"XXXr = 13;\n"
|
||||
" return foob\xeb\x91\x80r;\n"
|
||||
" return foob\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
chunk2[i] = reference[i];
|
||||
@ -23677,7 +23677,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersSanityCheck) {
|
||||
TEST(StreamingUtf8ScriptWithSplitCharacters) {
|
||||
// Stream data where a multi-byte UTF-8 character is split between two data
|
||||
// chunks.
|
||||
const char* reference = "\xeb\x91\x80";
|
||||
const char* reference = "\xec\x92\x81";
|
||||
char chunk1[] =
|
||||
"function foo() {\n"
|
||||
" // This function will contain an UTF-8 character which is not in\n"
|
||||
@ -23685,7 +23685,7 @@ TEST(StreamingUtf8ScriptWithSplitCharacters) {
|
||||
" var foobX";
|
||||
char chunk2[] =
|
||||
"XXr = 13;\n"
|
||||
" return foob\xeb\x91\x80r;\n"
|
||||
" return foob\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
chunk1[strlen(chunk1) - 1] = reference[0];
|
||||
chunk2[0] = reference[1];
|
||||
@ -23701,7 +23701,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {
|
||||
// Case 1: a chunk contains only bytes for a split character (and no other
|
||||
// data). This kind of a chunk would be exceptionally small, but we should
|
||||
// still decode it correctly.
|
||||
const char* reference = "\xeb\x91\x80";
|
||||
const char* reference = "\xec\x92\x81";
|
||||
// The small chunk is at the beginning of the split character
|
||||
{
|
||||
char chunk1[] =
|
||||
@ -23712,7 +23712,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {
|
||||
char chunk2[] = "XX";
|
||||
char chunk3[] =
|
||||
"Xr = 13;\n"
|
||||
" return foob\xeb\x91\x80r;\n"
|
||||
" return foob\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
chunk2[0] = reference[0];
|
||||
chunk2[1] = reference[1];
|
||||
@ -23730,7 +23730,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {
|
||||
char chunk2[] = "XX";
|
||||
char chunk3[] =
|
||||
"r = 13;\n"
|
||||
" return foob\xeb\x91\x80r;\n"
|
||||
" return foob\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
chunk1[strlen(chunk1) - 1] = reference[0];
|
||||
chunk2[0] = reference[1];
|
||||
@ -23742,8 +23742,8 @@ TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {
|
||||
// decoded correctly and not just ignored.
|
||||
{
|
||||
char chunk1[] =
|
||||
"var foob\xeb\x91\x80 = 13;\n"
|
||||
"foob\xeb\x91\x80";
|
||||
"var foob\xec\x92\x81 = 13;\n"
|
||||
"foob\xec\x92\x81";
|
||||
const char* chunks[] = {chunk1, NULL};
|
||||
RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
|
||||
}
|
||||
@ -23754,7 +23754,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersInvalidEdgeCases) {
|
||||
// Test cases where a UTF-8 character is split over several chunks. Those
|
||||
// cases are not supported (the embedder should give the data in big enough
|
||||
// chunks), but we shouldn't crash, just produce a parse error.
|
||||
const char* reference = "\xeb\x91\x80";
|
||||
const char* reference = "\xec\x92\x81";
|
||||
char chunk1[] =
|
||||
"function foo() {\n"
|
||||
" // This function will contain an UTF-8 character which is not in\n"
|
||||
@ -23763,7 +23763,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersInvalidEdgeCases) {
|
||||
char chunk2[] = "X";
|
||||
char chunk3[] =
|
||||
"Xr = 13;\n"
|
||||
" return foob\xeb\x91\x80r;\n"
|
||||
" return foob\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
chunk1[strlen(chunk1) - 1] = reference[0];
|
||||
chunk2[0] = reference[1];
|
||||
@ -23805,7 +23805,7 @@ TEST(StreamingProducesParserCache) {
|
||||
TEST(StreamingScriptWithInvalidUtf8) {
|
||||
// Regression test for a crash: test that invalid UTF-8 bytes in the end of a
|
||||
// chunk don't produce a crash.
|
||||
const char* reference = "\xeb\x91\x80\x80\x80";
|
||||
const char* reference = "\xec\x92\x81\x80\x80";
|
||||
char chunk1[] =
|
||||
"function foo() {\n"
|
||||
" // This function will contain an UTF-8 character which is not in\n"
|
||||
@ -23813,7 +23813,7 @@ TEST(StreamingScriptWithInvalidUtf8) {
|
||||
" var foobXXXXX"; // Too many bytes which look like incomplete chars!
|
||||
char chunk2[] =
|
||||
"r = 13;\n"
|
||||
" return foob\xeb\x91\x80\x80\x80r;\n"
|
||||
" return foob\xec\x92\x81\x80\x80r;\n"
|
||||
"}\n";
|
||||
for (int i = 0; i < 5; ++i) chunk1[strlen(chunk1) - 5 + i] = reference[i];
|
||||
|
||||
@ -23825,15 +23825,36 @@ TEST(StreamingScriptWithInvalidUtf8) {
|
||||
TEST(StreamingUtf8ScriptWithMultipleMultibyteCharactersSomeSplit) {
|
||||
// Regression test: Stream data where there are several multi-byte UTF-8
|
||||
// characters in a sequence and one of them is split between two data chunks.
|
||||
const char* reference = "\xeb\x91\x80";
|
||||
const char* reference = "\xec\x92\x81";
|
||||
char chunk1[] =
|
||||
"function foo() {\n"
|
||||
" // This function will contain an UTF-8 character which is not in\n"
|
||||
" // ASCII.\n"
|
||||
" var foob\xeb\x91\x80X";
|
||||
" var foob\xec\x92\x81X";
|
||||
char chunk2[] =
|
||||
"XXr = 13;\n"
|
||||
" return foob\xeb\x91\x80\xeb\x91\x80r;\n"
|
||||
" return foob\xec\x92\x81\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
chunk1[strlen(chunk1) - 1] = reference[0];
|
||||
chunk2[0] = reference[1];
|
||||
chunk2[1] = reference[2];
|
||||
const char* chunks[] = {chunk1, chunk2, "foo();", NULL};
|
||||
RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
|
||||
}
|
||||
|
||||
|
||||
TEST(StreamingUtf8ScriptWithMultipleMultibyteCharactersSomeSplit2) {
|
||||
// Another regression test, similar to the previous one. The difference is
|
||||
// that the split character is not the last one in the sequence.
|
||||
const char* reference = "\xec\x92\x81";
|
||||
char chunk1[] =
|
||||
"function foo() {\n"
|
||||
" // This function will contain an UTF-8 character which is not in\n"
|
||||
" // ASCII.\n"
|
||||
" var foobX";
|
||||
char chunk2[] =
|
||||
"XX\xec\x92\x81r = 13;\n"
|
||||
" return foob\xec\x92\x81\xec\x92\x81r;\n"
|
||||
"}\n";
|
||||
chunk1[strlen(chunk1) - 1] = reference[0];
|
||||
chunk2[0] = reference[1];
|
||||
|
Loading…
Reference in New Issue
Block a user