Fix bad-character handling in UTF-8 external streaming streams (Utf8ExternalStreamingStream). Also add a test.
R=jochen@chromium.org BUG=chromium:651333, v8:4947 Review-Url: https://codereview.chromium.org/2391273002 Cr-Commit-Position: refs/heads/master@{#40004}
This commit is contained in:
parent
186e7db8dd
commit
138127a608
@ -286,6 +286,20 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
|
||||
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
|
||||
DCHECK_EQ(cursor, buffer_end_);
|
||||
|
||||
// If the current chunk is the last (empty) chunk we'll have to process
|
||||
// any left-over, partial characters.
|
||||
if (chunk.length == 0) {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncrementalFinish(&current_.pos.incomplete_char);
|
||||
if (t != unibrow::Utf8::kBufferEmpty) {
|
||||
DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
|
||||
*cursor = static_cast<uc16>(t);
|
||||
buffer_end_++;
|
||||
current_.pos.chars++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static const unibrow::uchar kUtf8Bom = 0xfeff;
|
||||
|
||||
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
|
||||
@ -421,7 +435,7 @@ size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
|
||||
if (current_.chunk_no == chunks_.size()) {
|
||||
out_of_data = !FetchChunk();
|
||||
}
|
||||
if (!out_of_data) FillBufferFromCurrentChunk();
|
||||
FillBufferFromCurrentChunk();
|
||||
}
|
||||
|
||||
DCHECK_EQ(current_.pos.chars - position, buffer_end_ - buffer_cursor_);
|
||||
|
@ -333,25 +333,54 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
|
||||
*buffer = 0;
|
||||
return kBadChar;
|
||||
}
|
||||
} else {
|
||||
// We're inside of a character, as described by buffer.
|
||||
if (IsContinuationCharacter(next)) {
|
||||
// How many bytes (excluding this one) do we still expect?
|
||||
uint8_t count = (*buffer >> 24) - 1;
|
||||
// Update the value.
|
||||
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
|
||||
if (count) {
|
||||
*buffer = count << 24 | value;
|
||||
return kIncomplete;
|
||||
} else {
|
||||
*buffer = 0;
|
||||
return value;
|
||||
}
|
||||
} else if (*buffer <= 0xff) {
|
||||
// We have one unprocessed byte left (from the last else case in this if
|
||||
// statement).
|
||||
uchar previous = *buffer;
|
||||
*buffer = 0;
|
||||
uchar t = ValueOfIncremental(previous, buffer);
|
||||
if (t == kIncomplete) {
|
||||
// If we have an incomplete character, process both the previous and the
|
||||
// next byte at once.
|
||||
return ValueOfIncremental(next, buffer);
|
||||
} else {
|
||||
// Within a character, but not a continuation character? Bad char.
|
||||
*buffer = 0;
|
||||
return kBadChar;
|
||||
// Otherwise, process the previous byte and save the next byte for next
|
||||
// time.
|
||||
DCHECK_EQ(0, *buffer);
|
||||
*buffer = next;
|
||||
return t;
|
||||
}
|
||||
} else if (IsContinuationCharacter(next)) {
|
||||
// We're inside of a character, as described by buffer.
|
||||
|
||||
// How many bytes (excluding this one) do we still expect?
|
||||
uint8_t count = (*buffer >> 24) - 1;
|
||||
// Update the value.
|
||||
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
|
||||
if (count) {
|
||||
*buffer = count << 24 | value;
|
||||
return kIncomplete;
|
||||
} else {
|
||||
*buffer = 0;
|
||||
return value;
|
||||
}
|
||||
} else {
|
||||
// Within a character, but not a continuation character? Then the
|
||||
// previous char was a bad char. But we need to save the current
|
||||
// one.
|
||||
*buffer = next;
|
||||
return kBadChar;
|
||||
}
|
||||
}
|
||||
|
||||
// Drains whatever is still pending in |buffer| once the input has ended.
// Returns kBufferEmpty when nothing is pending. Otherwise returns the result
// of feeding a zero byte to ValueOfIncremental, with kIncomplete reported as
// kBadChar.
uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);

  // Nothing left over from earlier calls.
  if (*buffer == 0) return kBufferEmpty;

  // Push a zero byte through the decoder to process the left-over state. A
  // character that is still unfinished at the end of input is a bad char.
  const uchar flushed = ValueOfIncremental(0, buffer);
  if (flushed == kIncomplete) return kBadChar;
  return flushed;
}
|
||||
|
||||
|
@ -161,6 +161,7 @@ class Utf8 {
|
||||
typedef uint32_t Utf8IncrementalBuffer;
|
||||
static uchar ValueOfIncremental(byte next_byte,
|
||||
Utf8IncrementalBuffer* buffer);
|
||||
static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer);
|
||||
|
||||
// Excludes non-characters from the set of valid code points.
|
||||
static inline bool IsValidCharacter(uchar c);
|
||||
|
@ -423,3 +423,26 @@ TEST(CharacterStreams) {
|
||||
TestCharacterStreams(buffer, arraysize(buffer) - 1);
|
||||
TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
|
||||
}
|
||||
|
||||
// Regression test for crbug.com/651333. Read invalid utf-8.
|
||||
TEST(Regress651333) {
  // "Anad" where the 'n' is the Latin1 n-with-tilde byte (0xf1).
  const uint8_t bytes[] =
      "A\xf1"
      "ad";
  // Expected code units: the 0xf1 byte must come out as kBadChar (the Unicode
  // replacement character, code point 65533), not as the Latin1 character.
  const uint16_t unicode[] = {65, 65533, 97, 100};

  // Exercise every prefix of |bytes| so that the error condition is hit both
  // in the middle of the data and at chunk boundaries.
  const size_t byte_count = arraysize(bytes);
  for (size_t prefix = 0; prefix < byte_count; ++prefix) {
    ChunkSource chunks(bytes, prefix, false);
    std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
        &chunks, v8::ScriptCompiler::StreamedSource::UTF8));

    // The first |prefix| characters read must match the expected values...
    size_t index = 0;
    while (index < prefix) {
      CHECK_EQ(unicode[index], stream->Advance());
      ++index;
    }
    // ...and the stream must then report end of input.
    CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
  }
}
|
||||
|
Loading…
Reference in New Issue
Block a user