Handle a UTF-8 BOM at the beginning of a UTF-8 stream.
(This should make it possible to drop the BOM handling in the Blink bindings.) R=marja@chromium.org BUG=v8:4947 Review-Url: https://codereview.chromium.org/2354973002 Cr-Commit-Position: refs/heads/master@{#39579}
This commit is contained in:
parent
a7455beba1
commit
a2b8b6e7db
@ -286,6 +286,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
|
||||
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
|
||||
DCHECK_EQ(cursor, buffer_end_);
|
||||
|
||||
static const unibrow::uchar kUtf8Bom = 0xfeff;
|
||||
|
||||
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
|
||||
current_.pos.incomplete_char;
|
||||
size_t it;
|
||||
@ -294,7 +296,11 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
|
||||
if (t == unibrow::Utf8::kIncomplete) continue;
|
||||
if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
||||
if (V8_LIKELY(t < kUtf8Bom)) {
|
||||
*(cursor++) = static_cast<uc16>(t); // The by most frequent case.
|
||||
} else if (t == kUtf8Bom && current_.pos.bytes + it == 2) {
|
||||
// BOM detected at beginning of the stream. Don't copy it.
|
||||
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
||||
*(cursor++) = static_cast<uc16>(t);
|
||||
} else {
|
||||
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
|
||||
|
@ -109,6 +109,67 @@ TEST(Utf8StreamAsciiOnly) {
|
||||
} while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
|
||||
}
|
||||
|
||||
TEST(Utf8StreamBOM) {
  // Build the input: the three-byte UTF-8 byte order mark (EF BB BF)
  // immediately followed by the shared unicode_utf8 test data.
  char bom_prefixed[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
  strncpy(bom_prefixed + 3, unicode_utf8, arraysize(unicode_utf8));

  // Feed everything to the stream as one chunk, followed by an empty chunk.
  const char* chunk_list[] = {bom_prefixed, "\0"};
  ChunkSource source(chunk_list);
  std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
      v8::internal::ScannerStream::For(
          &source, v8::ScriptCompiler::StreamedSource::UTF8));

  // The stream has to produce exactly the expected UCS-2 sequence; the BOM
  // must not appear as a character in the output.
  size_t index = 0;
  while (unicode_ucs2[index]) {
    CHECK_EQ(unicode_ucs2[index], stream->Advance());
    ++index;
  }
  CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());

  // Seeking must still land on the expected characters: position 0 is the
  // first character of the expected data, not the BOM.
  stream->Seek(0);
  CHECK_EQ(unicode_ucs2[0], stream->Advance());

  stream->Seek(5);
  CHECK_EQ(unicode_ucs2[5], stream->Advance());
}
|
||||
|
||||
TEST(Utf8SplitBOM) {
  // Fixtures: pieces of the UTF-8 BOM (EF BB BF) that get distributed over
  // several chunks, plus a chunk holding the final BOM byte followed by the
  // shared unicode_utf8 test data.
  char bom_first_two_bytes[] = "\xef\xbb";
  char bom_first_byte[] = "\xef";
  char bom_second_byte[] = "\xbb";
  char bom_tail_and_text[1 + arraysize(unicode_utf8)] = {"\xbf"};
  strncpy(bom_tail_and_text + 1, unicode_utf8, arraysize(unicode_utf8));

  // Case 1: the BOM is split across two chunks (2 bytes + 1 byte).
  {
    const char* chunk_list[] = {bom_first_two_bytes, bom_tail_and_text, "\0"};
    ChunkSource source(chunk_list);
    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
        v8::internal::ScannerStream::For(
            &source, v8::ScriptCompiler::StreamedSource::UTF8));

    // The output must match the expected data, with no BOM character in it.
    size_t index = 0;
    while (unicode_ucs2[index]) {
      CHECK_EQ(unicode_ucs2[index], stream->Advance());
      ++index;
    }
  }

  // Case 2: every BOM byte arrives in a chunk of its own (the last one shares
  // its chunk with the actual text).
  {
    const char* chunk_list[] = {bom_first_byte, bom_second_byte,
                                bom_tail_and_text, "\0"};
    ChunkSource source(chunk_list);
    std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
        v8::internal::ScannerStream::For(
            &source, v8::ScriptCompiler::StreamedSource::UTF8));

    // The output must match the expected data, with no BOM character in it.
    size_t index = 0;
    while (unicode_ucs2[index]) {
      CHECK_EQ(unicode_ucs2[index], stream->Advance());
      ++index;
    }
  }
}
|
||||
|
||||
TEST(Utf8ChunkBoundaries) {
|
||||
// Test utf-8 parsing at chunk boundaries.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user