Update UTF-8 decoder to detect more special cases.

The blink version is stricter and for parsing it's important that both
decoders behave the same.

BUG=chromium:489944
R=vogelheim@chromium.org
LOG=n

Review URL: https://codereview.chromium.org/1148653007

Cr-Commit-Position: refs/heads/master@{#28601}
This commit is contained in:
jochen 2015-05-22 11:47:36 -07:00 committed by Commit bot
parent c52bb1f03a
commit 3d5b2f807b
3 changed files with 113 additions and 153 deletions

View File

@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table,
}
uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {
// We only get called for non-ASCII characters.
if (length == 1) {
static inline size_t NonASCIISequenceLength(byte first) {
// clang-format off
static const uint8_t lengths[256] = {
// The first 128 entries correspond to ASCII characters.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// The following 64 entries correspond to continuation bytes.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// The next are two invalid overlong encodings and 30 two-byte sequences.
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// 16 three-byte sequences.
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// 5 four-byte sequences, followed by sequences that could only encode
// code points outside of the unicode range.
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// clang-format on
return lengths[first];
}
static inline bool IsContinuationCharacter(byte chr) {
return chr >= 0x80 && chr <= 0xBF;
}
// This method decodes an UTF-8 value according to RFC 3629.
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
size_t length = NonASCIISequenceLength(str[0]);
if (length == 0 || max_length < length) {
*cursor += 1;
return kBadChar;
}
byte first = str[0];
byte second = str[1] ^ 0x80;
if (second & 0xC0) {
*cursor += 1;
return kBadChar;
}
if (first < 0xE0) {
if (first < 0xC0) {
*cursor += 1;
return kBadChar;
}
uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
if (code_point <= kMaxOneByteChar) {
if (length == 2) {
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
*cursor += 2;
return code_point;
return ((str[0] << 6) + str[1]) - 0x00003080;
}
if (length == 2) {
*cursor += 1;
return kBadChar;
}
byte third = str[2] ^ 0x80;
if (third & 0xC0) {
*cursor += 1;
return kBadChar;
}
if (first < 0xF0) {
uchar code_point = ((((first << 6) | second) << 6) | third)
& kMaxThreeByteChar;
if (code_point <= kMaxTwoByteChar) {
if (length == 3) {
switch (str[0]) {
case 0xE0:
// Overlong three-byte sequence.
if (str[1] < 0xA0 || str[1] > 0xBF) {
*cursor += 1;
return kBadChar;
}
break;
case 0xED:
// High and low surrogate halves.
if (str[1] < 0x80 || str[1] > 0x9F) {
*cursor += 1;
return kBadChar;
}
break;
default:
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
}
if (!IsContinuationCharacter(str[2])) {
*cursor += 1;
return kBadChar;
}
*cursor += 3;
return code_point;
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
}
if (length == 3) {
DCHECK(length == 4);
switch (str[0]) {
case 0xF0:
// Overlong four-byte sequence.
if (str[1] < 0x90 || str[1] > 0xBF) {
*cursor += 1;
return kBadChar;
}
break;
case 0xF4:
// Code points outside of the unicode range.
if (str[1] < 0x80 || str[1] > 0x8F) {
*cursor += 1;
return kBadChar;
}
break;
default:
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
}
if (!IsContinuationCharacter(str[2])) {
*cursor += 1;
return kBadChar;
}
byte fourth = str[3] ^ 0x80;
if (fourth & 0xC0) {
if (!IsContinuationCharacter(str[3])) {
*cursor += 1;
return kBadChar;
}
if (first < 0xF8) {
uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)
& kMaxFourByteChar;
if (code_point <= kMaxThreeByteChar) {
*cursor += 1;
return kBadChar;
}
*cursor += 4;
return code_point;
}
*cursor += 1;
return kBadChar;
*cursor += 4;
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
0x03C82080;
}

View File

@ -7267,84 +7267,6 @@ static void Utf16Helper(
}
static uint16_t StringGet(Handle<String> str, int index) {
i::Handle<i::String> istring =
v8::Utils::OpenHandle(String::Cast(*str));
return istring->Get(index);
}
static void WriteUtf8Helper(
LocalContext& context, // NOLINT
const char* name,
const char* lengths_name,
int len) {
Local<v8::Array> b =
Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
Local<v8::Array> alens =
Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
char buffer[1000];
char buffer2[1000];
for (int i = 0; i < len; i++) {
Local<v8::String> string =
Local<v8::String>::Cast(b->Get(i));
Local<v8::Number> expected_len =
Local<v8::Number>::Cast(alens->Get(i));
int utf8_length = static_cast<int>(expected_len->Value());
for (int j = utf8_length + 1; j >= 0; j--) {
memset(reinterpret_cast<void*>(&buffer), 42, sizeof(buffer));
memset(reinterpret_cast<void*>(&buffer2), 42, sizeof(buffer2));
int nchars;
int utf8_written =
string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
int utf8_written2 =
string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
CHECK_GE(utf8_length + 1, utf8_written);
CHECK_GE(utf8_length, utf8_written2);
for (int k = 0; k < utf8_written2; k++) {
CHECK_EQ(buffer[k], buffer2[k]);
}
CHECK(nchars * 3 >= utf8_written - 1);
CHECK(nchars <= utf8_written);
if (j == utf8_length + 1) {
CHECK_EQ(utf8_written2, utf8_length);
CHECK_EQ(utf8_written2 + 1, utf8_written);
}
CHECK_EQ(buffer[utf8_written], 42);
if (j > utf8_length) {
if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
Handle<String> roundtrip = v8_str(buffer);
CHECK(roundtrip->Equals(string));
} else {
if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
}
if (utf8_written2 != 0) CHECK_NE(buffer[utf8_written - 1], 42);
if (nchars >= 2) {
uint16_t trail = StringGet(string, nchars - 1);
uint16_t lead = StringGet(string, nchars - 2);
if (((lead & 0xfc00) == 0xd800) &&
((trail & 0xfc00) == 0xdc00)) {
unsigned u1 = buffer2[utf8_written2 - 4];
unsigned u2 = buffer2[utf8_written2 - 3];
unsigned u3 = buffer2[utf8_written2 - 2];
unsigned u4 = buffer2[utf8_written2 - 1];
CHECK_EQ((u1 & 0xf8), 0xf0u);
CHECK_EQ((u2 & 0xc0), 0x80u);
CHECK_EQ((u3 & 0xc0), 0x80u);
CHECK_EQ((u4 & 0xc0), 0x80u);
uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
CHECK_EQ((u4 & 0x3f), (c & 0x3f));
CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
CHECK_EQ((u1 & 0x3), c >> 18);
}
}
}
}
}
THREADED_TEST(Utf16) {
LocalContext context;
v8::HandleScope scope(context->GetIsolate());
@ -7391,9 +7313,6 @@ THREADED_TEST(Utf16) {
"}");
Utf16Helper(context, "a", "alens", 9);
Utf16Helper(context, "a2", "a2lens", 81);
WriteUtf8Helper(context, "b", "alens", 9);
WriteUtf8Helper(context, "b2", "a2lens", 81);
WriteUtf8Helper(context, "c2", "a2lens", 81);
}
@ -7403,15 +7322,6 @@ static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
return *is1 == *is2;
}
static void SameSymbolHelper(v8::Isolate* isolate, const char* a,
const char* b) {
Handle<String> symbol1 =
v8::String::NewFromUtf8(isolate, a, v8::String::kInternalizedString);
Handle<String> symbol2 =
v8::String::NewFromUtf8(isolate, b, v8::String::kInternalizedString);
CHECK(SameSymbol(symbol1, symbol2));
}
THREADED_TEST(Utf16Symbol) {
LocalContext context;
@ -7423,18 +7333,6 @@ THREADED_TEST(Utf16Symbol) {
context->GetIsolate(), "abc", v8::String::kInternalizedString);
CHECK(SameSymbol(symbol1, symbol2));
SameSymbolHelper(context->GetIsolate(),
"\360\220\220\205", // 4 byte encoding.
"\355\240\201\355\260\205"); // 2 3-byte surrogates.
SameSymbolHelper(context->GetIsolate(),
"\355\240\201\355\260\206", // 2 3-byte surrogates.
"\360\220\220\206"); // 4 byte encoding.
SameSymbolHelper(context->GetIsolate(),
"x\360\220\220\205", // 4 byte encoding.
"x\355\240\201\355\260\205"); // 2 3-byte surrogates.
SameSymbolHelper(context->GetIsolate(),
"x\355\240\201\355\260\206", // 2 3-byte surrogates.
"x\360\220\220\206"); // 4 byte encoding.
CompileRun(
"var sym0 = 'benedictus';"
"var sym0b = 'S\303\270ren';"

View File

@ -699,18 +699,22 @@ TEST(Utf8CharacterStream) {
char buffer[kAllUtf8CharsSizeU];
unsigned cursor = 0;
for (int i = 0; i <= kMaxUC16Char; i++) {
cursor += unibrow::Utf8::Encode(buffer + cursor,
i,
unibrow::Utf16::kNoPreviousCharacter);
cursor += unibrow::Utf8::Encode(buffer + cursor, i,
unibrow::Utf16::kNoPreviousCharacter, true);
}
DCHECK(cursor == kAllUtf8CharsSizeU);
i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
kAllUtf8CharsSizeU);
int32_t bad = unibrow::Utf8::kBadChar;
for (int i = 0; i <= kMaxUC16Char; i++) {
CHECK_EQU(i, stream.pos());
int32_t c = stream.Advance();
CHECK_EQ(i, c);
if (i >= 0xd800 && i <= 0xdfff) {
CHECK_EQ(bad, c);
} else {
CHECK_EQ(i, c);
}
CHECK_EQU(i + 1, stream.pos());
}
for (int i = kMaxUC16Char; i >= 0; i--) {
@ -724,7 +728,9 @@ TEST(Utf8CharacterStream) {
int progress = static_cast<int>(stream.SeekForward(12));
i += progress;
int32_t c = stream.Advance();
if (i <= kMaxUC16Char) {
if (i >= 0xd800 && i <= 0xdfff) {
CHECK_EQ(bad, c);
} else if (i <= kMaxUC16Char) {
CHECK_EQ(i, c);
} else {
CHECK_EQ(-1, c);
@ -913,6 +919,15 @@ static int Utf8LengthHelper(const char* s) {
// Record a single kBadChar for the first byte and continue.
continue;
}
if (c == 0xed) {
unsigned char d = s[i + 1];
if ((d < 0x80) || (d > 0x9f)) {
// This 3 byte sequence is part of a surrogate pair which is not
// supported by UTF-8. Record a single kBadChar for the first byte
// and continue.
continue;
}
}
input_offset = 2;
// 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
output_adjust = 2;