Update UTF-8 decoder to detect more special cases.
The blink version is stricter and for parsing it's important that both decoders behave the same. BUG=chromium:489944 R=vogelheim@chromium.org LOG=n Review URL: https://codereview.chromium.org/1148653007 Cr-Commit-Position: refs/heads/master@{#28601}
This commit is contained in:
parent
c52bb1f03a
commit
3d5b2f807b
139
src/unicode.cc
139
src/unicode.cc
@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table,
|
||||
}
|
||||
|
||||
|
||||
uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {
|
||||
// We only get called for non-ASCII characters.
|
||||
if (length == 1) {
|
||||
// Maps the lead byte |first| of a non-ASCII UTF-8 sequence to the total
// length of that sequence in bytes (2, 3 or 4). Returns 0 for every byte that
// cannot start such a sequence:
//   0x00-0x7F  ASCII characters,
//   0x80-0xBF  continuation bytes,
//   0xC0-0xC1  lead bytes of invalid overlong two-byte encodings,
//   0xF5-0xFF  lead bytes that could only encode code points outside of the
//              unicode range.
static inline size_t NonASCIISequenceLength(byte first) {
  if (first < 0xC2) return 0;  // ASCII, continuation bytes, overlong leads.
  if (first < 0xE0) return 2;  // 0xC2-0xDF: 30 two-byte sequences.
  if (first < 0xF0) return 3;  // 0xE0-0xEF: 16 three-byte sequences.
  if (first < 0xF5) return 4;  // 0xF0-0xF4: 5 four-byte sequences.
  return 0;                    // Beyond the unicode range.
}
|
||||
|
||||
|
||||
// Tells whether |chr| lies in the inclusive range 0x80..0xBF, i.e. whether it
// is a UTF-8 continuation byte.
static inline bool IsContinuationCharacter(byte chr) {
  const bool below_range = chr < 0x80;
  const bool above_range = chr > 0xBF;
  return !(below_range || above_range);
}
|
||||
|
||||
|
||||
// This method decodes a UTF-8 value according to RFC 3629.
|
||||
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
|
||||
size_t length = NonASCIISequenceLength(str[0]);
|
||||
if (length == 0 || max_length < length) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
byte first = str[0];
|
||||
byte second = str[1] ^ 0x80;
|
||||
if (second & 0xC0) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
if (first < 0xE0) {
|
||||
if (first < 0xC0) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
|
||||
if (code_point <= kMaxOneByteChar) {
|
||||
if (length == 2) {
|
||||
if (!IsContinuationCharacter(str[1])) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
*cursor += 2;
|
||||
return code_point;
|
||||
return ((str[0] << 6) + str[1]) - 0x00003080;
|
||||
}
|
||||
if (length == 2) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
byte third = str[2] ^ 0x80;
|
||||
if (third & 0xC0) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
if (first < 0xF0) {
|
||||
uchar code_point = ((((first << 6) | second) << 6) | third)
|
||||
& kMaxThreeByteChar;
|
||||
if (code_point <= kMaxTwoByteChar) {
|
||||
if (length == 3) {
|
||||
switch (str[0]) {
|
||||
case 0xE0:
|
||||
// Overlong three-byte sequence.
|
||||
if (str[1] < 0xA0 || str[1] > 0xBF) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
break;
|
||||
case 0xED:
|
||||
// High and low surrogate halves.
|
||||
if (str[1] < 0x80 || str[1] > 0x9F) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (!IsContinuationCharacter(str[1])) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
}
|
||||
if (!IsContinuationCharacter(str[2])) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
*cursor += 3;
|
||||
return code_point;
|
||||
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
|
||||
}
|
||||
if (length == 3) {
|
||||
DCHECK(length == 4);
|
||||
switch (str[0]) {
|
||||
case 0xF0:
|
||||
// Overlong four-byte sequence.
|
||||
if (str[1] < 0x90 || str[1] > 0xBF) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
break;
|
||||
case 0xF4:
|
||||
// Code points outside of the unicode range.
|
||||
if (str[1] < 0x80 || str[1] > 0x8F) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (!IsContinuationCharacter(str[1])) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
}
|
||||
if (!IsContinuationCharacter(str[2])) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
byte fourth = str[3] ^ 0x80;
|
||||
if (fourth & 0xC0) {
|
||||
if (!IsContinuationCharacter(str[3])) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
if (first < 0xF8) {
|
||||
uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)
|
||||
& kMaxFourByteChar;
|
||||
if (code_point <= kMaxThreeByteChar) {
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
}
|
||||
*cursor += 4;
|
||||
return code_point;
|
||||
}
|
||||
*cursor += 1;
|
||||
return kBadChar;
|
||||
*cursor += 4;
|
||||
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
|
||||
0x03C82080;
|
||||
}
|
||||
|
||||
|
||||
|
@ -7267,84 +7267,6 @@ static void Utf16Helper(
|
||||
}
|
||||
|
||||
|
||||
// Opens the internal string handle behind the API handle |str| and returns
// the result of calling Get(index) on it.
static uint16_t StringGet(Handle<String> str, int index) {
  return v8::Utils::OpenHandle(String::Cast(*str))->Get(index);
}
|
||||
|
||||
|
||||
// Exercises String::WriteUtf8 on the first |len| elements of the global array
// called |name|. The expected UTF-8 byte length of element i is read from
// element i of the global array called |lengths_name|.
//
// Each string is written with every buffer capacity from its expected length
// plus one down to zero, once with String::NO_OPTIONS (into |buffer|) and
// once with String::NO_NULL_TERMINATION (into |buffer2|); the two results are
// then cross-checked against each other and against the expected length.
static void WriteUtf8Helper(
    LocalContext& context,  // NOLINT
    const char* name,
    const char* lengths_name,
    int len) {
  Local<v8::Array> b =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
  Local<v8::Array> alens =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
  char buffer[1000];
  char buffer2[1000];
  for (int i = 0; i < len; i++) {
    Local<v8::String> string =
        Local<v8::String>::Cast(b->Get(i));
    Local<v8::Number> expected_len =
        Local<v8::Number>::Cast(alens->Get(i));
    int utf8_length = static_cast<int>(expected_len->Value());
    for (int j = utf8_length + 1; j >= 0; j--) {
      // Pre-fill both buffers with the marker byte 42 so that bytes left
      // untouched by WriteUtf8 can be recognized below.
      memset(buffer, 42, sizeof(buffer));
      memset(buffer2, 42, sizeof(buffer2));
      // Note: |nchars| is overwritten by the second call, so after both
      // writes it holds the count reported for the NO_NULL_TERMINATION write.
      int nchars;
      int utf8_written =
          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
      int utf8_written2 =
          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
      CHECK_GE(utf8_length + 1, utf8_written);
      CHECK_GE(utf8_length, utf8_written2);
      // Both variants must agree on every byte the second one produced.
      for (int k = 0; k < utf8_written2; k++) {
        CHECK_EQ(buffer[k], buffer2[k]);
      }
      CHECK(nchars * 3 >= utf8_written - 1);
      CHECK(nchars <= utf8_written);
      if (j == utf8_length + 1) {
        // With full capacity the only difference is the terminating byte.
        CHECK_EQ(utf8_written2, utf8_length);
        CHECK_EQ(utf8_written2 + 1, utf8_written);
      }
      // The byte right after the written range must still be the marker.
      CHECK_EQ(buffer[utf8_written], 42);
      if (j > utf8_length) {
        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
        Handle<String> roundtrip = v8_str(buffer);
        CHECK(roundtrip->Equals(string));
      } else {
        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
      }
      // Fixed: this check is guarded by |utf8_written2| and is meant to
      // inspect the NO_NULL_TERMINATION output, so it has to look at
      // |buffer2| (it previously re-checked |buffer[utf8_written - 1]|).
      if (utf8_written2 != 0) CHECK_NE(buffer2[utf8_written2 - 1], 42);
      if (nchars >= 2) {
        uint16_t trail = StringGet(string, nchars - 1);
        uint16_t lead = StringGet(string, nchars - 2);
        if (((lead & 0xfc00) == 0xd800) &&
            ((trail & 0xfc00) == 0xdc00)) {
          // The last two code units form a surrogate pair, so the last four
          // bytes written must be the four-byte encoding of the combined
          // code point |c|.
          unsigned u1 = buffer2[utf8_written2 - 4];
          unsigned u2 = buffer2[utf8_written2 - 3];
          unsigned u3 = buffer2[utf8_written2 - 2];
          unsigned u4 = buffer2[utf8_written2 - 1];
          CHECK_EQ((u1 & 0xf8), 0xf0u);
          CHECK_EQ((u2 & 0xc0), 0x80u);
          CHECK_EQ((u3 & 0xc0), 0x80u);
          CHECK_EQ((u4 & 0xc0), 0x80u);
          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
          // Fixed: the lead byte 11110xxx carries three payload bits.
          // |c >> 18| is 4 for code points from U+100000 upwards, which a
          // 0x3 mask can never match.
          CHECK_EQ((u1 & 0x7), c >> 18);
        }
      }
    }
  }
}
|
||||
|
||||
|
||||
THREADED_TEST(Utf16) {
|
||||
LocalContext context;
|
||||
v8::HandleScope scope(context->GetIsolate());
|
||||
@ -7391,9 +7313,6 @@ THREADED_TEST(Utf16) {
|
||||
"}");
|
||||
Utf16Helper(context, "a", "alens", 9);
|
||||
Utf16Helper(context, "a2", "a2lens", 81);
|
||||
WriteUtf8Helper(context, "b", "alens", 9);
|
||||
WriteUtf8Helper(context, "b2", "a2lens", 81);
|
||||
WriteUtf8Helper(context, "c2", "a2lens", 81);
|
||||
}
|
||||
|
||||
|
||||
@ -7403,15 +7322,6 @@ static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
|
||||
return *is1 == *is2;
|
||||
}
|
||||
|
||||
// Creates a string from each of the UTF-8 inputs |a| and |b| (in that order)
// with v8::String::kInternalizedString and checks, via SameSymbol, that the
// two results are the same symbol.
static void SameSymbolHelper(v8::Isolate* isolate, const char* a,
                             const char* b) {
  Handle<String> from_a =
      v8::String::NewFromUtf8(isolate, a, v8::String::kInternalizedString);
  Handle<String> from_b =
      v8::String::NewFromUtf8(isolate, b, v8::String::kInternalizedString);
  const bool identical = SameSymbol(from_a, from_b);
  CHECK(identical);
}
|
||||
|
||||
|
||||
THREADED_TEST(Utf16Symbol) {
|
||||
LocalContext context;
|
||||
@ -7423,18 +7333,6 @@ THREADED_TEST(Utf16Symbol) {
|
||||
context->GetIsolate(), "abc", v8::String::kInternalizedString);
|
||||
CHECK(SameSymbol(symbol1, symbol2));
|
||||
|
||||
SameSymbolHelper(context->GetIsolate(),
|
||||
"\360\220\220\205", // 4 byte encoding.
|
||||
"\355\240\201\355\260\205"); // 2 3-byte surrogates.
|
||||
SameSymbolHelper(context->GetIsolate(),
|
||||
"\355\240\201\355\260\206", // 2 3-byte surrogates.
|
||||
"\360\220\220\206"); // 4 byte encoding.
|
||||
SameSymbolHelper(context->GetIsolate(),
|
||||
"x\360\220\220\205", // 4 byte encoding.
|
||||
"x\355\240\201\355\260\205"); // 2 3-byte surrogates.
|
||||
SameSymbolHelper(context->GetIsolate(),
|
||||
"x\355\240\201\355\260\206", // 2 3-byte surrogates.
|
||||
"x\360\220\220\206"); // 4 byte encoding.
|
||||
CompileRun(
|
||||
"var sym0 = 'benedictus';"
|
||||
"var sym0b = 'S\303\270ren';"
|
||||
|
@ -699,18 +699,22 @@ TEST(Utf8CharacterStream) {
|
||||
char buffer[kAllUtf8CharsSizeU];
|
||||
unsigned cursor = 0;
|
||||
for (int i = 0; i <= kMaxUC16Char; i++) {
|
||||
cursor += unibrow::Utf8::Encode(buffer + cursor,
|
||||
i,
|
||||
unibrow::Utf16::kNoPreviousCharacter);
|
||||
cursor += unibrow::Utf8::Encode(buffer + cursor, i,
|
||||
unibrow::Utf16::kNoPreviousCharacter, true);
|
||||
}
|
||||
DCHECK(cursor == kAllUtf8CharsSizeU);
|
||||
|
||||
i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
|
||||
kAllUtf8CharsSizeU);
|
||||
int32_t bad = unibrow::Utf8::kBadChar;
|
||||
for (int i = 0; i <= kMaxUC16Char; i++) {
|
||||
CHECK_EQU(i, stream.pos());
|
||||
int32_t c = stream.Advance();
|
||||
CHECK_EQ(i, c);
|
||||
if (i >= 0xd800 && i <= 0xdfff) {
|
||||
CHECK_EQ(bad, c);
|
||||
} else {
|
||||
CHECK_EQ(i, c);
|
||||
}
|
||||
CHECK_EQU(i + 1, stream.pos());
|
||||
}
|
||||
for (int i = kMaxUC16Char; i >= 0; i--) {
|
||||
@ -724,7 +728,9 @@ TEST(Utf8CharacterStream) {
|
||||
int progress = static_cast<int>(stream.SeekForward(12));
|
||||
i += progress;
|
||||
int32_t c = stream.Advance();
|
||||
if (i <= kMaxUC16Char) {
|
||||
if (i >= 0xd800 && i <= 0xdfff) {
|
||||
CHECK_EQ(bad, c);
|
||||
} else if (i <= kMaxUC16Char) {
|
||||
CHECK_EQ(i, c);
|
||||
} else {
|
||||
CHECK_EQ(-1, c);
|
||||
@ -913,6 +919,15 @@ static int Utf8LengthHelper(const char* s) {
|
||||
// Record a single kBadChar for the first byte and continue.
|
||||
continue;
|
||||
}
|
||||
if (c == 0xed) {
|
||||
unsigned char d = s[i + 1];
|
||||
if ((d < 0x80) || (d > 0x9f)) {
|
||||
// This 3 byte sequence is part of a surrogate pair which is not
|
||||
// supported by UTF-8. Record a single kBadChar for the first byte
|
||||
// and continue.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
input_offset = 2;
|
||||
// 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
|
||||
output_adjust = 2;
|
||||
|
Loading…
Reference in New Issue
Block a user