diff --git a/src/unicode.cc b/src/unicode.cc index 0d0d63d177..df45697bde 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table, } -uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { - // We only get called for non-ASCII characters. - if (length == 1) { +static inline size_t NonASCIISequenceLength(byte first) { + // clang-format off + static const uint8_t lengths[256] = { + // The first 128 entries correspond to ASCII characters. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // The following 64 entries correspond to continuation bytes. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // The next are two invalid overlong encodings and 30 two-byte sequences. + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + // 16 three-byte sequences. + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + // 5 four-byte sequences, followed by sequences that could only encode + // code points outside of the unicode range. + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + // clang-format on + return lengths[first]; +} + + +static inline bool IsContinuationCharacter(byte chr) { + return chr >= 0x80 && chr <= 0xBF; +} + + +// This method decodes a UTF-8 value according to RFC 3629.
+uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { + size_t length = NonASCIISequenceLength(str[0]); + if (length == 0 || max_length < length) { *cursor += 1; return kBadChar; } - byte first = str[0]; - byte second = str[1] ^ 0x80; - if (second & 0xC0) { - *cursor += 1; - return kBadChar; - } - if (first < 0xE0) { - if (first < 0xC0) { - *cursor += 1; - return kBadChar; - } - uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; - if (code_point <= kMaxOneByteChar) { + if (length == 2) { + if (!IsContinuationCharacter(str[1])) { *cursor += 1; return kBadChar; } *cursor += 2; - return code_point; + return ((str[0] << 6) + str[1]) - 0x00003080; } - if (length == 2) { - *cursor += 1; - return kBadChar; - } - byte third = str[2] ^ 0x80; - if (third & 0xC0) { - *cursor += 1; - return kBadChar; - } - if (first < 0xF0) { - uchar code_point = ((((first << 6) | second) << 6) | third) - & kMaxThreeByteChar; - if (code_point <= kMaxTwoByteChar) { + if (length == 3) { + switch (str[0]) { + case 0xE0: + // Overlong three-byte sequence. + if (str[1] < 0xA0 || str[1] > 0xBF) { + *cursor += 1; + return kBadChar; + } + break; + case 0xED: + // High and low surrogate halves. + if (str[1] < 0x80 || str[1] > 0x9F) { + *cursor += 1; + return kBadChar; + } + break; + default: + if (!IsContinuationCharacter(str[1])) { + *cursor += 1; + return kBadChar; + } + } + if (!IsContinuationCharacter(str[2])) { *cursor += 1; return kBadChar; } *cursor += 3; - return code_point; + return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; } - if (length == 3) { + DCHECK(length == 4); + switch (str[0]) { + case 0xF0: + // Overlong four-byte sequence. + if (str[1] < 0x90 || str[1] > 0xBF) { + *cursor += 1; + return kBadChar; + } + break; + case 0xF4: + // Code points outside of the unicode range. 
+ if (str[1] < 0x80 || str[1] > 0x8F) { + *cursor += 1; + return kBadChar; + } + break; + default: + if (!IsContinuationCharacter(str[1])) { + *cursor += 1; + return kBadChar; + } + } + if (!IsContinuationCharacter(str[2])) { *cursor += 1; return kBadChar; } - byte fourth = str[3] ^ 0x80; - if (fourth & 0xC0) { + if (!IsContinuationCharacter(str[3])) { *cursor += 1; return kBadChar; } - if (first < 0xF8) { - uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) - & kMaxFourByteChar; - if (code_point <= kMaxThreeByteChar) { - *cursor += 1; - return kBadChar; - } - *cursor += 4; - return code_point; - } - *cursor += 1; - return kBadChar; + *cursor += 4; + return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - + 0x03C82080; } diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc index ba58d2437a..4890b80d25 100644 --- a/test/cctest/test-api.cc +++ b/test/cctest/test-api.cc @@ -7267,84 +7267,6 @@ static void Utf16Helper( } -static uint16_t StringGet(Handle str, int index) { - i::Handle istring = - v8::Utils::OpenHandle(String::Cast(*str)); - return istring->Get(index); -} - - -static void WriteUtf8Helper( - LocalContext& context, // NOLINT - const char* name, - const char* lengths_name, - int len) { - Local b = - Local::Cast(context->Global()->Get(v8_str(name))); - Local alens = - Local::Cast(context->Global()->Get(v8_str(lengths_name))); - char buffer[1000]; - char buffer2[1000]; - for (int i = 0; i < len; i++) { - Local string = - Local::Cast(b->Get(i)); - Local expected_len = - Local::Cast(alens->Get(i)); - int utf8_length = static_cast(expected_len->Value()); - for (int j = utf8_length + 1; j >= 0; j--) { - memset(reinterpret_cast(&buffer), 42, sizeof(buffer)); - memset(reinterpret_cast(&buffer2), 42, sizeof(buffer2)); - int nchars; - int utf8_written = - string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS); - int utf8_written2 = - string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION); - 
CHECK_GE(utf8_length + 1, utf8_written); - CHECK_GE(utf8_length, utf8_written2); - for (int k = 0; k < utf8_written2; k++) { - CHECK_EQ(buffer[k], buffer2[k]); - } - CHECK(nchars * 3 >= utf8_written - 1); - CHECK(nchars <= utf8_written); - if (j == utf8_length + 1) { - CHECK_EQ(utf8_written2, utf8_length); - CHECK_EQ(utf8_written2 + 1, utf8_written); - } - CHECK_EQ(buffer[utf8_written], 42); - if (j > utf8_length) { - if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0); - if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42); - Handle roundtrip = v8_str(buffer); - CHECK(roundtrip->Equals(string)); - } else { - if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42); - } - if (utf8_written2 != 0) CHECK_NE(buffer[utf8_written - 1], 42); - if (nchars >= 2) { - uint16_t trail = StringGet(string, nchars - 1); - uint16_t lead = StringGet(string, nchars - 2); - if (((lead & 0xfc00) == 0xd800) && - ((trail & 0xfc00) == 0xdc00)) { - unsigned u1 = buffer2[utf8_written2 - 4]; - unsigned u2 = buffer2[utf8_written2 - 3]; - unsigned u3 = buffer2[utf8_written2 - 2]; - unsigned u4 = buffer2[utf8_written2 - 1]; - CHECK_EQ((u1 & 0xf8), 0xf0u); - CHECK_EQ((u2 & 0xc0), 0x80u); - CHECK_EQ((u3 & 0xc0), 0x80u); - CHECK_EQ((u4 & 0xc0), 0x80u); - uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); - CHECK_EQ((u4 & 0x3f), (c & 0x3f)); - CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f)); - CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f)); - CHECK_EQ((u1 & 0x3), c >> 18); - } - } - } - } -} - - THREADED_TEST(Utf16) { LocalContext context; v8::HandleScope scope(context->GetIsolate()); @@ -7391,9 +7313,6 @@ THREADED_TEST(Utf16) { "}"); Utf16Helper(context, "a", "alens", 9); Utf16Helper(context, "a2", "a2lens", 81); - WriteUtf8Helper(context, "b", "alens", 9); - WriteUtf8Helper(context, "b2", "a2lens", 81); - WriteUtf8Helper(context, "c2", "a2lens", 81); } @@ -7403,15 +7322,6 @@ static bool SameSymbol(Handle s1, Handle s2) { return *is1 == *is2; } -static void 
SameSymbolHelper(v8::Isolate* isolate, const char* a, - const char* b) { - Handle symbol1 = - v8::String::NewFromUtf8(isolate, a, v8::String::kInternalizedString); - Handle symbol2 = - v8::String::NewFromUtf8(isolate, b, v8::String::kInternalizedString); - CHECK(SameSymbol(symbol1, symbol2)); -} - THREADED_TEST(Utf16Symbol) { LocalContext context; @@ -7423,18 +7333,6 @@ THREADED_TEST(Utf16Symbol) { context->GetIsolate(), "abc", v8::String::kInternalizedString); CHECK(SameSymbol(symbol1, symbol2)); - SameSymbolHelper(context->GetIsolate(), - "\360\220\220\205", // 4 byte encoding. - "\355\240\201\355\260\205"); // 2 3-byte surrogates. - SameSymbolHelper(context->GetIsolate(), - "\355\240\201\355\260\206", // 2 3-byte surrogates. - "\360\220\220\206"); // 4 byte encoding. - SameSymbolHelper(context->GetIsolate(), - "x\360\220\220\205", // 4 byte encoding. - "x\355\240\201\355\260\205"); // 2 3-byte surrogates. - SameSymbolHelper(context->GetIsolate(), - "x\355\240\201\355\260\206", // 2 3-byte surrogates. - "x\360\220\220\206"); // 4 byte encoding. 
CompileRun( "var sym0 = 'benedictus';" "var sym0b = 'S\303\270ren';" diff --git a/test/cctest/test-parsing.cc b/test/cctest/test-parsing.cc index c7b044d719..03afabc29f 100644 --- a/test/cctest/test-parsing.cc +++ b/test/cctest/test-parsing.cc @@ -699,18 +699,22 @@ TEST(Utf8CharacterStream) { char buffer[kAllUtf8CharsSizeU]; unsigned cursor = 0; for (int i = 0; i <= kMaxUC16Char; i++) { - cursor += unibrow::Utf8::Encode(buffer + cursor, - i, - unibrow::Utf16::kNoPreviousCharacter); + cursor += unibrow::Utf8::Encode(buffer + cursor, i, + unibrow::Utf16::kNoPreviousCharacter, true); } DCHECK(cursor == kAllUtf8CharsSizeU); i::Utf8ToUtf16CharacterStream stream(reinterpret_cast(buffer), kAllUtf8CharsSizeU); + int32_t bad = unibrow::Utf8::kBadChar; for (int i = 0; i <= kMaxUC16Char; i++) { CHECK_EQU(i, stream.pos()); int32_t c = stream.Advance(); - CHECK_EQ(i, c); + if (i >= 0xd800 && i <= 0xdfff) { + CHECK_EQ(bad, c); + } else { + CHECK_EQ(i, c); + } CHECK_EQU(i + 1, stream.pos()); } for (int i = kMaxUC16Char; i >= 0; i--) { @@ -724,7 +728,9 @@ TEST(Utf8CharacterStream) { int progress = static_cast(stream.SeekForward(12)); i += progress; int32_t c = stream.Advance(); - if (i <= kMaxUC16Char) { + if (i >= 0xd800 && i <= 0xdfff) { + CHECK_EQ(bad, c); + } else if (i <= kMaxUC16Char) { CHECK_EQ(i, c); } else { CHECK_EQ(-1, c); @@ -913,6 +919,15 @@ static int Utf8LengthHelper(const char* s) { // Record a single kBadChar for the first byte and continue. continue; } + if (c == 0xed) { + unsigned char d = s[i + 1]; + if ((d < 0x80) || (d > 0x9f)) { + // This 3-byte sequence would encode a surrogate half, which is not + // valid in UTF-8. Record a single kBadChar for the first byte + // and continue. + continue; + } + } input_offset = 2; // 3 bytes of UTF-8 turn into 1 UTF-16 code unit. output_adjust = 2;