Update UTF-8 decoder to detect more special cases.

The Blink version is stricter, and for parsing it's important that both decoders behave the same. BUG=chromium:489944 R=vogelheim@chromium.org LOG=n Review URL: https://codereview.chromium.org/1148653007 Cr-Commit-Position: refs/heads/master@{#28601}
2015-05-22 11:47:36 -07:00 · 2015-05-22 11:47:36 -07:00 · 3d5b2f807b
commit 3d5b2f807b
parent c52bb1f03a
3 changed files with 113 additions and 153 deletions
--- a/src/unicode.cc
+++ b/src/unicode.cc
@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table,
 }


-uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {
-  // We only get called for non-ASCII characters.
-  if (length == 1) {
+static inline size_t NonASCIISequenceLength(byte first) {
+  // clang-format off
+  static const uint8_t lengths[256] = {
+      // The first 128 entries correspond to ASCII characters.
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      // The following 64 entries correspond to continuation bytes.
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      // Next come two invalid overlong encodings and 30 two-byte sequences.
+      0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      // 16 three-byte sequences.
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      // 5 four-byte sequences, followed by sequences that could only encode
+      // code points outside of the Unicode range.
+      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  // clang-format on
+  return lengths[first];
+}
+
+
+static inline bool IsContinuationCharacter(byte chr) {
+  return chr >= 0x80 && chr <= 0xBF;
+}
+
+
+// This method decodes a UTF-8 value according to RFC 3629.
+uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
+  size_t length = NonASCIISequenceLength(str[0]);
+  if (length == 0 || max_length < length) {
    *cursor += 1;
    return kBadChar;
  }
-  byte first = str[0];
-  byte second = str[1] ^ 0x80;
-  if (second & 0xC0) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  if (first < 0xE0) {
-    if (first < 0xC0) {
-      *cursor += 1;
-      return kBadChar;
-    }
-    uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
-    if (code_point <= kMaxOneByteChar) {
+  if (length == 2) {
+    if (!IsContinuationCharacter(str[1])) {
      *cursor += 1;
      return kBadChar;
    }
    *cursor += 2;
-    return code_point;
+    return ((str[0] << 6) + str[1]) - 0x00003080;
  }
-  if (length == 2) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  byte third = str[2] ^ 0x80;
-  if (third & 0xC0) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  if (first < 0xF0) {
-    uchar code_point = ((((first << 6) | second) << 6) | third)
-        & kMaxThreeByteChar;
-    if (code_point <= kMaxTwoByteChar) {
+  if (length == 3) {
+    switch (str[0]) {
+      case 0xE0:
+        // Overlong three-byte sequence.
+        if (str[1] < 0xA0 || str[1] > 0xBF) {
+          *cursor += 1;
+          return kBadChar;
+        }
+        break;
+      case 0xED:
+        // High and low surrogate halves.
+        if (str[1] < 0x80 || str[1] > 0x9F) {
+          *cursor += 1;
+          return kBadChar;
+        }
+        break;
+      default:
+        if (!IsContinuationCharacter(str[1])) {
+          *cursor += 1;
+          return kBadChar;
+        }
+    }
+    if (!IsContinuationCharacter(str[2])) {
      *cursor += 1;
      return kBadChar;
    }
    *cursor += 3;
-    return code_point;
+    return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
  }
-  if (length == 3) {
+  DCHECK(length == 4);
+  switch (str[0]) {
+    case 0xF0:
+      // Overlong four-byte sequence.
+      if (str[1] < 0x90 || str[1] > 0xBF) {
+        *cursor += 1;
+        return kBadChar;
+      }
+      break;
+    case 0xF4:
+      // Code points outside of the Unicode range.
+      if (str[1] < 0x80 || str[1] > 0x8F) {
+        *cursor += 1;
+        return kBadChar;
+      }
+      break;
+    default:
+      if (!IsContinuationCharacter(str[1])) {
+        *cursor += 1;
+        return kBadChar;
+      }
+  }
+  if (!IsContinuationCharacter(str[2])) {
    *cursor += 1;
    return kBadChar;
  }
-  byte fourth = str[3] ^ 0x80;
-  if (fourth & 0xC0) {
+  if (!IsContinuationCharacter(str[3])) {
    *cursor += 1;
    return kBadChar;
  }
-  if (first < 0xF8) {
-    uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)
-        & kMaxFourByteChar;
-    if (code_point <= kMaxThreeByteChar) {
-      *cursor += 1;
-      return kBadChar;
-    }
-    *cursor += 4;
-    return code_point;
-  }
-  *cursor += 1;
-  return kBadChar;
+  *cursor += 4;
+  return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
+         0x03C82080;
 }


--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@ -7267,84 +7267,6 @@ static void Utf16Helper(
 }


-static uint16_t StringGet(Handle<String> str, int index) {
-  i::Handle<i::String> istring =
-      v8::Utils::OpenHandle(String::Cast(*str));
-  return istring->Get(index);
-}
-
-
-static void WriteUtf8Helper(
-    LocalContext& context,  // NOLINT
-    const char* name,
-    const char* lengths_name,
-    int len) {
-  Local<v8::Array> b =
-      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
-  Local<v8::Array> alens =
-      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
-  char buffer[1000];
-  char buffer2[1000];
-  for (int i = 0; i < len; i++) {
-    Local<v8::String> string =
-      Local<v8::String>::Cast(b->Get(i));
-    Local<v8::Number> expected_len =
-      Local<v8::Number>::Cast(alens->Get(i));
-    int utf8_length = static_cast<int>(expected_len->Value());
-    for (int j = utf8_length + 1; j >= 0; j--) {
-      memset(reinterpret_cast<void*>(&buffer), 42, sizeof(buffer));
-      memset(reinterpret_cast<void*>(&buffer2), 42, sizeof(buffer2));
-      int nchars;
-      int utf8_written =
-          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
-      int utf8_written2 =
-          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
-      CHECK_GE(utf8_length + 1, utf8_written);
-      CHECK_GE(utf8_length, utf8_written2);
-      for (int k = 0; k < utf8_written2; k++) {
-        CHECK_EQ(buffer[k], buffer2[k]);
-      }
-      CHECK(nchars * 3 >= utf8_written - 1);
-      CHECK(nchars <= utf8_written);
-      if (j == utf8_length + 1) {
-        CHECK_EQ(utf8_written2, utf8_length);
-        CHECK_EQ(utf8_written2 + 1, utf8_written);
-      }
-      CHECK_EQ(buffer[utf8_written], 42);
-      if (j > utf8_length) {
-        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
-        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
-        Handle<String> roundtrip = v8_str(buffer);
-        CHECK(roundtrip->Equals(string));
-      } else {
-        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
-      }
-      if (utf8_written2 != 0) CHECK_NE(buffer[utf8_written - 1], 42);
-      if (nchars >= 2) {
-        uint16_t trail = StringGet(string, nchars - 1);
-        uint16_t lead = StringGet(string, nchars - 2);
-        if (((lead & 0xfc00) == 0xd800) &&
-            ((trail & 0xfc00) == 0xdc00)) {
-          unsigned u1 = buffer2[utf8_written2 - 4];
-          unsigned u2 = buffer2[utf8_written2 - 3];
-          unsigned u3 = buffer2[utf8_written2 - 2];
-          unsigned u4 = buffer2[utf8_written2 - 1];
-          CHECK_EQ((u1 & 0xf8), 0xf0u);
-          CHECK_EQ((u2 & 0xc0), 0x80u);
-          CHECK_EQ((u3 & 0xc0), 0x80u);
-          CHECK_EQ((u4 & 0xc0), 0x80u);
-          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
-          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
-          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
-          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
-          CHECK_EQ((u1 & 0x3), c >> 18);
-        }
-      }
-    }
-  }
-}
-
-
 THREADED_TEST(Utf16) {
  LocalContext context;
  v8::HandleScope scope(context->GetIsolate());
@ -7391,9 +7313,6 @@ THREADED_TEST(Utf16) {
      "}");
  Utf16Helper(context, "a", "alens", 9);
  Utf16Helper(context, "a2", "a2lens", 81);
-  WriteUtf8Helper(context, "b", "alens", 9);
-  WriteUtf8Helper(context, "b2", "a2lens", 81);
-  WriteUtf8Helper(context, "c2", "a2lens", 81);
 }


@ -7403,15 +7322,6 @@ static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
  return *is1 == *is2;
 }

-static void SameSymbolHelper(v8::Isolate* isolate, const char* a,
-                             const char* b) {
-  Handle<String> symbol1 =
-      v8::String::NewFromUtf8(isolate, a, v8::String::kInternalizedString);
-  Handle<String> symbol2 =
-      v8::String::NewFromUtf8(isolate, b, v8::String::kInternalizedString);
-  CHECK(SameSymbol(symbol1, symbol2));
-}
-

 THREADED_TEST(Utf16Symbol) {
  LocalContext context;
@ -7423,18 +7333,6 @@ THREADED_TEST(Utf16Symbol) {
      context->GetIsolate(), "abc", v8::String::kInternalizedString);
  CHECK(SameSymbol(symbol1, symbol2));

-  SameSymbolHelper(context->GetIsolate(),
-                   "\360\220\220\205",  // 4 byte encoding.
-                   "\355\240\201\355\260\205");  // 2 3-byte surrogates.
-  SameSymbolHelper(context->GetIsolate(),
-                   "\355\240\201\355\260\206",  // 2 3-byte surrogates.
-                   "\360\220\220\206");  // 4 byte encoding.
-  SameSymbolHelper(context->GetIsolate(),
-                   "x\360\220\220\205",  // 4 byte encoding.
-                   "x\355\240\201\355\260\205");  // 2 3-byte surrogates.
-  SameSymbolHelper(context->GetIsolate(),
-                   "x\355\240\201\355\260\206",  // 2 3-byte surrogates.
-                   "x\360\220\220\206");  // 4 byte encoding.
  CompileRun(
      "var sym0 = 'benedictus';"
      "var sym0b = 'S\303\270ren';"
--- a/test/cctest/test-parsing.cc
+++ b/test/cctest/test-parsing.cc
@ -699,18 +699,22 @@ TEST(Utf8CharacterStream) {
  char buffer[kAllUtf8CharsSizeU];
  unsigned cursor = 0;
  for (int i = 0; i <= kMaxUC16Char; i++) {
-    cursor += unibrow::Utf8::Encode(buffer + cursor,
-                                    i,
-                                    unibrow::Utf16::kNoPreviousCharacter);
+    cursor += unibrow::Utf8::Encode(buffer + cursor, i,
+                                    unibrow::Utf16::kNoPreviousCharacter, true);
  }
  DCHECK(cursor == kAllUtf8CharsSizeU);

  i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
                                       kAllUtf8CharsSizeU);
+  int32_t bad = unibrow::Utf8::kBadChar;
  for (int i = 0; i <= kMaxUC16Char; i++) {
    CHECK_EQU(i, stream.pos());
    int32_t c = stream.Advance();
-    CHECK_EQ(i, c);
+    if (i >= 0xd800 && i <= 0xdfff) {
+      CHECK_EQ(bad, c);
+    } else {
+      CHECK_EQ(i, c);
+    }
    CHECK_EQU(i + 1, stream.pos());
  }
  for (int i = kMaxUC16Char; i >= 0; i--) {
@ -724,7 +728,9 @@ TEST(Utf8CharacterStream) {
    int progress = static_cast<int>(stream.SeekForward(12));
    i += progress;
    int32_t c = stream.Advance();
-    if (i <= kMaxUC16Char) {
+    if (i >= 0xd800 && i <= 0xdfff) {
+      CHECK_EQ(bad, c);
+    } else if (i <= kMaxUC16Char) {
      CHECK_EQ(i, c);
    } else {
      CHECK_EQ(-1, c);
@ -913,6 +919,15 @@ static int Utf8LengthHelper(const char* s) {
          // Record a single kBadChar for the first byte and continue.
          continue;
        }
+        if (c == 0xed) {
+          unsigned char d = s[i + 1];
+          if ((d < 0x80) || (d > 0x9f)) {
+            // This 3-byte sequence would encode a surrogate half, which is
+            // not valid in UTF-8. Record a single kBadChar for the first byte
+            // and continue.
+            continue;
+          }
+        }
        input_offset = 2;
        // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
        output_adjust = 2;