Fix out-of-range access in unibrow::Utf8::CalculateValue.

This code should not access bytes outside the permitted range in order to check the range of a possible UTF-8 value. Instead, the length check should occur before such checks.

BUG=chromium:667260, chromium:662822
Review-Url: https://codereview.chromium.org/2520053003
Cr-Commit-Position: refs/heads/master@{#41165}
2016-11-22 01:27:41 -08:00 · 2016-11-22 01:27:41 -08:00 · 9d524bd33d
commit 9d524bd33d
parent 8c4988f738
5 changed files with 52 additions and 14 deletions
--- a/src/unicode-decoder.h
+++ b/src/unicode-decoder.h
@@ -7,10 +7,11 @@

 #include <sys/types.h>
 #include "src/globals.h"
+#include "src/utils.h"

 namespace unibrow {

-class Utf8DecoderBase {
+class V8_EXPORT_PRIVATE Utf8DecoderBase {
 public:
  // Initialization done in subclass.
  inline Utf8DecoderBase();
--- a/src/unicode.cc
+++ b/src/unicode.cc
@@ -235,35 +235,31 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
  while (count < max_count && IsContinuationCharacter(str[count])) {
    count++;
  }
+  *cursor += count;

-  // Check overly long sequences & other conditions. Use length as error
-  // indicator.
+  // There must be enough continuation characters.
+  if (count != length) return kBadChar;
+
+  // Check overly long sequences & other conditions.
  if (length == 3) {
    if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
      // Overlong three-byte sequence?
-      length = 0;
+      return kBadChar;
    } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
      // High and low surrogate halves?
-      length = 0;
+      return kBadChar;
    }
  } else if (length == 4) {
    if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
      // Overlong four-byte sequence.
-      length = 0;
+      return kBadChar;
    } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
      // Code points outside of the unicode range.
-      length = 0;
-    }
-  }
-
-  if (count != length) {
-    // All invalid encodings should land here.
-    *cursor += count;
      return kBadChar;
    }
+  }

  // All errors have been handled, so we only have to assemble the result.
-  *cursor += length;
  switch (length) {
    case 1:
      return str[0];
--- a/test/unittests/BUILD.gn
+++ b/test/unittests/BUILD.gn
@@ -120,6 +120,7 @@ v8_executable("unittests") {
    "source-position-table-unittest.cc",
    "test-utils.cc",
    "test-utils.h",
+    "unicode-unittest.cc",
    "value-serializer-unittest.cc",
    "wasm/asm-types-unittest.cc",
    "wasm/ast-decoder-unittest.cc",
--- a/test/unittests/unicode-unittest.cc
+++ b/test/unittests/unicode-unittest.cc
@@ -0,0 +1,39 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <memory>
+#include <string>
+
+#include "src/unicode-decoder.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace v8 {
+namespace internal {
+
+namespace {
+
+using Utf8Decoder = unibrow::Utf8Decoder<512>;
+
+void Decode(Utf8Decoder* decoder, const std::string& str) {
+  // Put the string in its own buffer on the heap to make sure that
+  // AddressSanitizer's heap-buffer-overflow logic can see what's going on.
+  std::unique_ptr<char[]> buffer(new char[str.length()]);
+  memcpy(buffer.get(), str.data(), str.length());
+  decoder->Reset(buffer.get(), str.length());
+}
+
+}  // namespace
+
+TEST(UnicodeTest, ReadOffEndOfUtf8String) {
+  Utf8Decoder decoder;
+
+  // Not enough continuation bytes before string ends.
+  Decode(&decoder, "\xE0");
+  Decode(&decoder, "\xED");
+  Decode(&decoder, "\xF0");
+  Decode(&decoder, "\xF4");
+}
+
+}  // namespace internal
+}  // namespace v8
--- a/test/unittests/unittests.gyp
+++ b/test/unittests/unittests.gyp
@@ -118,6 +118,7 @@
      'source-position-table-unittest.cc',
      'test-utils.h',
      'test-utils.cc',
+      'unicode-unittest.cc',
      'value-serializer-unittest.cc',
      'zone/segmentpool-unittest.cc',
      'zone/zone-chunk-list-unittest.cc',