Fix out-of-range access in unibrow::Utf8::CalculateValue.

This code should not access bytes out of the permitted range in order to check the range of a possible UTF-8 value. Instead, the length check should occur before such checks. BUG=chromium:667260, chromium:662822 Review-Url: https://codereview.chromium.org/2520053003 Cr-Commit-Position: refs/heads/master@{#41165}
2016-11-22 01:27:41 -08:00 · 2016-11-22 01:27:41 -08:00 · 9d524bd33d
commit 9d524bd33d
parent 8c4988f738
5 changed files with 52 additions and 14 deletions
--- a/src/unicode-decoder.h
+++ b/src/unicode-decoder.h
@ -7,10 +7,11 @@
 #include <sys/types.h>
 #include "src/globals.h"
 #include "src/utils.h"
 namespace unibrow {
-class Utf8DecoderBase {
+class V8_EXPORT_PRIVATE Utf8DecoderBase {
 public:
  // Initialization done in subclass.
  inline Utf8DecoderBase();
--- a/src/unicode.cc
+++ b/src/unicode.cc
@ -235,35 +235,31 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
  while (count < max_count && IsContinuationCharacter(str[count])) {
    count++;
  }
  *cursor += count;
-  // Check overly long sequences & other conditions. Use length as error
+  // There must be enough continuation characters.
-  // indicator.
+  if (count != length) return kBadChar;
  // Check overly long sequences & other conditions.
  if (length == 3) {
    if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
      // Overlong three-byte sequence?
-      length = 0;
+      return kBadChar;
    } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
      // High and low surrogate halves?
-      length = 0;
+      return kBadChar;
    }
  } else if (length == 4) {
    if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
      // Overlong four-byte sequence.
-      length = 0;
+      return kBadChar;
    } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
      // Code points outside of the unicode range.
-      length = 0;
+      return kBadChar;
    }
  }
  if (count != length) {
    // All invalid encodings should land here.
    *cursor += count;
    return kBadChar;
  }
  // All errors have been handled, so we only have to assemble the result.
  *cursor += length;
  switch (length) {
    case 1:
      return str[0];
--- a/test/unittests/BUILD.gn
+++ b/test/unittests/BUILD.gn
@ -120,6 +120,7 @@ v8_executable("unittests") {
    "source-position-table-unittest.cc",
    "test-utils.cc",
    "test-utils.h",
    "unicode-unittest.cc",
    "value-serializer-unittest.cc",
    "wasm/asm-types-unittest.cc",
    "wasm/ast-decoder-unittest.cc",
--- a/test/unittests/unicode-unittest.cc
+++ b/test/unittests/unicode-unittest.cc
@ -0,0 +1,39 @@
 // Copyright 2016 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 #include <memory>
 #include <string>
 #include "src/unicode-decoder.h"
 #include "testing/gtest/include/gtest/gtest.h"
 namespace v8 {
 namespace internal {
 namespace {
 using Utf8Decoder = unibrow::Utf8Decoder<512>;
 void Decode(Utf8Decoder* decoder, const std::string& str) {
  // Put the string in its own buffer on the heap to make sure that
  // AddressSanitizer's heap-buffer-overflow logic can see what's going on.
  std::unique_ptr<char[]> buffer(new char[str.length()]);
  memcpy(buffer.get(), str.data(), str.length());
  decoder->Reset(buffer.get(), str.length());
 }
 }  // namespace
 TEST(UnicodeTest, ReadOffEndOfUtf8String) {
  Utf8Decoder decoder;
  // Not enough continuation bytes before string ends.
  Decode(&decoder, "\xE0");
  Decode(&decoder, "\xED");
  Decode(&decoder, "\xF0");
  Decode(&decoder, "\xF4");
 }
 }  // namespace internal
 }  // namespace v8
--- a/test/unittests/unittests.gyp
+++ b/test/unittests/unittests.gyp
@ -118,6 +118,7 @@
      'source-position-table-unittest.cc',
      'test-utils.h',
      'test-utils.cc',
      'unicode-unittest.cc',
      'value-serializer-unittest.cc',
      'zone/segmentpool-unittest.cc',
      'zone/zone-chunk-list-unittest.cc',