Update unicode to 7.0.0.

And do not use code points with PATTERN_* property for identifier start. Maintain that \u180E is a white space character. BUG=v8:2892 LOG=Y R=dpino@igalia.com, mathias@qiwi.be Review URL: https://codereview.chromium.org/638643002 git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@24473 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2014-10-08 14:55:03 +00:00 · 2014-10-08 14:55:03 +00:00 · 8659e50723
commit 8659e50723
parent c8b1c3e784
15 changed files with 2934 additions and 1153 deletions
--- a/BUILD.gn
+++ b/BUILD.gn
@ -895,6 +895,8 @@ source_set("v8_base") {
    "src/unicode-inl.h",
    "src/unicode.cc",
    "src/unicode.h",
+    "src/unicode-decoder.cc",
+    "src/unicode-decoder.h",
    "src/unique.h",
    "src/uri.h",
    "src/utils-inl.h",
--- a/src/char-predicates.h
+++ b/src/char-predicates.h
@ -22,42 +22,40 @@ inline bool IsBinaryDigit(uc32 c);
 inline bool IsRegExpWord(uc32 c);
 inline bool IsRegExpNewline(uc32 c);

+// ES6 draft section 11.6
+// This includes '_', '$' and '\', and ID_Start according to
+// http://www.unicode.org/reports/tr31/, which consists of categories
+// 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
+// 'Pattern_Syntax' or 'Pattern_White_Space'.
 struct IdentifierStart {
-  static inline bool Is(uc32 c) {
-    switch (c) {
-      case '$': case '_': case '\\': return true;
-      default: return unibrow::Letter::Is(c);
-    }
-  }
+  static inline bool Is(uc32 c) { return unibrow::ID_Start::Is(c); }
 };


+// ES6 draft section 11.6
+// This includes \u200c and \u200d, and ID_Continue according to
+// http://www.unicode.org/reports/tr31/, which consists of ID_Start,
+// the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
+// 'Pattern_Syntax' or 'Pattern_White_Space'.
 struct IdentifierPart {
  static inline bool Is(uc32 c) {
-    return IdentifierStart::Is(c)
-        || unibrow::Number::Is(c)
-        || c == 0x200C  // U+200C is Zero-Width Non-Joiner.
-        || c == 0x200D  // U+200D is Zero-Width Joiner.
-        || unibrow::CombiningMark::Is(c)
-        || unibrow::ConnectorPunctuation::Is(c);
+    return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
  }
 };


-// WhiteSpace according to ECMA-262 5.1, 7.2.
+// ES6 draft section 11.2
+// This includes all code points of Unicode category 'Zs'.
+// \u180e stops being one as of Unicode 6.3.0, but ES6 adheres to Unicode 5.1,
+// so it is also included.
+// Further included are \u0009, \u000b, \u0020, \u00a0, \u000c, and \ufeff.
 struct WhiteSpace {
-  static inline bool Is(uc32 c) {
-    return c == 0x0009 ||  // <TAB>
-           c == 0x000B ||  // <VT>
-           c == 0x000C ||  // <FF>
-           c == 0xFEFF ||  // <BOM>
-           // \u0020 and \u00A0 are included in unibrow::WhiteSpace.
-           unibrow::WhiteSpace::Is(c);
-  }
+  static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
 };


-// WhiteSpace and LineTerminator according to ECMA-262 5.1, 7.2 and 7.3.
+// WhiteSpace and LineTerminator according to ES6 draft sections 11.2 and 11.3.
+// The line terminators are \u000a, \u000d, \u2028, and \u2029.
 struct WhiteSpaceOrLineTerminator {
  static inline bool Is(uc32 c) {
    return WhiteSpace::Is(c) || unibrow::LineTerminator::Is(c);
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@ -20,6 +20,7 @@
 #include "src/regexp-stack.h"
 #include "src/runtime/runtime.h"
 #include "src/string-search.h"
+#include "src/unicode-decoder.h"

 #ifndef V8_INTERPRETED_REGEXP
 #if V8_TARGET_ARCH_IA32
--- a/src/objects.h
+++ b/src/objects.h
@ -20,6 +20,7 @@
 #include "src/property-details.h"
 #include "src/smart-pointers.h"
 #include "src/unicode-inl.h"
+#include "src/unicode-decoder.h"
 #include "src/zone.h"

 #if V8_TARGET_ARCH_ARM
--- a/src/scanner.h
+++ b/src/scanner.h
@ -15,6 +15,7 @@
 #include "src/list.h"
 #include "src/token.h"
 #include "src/unicode-inl.h"
+#include "src/unicode-decoder.h"
 #include "src/utils.h"

 namespace v8 {
--- a/src/unicode-decoder.cc
+++ b/src/unicode-decoder.cc
@ -0,0 +1,78 @@
+// Copyright 2014 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+
+#include "src/unicode-inl.h"
+#include "src/unicode-decoder.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace unibrow {
+
+// Decodes the whole UTF-8 |stream| once. The number of UTF-16 code units it
+// expands to is stored in utf16_length_, and the leading code units are
+// written to |buffer| for as long as they fit in |buffer_length| slots.
+// unbuffered_start_ stays NULL unless the buffer fills up (or would
+// overflow); in that case it points at the first stream byte whose character
+// was not written to the buffer.
+void Utf8DecoderBase::Reset(uint16_t* buffer, unsigned buffer_length,
+                            const uint8_t* stream, unsigned stream_length) {
+  // Assume everything will fit in the buffer and stream won't be needed.
+  last_byte_of_buffer_unused_ = false;
+  unbuffered_start_ = NULL;
+  bool writing_to_buffer = true;
+  // Loop until stream is read, writing to buffer as long as buffer has space.
+  unsigned utf16_length = 0;
+  while (stream_length != 0) {
+    unsigned cursor = 0;
+    uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
+    // cursor is the number of stream bytes consumed for this character.
+    DCHECK(cursor > 0 && cursor <= stream_length);
+    stream += cursor;
+    stream_length -= cursor;
+    // Characters above the non-surrogate range take two UTF-16 code units.
+    bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
+    utf16_length += is_two_characters ? 2 : 1;
+    // Don't need to write to the buffer, but still need utf16_length.
+    if (!writing_to_buffer) continue;
+    // Write out the characters to the buffer.
+    // utf16_length already counts the current character, so "<=" means the
+    // character still fits and "==" below means it exactly fills the buffer.
+    if (utf16_length <= buffer_length) {
+      if (is_two_characters) {
+        *buffer++ = Utf16::LeadSurrogate(character);
+        *buffer++ = Utf16::TrailSurrogate(character);
+      } else {
+        *buffer++ = character;
+      }
+      if (utf16_length == buffer_length) {
+        // Just wrote last character of buffer
+        writing_to_buffer = false;
+        unbuffered_start_ = stream;
+      }
+      continue;
+    }
+    // Have gone over buffer.
+    // A surrogate pair did not fit into the single remaining slot, so that
+    // slot stays unused and the unbuffered part starts at this character
+    // (stream was already advanced, hence "stream - cursor").
+    DCHECK(is_two_characters);
+    writing_to_buffer = false;
+    last_byte_of_buffer_unused_ = true;
+    unbuffered_start_ = stream - cursor;
+  }
+  utf16_length_ = utf16_length;
+}
+
+
+// Decodes UTF-8 characters from |stream| and stores them as UTF-16 code units
+// in |data| until |data_length| code units have been produced. Characters
+// above Utf16::kMaxNonSurrogateCharCode are stored as a lead/trail surrogate
+// pair and therefore consume two slots of |data|.
+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream, uint16_t* data,
+                                     unsigned data_length) {
+  while (data_length != 0) {
+    unsigned cursor = 0;
+    uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
+    // There's a total lack of bounds checking for stream
+    // as it was already done in Reset.
+    stream += cursor;
+    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      // Check for room for both halves *before* storing them, so a debug
+      // build stops ahead of the out-of-bounds write and the unsigned
+      // underflow of data_length rather than after them.
+      DCHECK(data_length > 1);
+      *data++ = Utf16::LeadSurrogate(character);
+      *data++ = Utf16::TrailSurrogate(character);
+      data_length -= 2;
+    } else {
+      *data++ = character;
+      data_length -= 1;
+    }
+  }
+}
+
+}  // namespace unibrow
--- a/src/unicode-decoder.h
+++ b/src/unicode-decoder.h
@ -0,0 +1,121 @@
+// Copyright 2014 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_UNICODE_DECODER_H_
+#define V8_UNICODE_DECODER_H_
+
+#include <sys/types.h>
+#include "src/globals.h"
+
+namespace unibrow {
+
+// Buffer-size independent part of the UTF-8 to UTF-16 decoder. The cache
+// buffer itself is owned by the caller (see Utf8Decoder) and handed to
+// Reset(); this class only keeps the bookkeeping produced by that pass.
+class Utf8DecoderBase {
+ public:
+  // Initialization done in subclass.
+  inline Utf8DecoderBase();
+  // Decodes |stream| immediately via Reset().
+  inline Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length,
+                         const uint8_t* stream, unsigned stream_length);
+  // Number of UTF-16 code units the most recently decoded stream expands to.
+  inline unsigned Utf16Length() const { return utf16_length_; }
+
+ protected:
+  // This reads all characters and sets the utf16_length_.
+  // The first buffer_length utf16 chars are cached in the buffer.
+  void Reset(uint16_t* buffer, unsigned buffer_length, const uint8_t* stream,
+             unsigned stream_length);
+  // Decodes from |stream| straight into |data| until |length| UTF-16 code
+  // units have been written; performs no bounds checking on |stream|.
+  static void WriteUtf16Slow(const uint8_t* stream, uint16_t* data,
+                             unsigned length);
+  // First stream byte that was not cached in the buffer; NULL when the
+  // buffer never filled up.
+  const uint8_t* unbuffered_start_;
+  // Total UTF-16 length of the decoded stream, in code units.
+  unsigned utf16_length_;
+  // True when a surrogate pair did not fit into the last free buffer slot,
+  // leaving that slot unused.
+  bool last_byte_of_buffer_unused_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
+};
+
+// UTF-8 decoder that caches the first kBufferSize UTF-16 code units of the
+// decoded stream in an inline buffer; anything beyond that is decoded again
+// from the stream when WriteUtf16() is called.
+template <unsigned kBufferSize>
+class Utf8Decoder : public Utf8DecoderBase {
+ public:
+  inline Utf8Decoder() {}
+  // Decodes |length| bytes starting at |stream|.
+  inline Utf8Decoder(const char* stream, unsigned length);
+  // Discards the previous state and decodes a new stream.
+  inline void Reset(const char* stream, unsigned length);
+  // Copies up to |length| decoded UTF-16 code units into |data| and returns
+  // the number of code units written.
+  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
+
+ private:
+  // Cache for the leading UTF-16 code units of the decoded stream.
+  uint16_t buffer_[kBufferSize];
+};
+
+
+// Creates an empty decoder: zero length, no unbuffered input, no unused slot.
+Utf8DecoderBase::Utf8DecoderBase()
+    : unbuffered_start_(NULL),
+      utf16_length_(0),
+      last_byte_of_buffer_unused_(false) {}
+
+
+// Decodes |stream| right away. There is no member initializer list because
+// Reset() assigns all three member fields.
+Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length,
+                                 const uint8_t* stream,
+                                 unsigned stream_length) {
+  Reset(buffer, buffer_length, stream, stream_length);
+}
+
+
+// Decodes |length| bytes of |stream|, caching the result in buffer_.
+// NOTE(review): buffer_ is handed to (and written by) the base-class
+// constructor before this class's own members are initialized; this relies
+// on buffer_ being a plain uint16_t array.
+template <unsigned kBufferSize>
+Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
+    : Utf8DecoderBase(buffer_, kBufferSize,
+                      reinterpret_cast<const uint8_t*>(stream), length) {}
+
+
+// Re-targets the decoder at a new stream, reusing the inline buffer_ as the
+// cache for the leading UTF-16 code units.
+template <unsigned kBufferSize>
+void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
+  Utf8DecoderBase::Reset(buffer_, kBufferSize,
+                         reinterpret_cast<const uint8_t*>(stream), length);
+}
+
+
+// Writes the decoded stream to |data| as UTF-16. |length| is the capacity of
+// |data| in code units and is clamped to utf16_length_; the clamped value is
+// returned. The cached prefix is copied from buffer_, and whatever lies
+// beyond it is decoded again starting at unbuffered_start_.
+template <unsigned kBufferSize>
+unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
+                                              unsigned length) const {
+  DCHECK(length > 0);
+  if (length > utf16_length_) length = utf16_length_;
+  // memcpy everything in buffer.
+  // The last slot holds no data if a surrogate pair did not fit into it.
+  unsigned buffer_length =
+      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
+  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
+  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
+  if (length <= buffer_length) return length;
+  DCHECK(unbuffered_start_ != NULL);
+  // Copy the rest the slow way.
+  WriteUtf16Slow(unbuffered_start_, data + buffer_length,
+                 length - buffer_length);
+  return length;
+}
+
+// Helpers for the Latin-1 (one-byte) character range.
+class Latin1 {
+ public:
+  // Largest code point that is representable in Latin-1.
+  static const unsigned kMaxChar = 0xff;
+  // Returns 0 if the character does not convert to a single Latin-1
+  // character or if the character does not convert back to Latin-1 via the
+  // inverse operation (upper to lower, etc).
+  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
+};
+
+
+// Maps the few characters outside Latin-1 that have a Latin-1 counterpart to
+// that counterpart; returns 0 for every other character. |c| must be above
+// Latin1::kMaxChar.
+uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
+  DCHECK(c > Latin1::kMaxChar);
+  switch (c) {
+    // These are equivalent characters in Unicode: Greek capital and small
+    // mu (U+039C, U+03BC) both map to the micro sign (U+00B5).
+    case 0x39c:
+    case 0x3bc:
+      return 0xb5;
+    // This is an uppercase of a Latin-1 character
+    // outside of Latin-1: U+0178 (capital Y with diaeresis) maps to U+00FF.
+    case 0x178:
+      return 0xff;
+  }
+  return 0;
+}
+
+
+}  // namespace unibrow
+
+#endif  // V8_UNICODE_DECODER_H_
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@ -57,22 +57,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
 }


-uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
-  DCHECK(c > Latin1::kMaxChar);
-  switch (c) {
-    // This are equivalent characters in unicode.
-    case 0x39c:
-    case 0x3bc:
-      return 0xb5;
-    // This is an uppercase of a Latin-1 character
-    // outside of Latin-1.
-    case 0x178:
-      return 0xff;
-  }
-  return 0;
-}
-
-
 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  static const int kMask = ~(1 << 6);
  if (c <= kMaxOneByteChar) {
@ -153,53 +137,6 @@ unsigned Utf8::Length(uchar c, int previous) {
  }
 }

-Utf8DecoderBase::Utf8DecoderBase()
-  : unbuffered_start_(NULL),
-    utf16_length_(0),
-    last_byte_of_buffer_unused_(false) {}
-
-Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
-                                 unsigned buffer_length,
-                                 const uint8_t* stream,
-                                 unsigned stream_length) {
-  Reset(buffer, buffer_length, stream, stream_length);
-}
-
-template<unsigned kBufferSize>
-Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
-  : Utf8DecoderBase(buffer_,
-                    kBufferSize,
-                    reinterpret_cast<const uint8_t*>(stream),
-                    length) {
-}
-
-template<unsigned kBufferSize>
-void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
-  Utf8DecoderBase::Reset(buffer_,
-                         kBufferSize,
-                         reinterpret_cast<const uint8_t*>(stream),
-                         length);
-}
-
-template <unsigned kBufferSize>
-unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
-                                              unsigned length) const {
-  DCHECK(length > 0);
-  if (length > utf16_length_) length = utf16_length_;
-  // memcpy everything in buffer.
-  unsigned buffer_length =
-      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
-  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
-  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
-  if (length <= buffer_length) return length;
-  DCHECK(unbuffered_start_ != NULL);
-  // Copy the rest the slow way.
-  WriteUtf16Slow(unbuffered_start_,
-                 data + buffer_length,
-                 length - buffer_length);
-  return length;
-}
-
 }  // namespace unibrow

 #endif  // V8_UNICODE_INL_H_
--- a/src/unicode.cc
+++ b/src/unicode.cc
--- a/src/unicode.h
+++ b/src/unicode.h
@ -44,6 +44,7 @@ class Predicate {
  CacheEntry entries_[kSize];
 };

+
 // A cache used in case conversion.  It caches the value for characters
 // that either have no mapping or map to a single character independent
 // of context.  Characters that map to more than one character or that
@ -70,6 +71,7 @@ class Mapping {
  CacheEntry entries_[kSize];
 };

+
 class UnicodeData {
 private:
  friend class Test;
@ -77,6 +79,7 @@ class UnicodeData {
  static const uchar kMaxCodePoint;
 };

+
 class Utf16 {
 public:
  static inline bool IsSurrogatePair(int lead, int trail) {
@ -113,14 +116,6 @@ class Utf16 {
  }
 };

-class Latin1 {
- public:
-  static const unsigned kMaxChar = 0xff;
-  // Returns 0 if character does not convert to single latin-1 character
-  // or if the character doesn't not convert back to latin-1 via inverse
-  // operation (upper to lower, etc).
-  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
-};

 class Utf8 {
 public:
@ -155,45 +150,6 @@ class Utf8 {
                              unsigned* cursor);
 };

-
-class Utf8DecoderBase {
- public:
-  // Initialization done in subclass.
-  inline Utf8DecoderBase();
-  inline Utf8DecoderBase(uint16_t* buffer,
-                         unsigned buffer_length,
-                         const uint8_t* stream,
-                         unsigned stream_length);
-  inline unsigned Utf16Length() const { return utf16_length_; }
- protected:
-  // This reads all characters and sets the utf16_length_.
-  // The first buffer_length utf16 chars are cached in the buffer.
-  void Reset(uint16_t* buffer,
-             unsigned buffer_length,
-             const uint8_t* stream,
-             unsigned stream_length);
-  static void WriteUtf16Slow(const uint8_t* stream,
-                             uint16_t* data,
-                             unsigned length);
-  const uint8_t* unbuffered_start_;
-  unsigned utf16_length_;
-  bool last_byte_of_buffer_unused_;
- private:
-  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
-};
-
-template <unsigned kBufferSize>
-class Utf8Decoder : public Utf8DecoderBase {
- public:
-  inline Utf8Decoder() {}
-  inline Utf8Decoder(const char* stream, unsigned length);
-  inline void Reset(const char* stream, unsigned length);
-  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
- private:
-  uint16_t buffer_[kBufferSize];
-};
-
-
 struct Uppercase {
  static bool Is(uchar c);
 };
@ -203,7 +159,10 @@ struct Lowercase {
 struct Letter {
  static bool Is(uchar c);
 };
-struct Number {
+struct ID_Start {
+  static bool Is(uchar c);
+};
+struct ID_Continue {
  static bool Is(uchar c);
 };
 struct WhiteSpace {
@ -212,12 +171,6 @@ struct WhiteSpace {
 struct LineTerminator {
  static bool Is(uchar c);
 };
-struct CombiningMark {
-  static bool Is(uchar c);
-};
-struct ConnectorPunctuation {
-  static bool Is(uchar c);
-};
 struct ToLowercase {
  static const int kMaxWidth = 3;
  static const bool kIsToLower = true;
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@ -37,6 +37,7 @@
 #include "src/api.h"
 #include "src/factory.h"
 #include "src/objects.h"
+#include "src/unicode-decoder.h"
 #include "test/cctest/cctest.h"

 // Adapted from http://en.wikipedia.org/wiki/Multiply-with-carry
--- a/test/mjsunit/var.js
+++ b/test/mjsunit/var.js
@ -35,3 +35,22 @@ assertEquals(87, y);
 assertTrue(!z && typeof z == 'undefined');
 if (false) { var z; }
 assertTrue(!z && typeof z == 'undefined');
+
+assertThrows("var \u2E2F;", SyntaxError);
+assertThrows("var \\u2E2F;", SyntaxError);
+
+assertDoesNotThrow("var \u2118;");
+assertDoesNotThrow("var \\u2118;");
+assertDoesNotThrow("var \u212E;");
+assertDoesNotThrow("var \\u212E;");
+assertDoesNotThrow("var \u309B;");
+assertDoesNotThrow("var \\u309B;");
+assertDoesNotThrow("var \u309C;");
+assertDoesNotThrow("var \\u309C;");
+
+assertDoesNotThrow("var $\u00B7;");
+assertDoesNotThrow("var $\u0387;");
+assertDoesNotThrow("var $\u1369;");
+assertDoesNotThrow("var $\u1370;");
+assertDoesNotThrow("var $\u1371;");
+assertDoesNotThrow("var $\u19DA;");
--- a/test/unittests/unicode/unicode-predicates-unittest.cc
+++ b/test/unittests/unicode/unicode-predicates-unittest.cc
@ -0,0 +1,90 @@
+// Copyright 2014 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/char-predicates.h"
+#include "src/unicode.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace v8 {
+namespace internal {
+
+// WhiteSpace must accept the individually listed white space code points.
+TEST(UnicodePredicatesTest, WhiteSpace) {
+  // As of Unicode 6.3.0, \u180E is no longer a white space. We still consider
+  // it to be one though, since JS recognizes all white spaces in Unicode 5.1.
+  EXPECT_TRUE(WhiteSpace::Is(0x0009));
+  EXPECT_TRUE(WhiteSpace::Is(0x000B));
+  EXPECT_TRUE(WhiteSpace::Is(0x000C));
+  EXPECT_TRUE(WhiteSpace::Is(' '));
+  EXPECT_TRUE(WhiteSpace::Is(0x00A0));
+  EXPECT_TRUE(WhiteSpace::Is(0x180E));  // See the note above.
+  EXPECT_TRUE(WhiteSpace::Is(0xFEFF));
+}
+
+
+// WhiteSpaceOrLineTerminator must accept every white space code point checked
+// for WhiteSpace plus the four line terminators.
+TEST(UnicodePredicatesTest, WhiteSpaceOrLineTerminator) {
+  // As of Unicode 6.3.0, \u180E is no longer a white space. We still consider
+  // it to be one though, since JS recognizes all white spaces in Unicode 5.1.
+  // White spaces
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x0009));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000B));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000C));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(' '));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x00A0));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x180E));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0xFEFF));
+  // Line terminators
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000A));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000D));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x2028));
+  EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x2029));
+}
+
+
+// IdentifierStart must accept '$', '_', '\\' and the Other_ID_Start code
+// points, and must reject code points with the Pattern_Syntax property.
+TEST(UnicodePredicatesTest, IdentifierStart) {
+  EXPECT_TRUE(IdentifierStart::Is('$'));
+  EXPECT_TRUE(IdentifierStart::Is('_'));
+  EXPECT_TRUE(IdentifierStart::Is('\\'));
+
+  // http://www.unicode.org/reports/tr31/
+  // Other_ID_Start
+  EXPECT_TRUE(IdentifierStart::Is(0x2118));
+  EXPECT_TRUE(IdentifierStart::Is(0x212E));
+  EXPECT_TRUE(IdentifierStart::Is(0x309B));
+  EXPECT_TRUE(IdentifierStart::Is(0x309C));
+
+  // Issue 2892:
+  // \u2E2F has the Pattern_Syntax property, excluding it from ID_Start.
+  // Check the IdentifierStart predicate under test as well as the underlying
+  // table, so a regression in the wrapper cannot go unnoticed.
+  EXPECT_FALSE(IdentifierStart::Is(0x2E2F));
+  EXPECT_FALSE(unibrow::ID_Start::Is(0x2E2F));
+}
+
+
+// IdentifierPart must accept everything IdentifierStart accepts, plus
+// \u200C, \u200D and the Other_ID_Continue code points, and must reject code
+// points with the Pattern_Syntax property.
+TEST(UnicodePredicatesTest, IdentifierPart) {
+  EXPECT_TRUE(IdentifierPart::Is('$'));
+  EXPECT_TRUE(IdentifierPart::Is('_'));
+  EXPECT_TRUE(IdentifierPart::Is('\\'));
+  EXPECT_TRUE(IdentifierPart::Is(0x200C));
+  EXPECT_TRUE(IdentifierPart::Is(0x200D));
+
+  // http://www.unicode.org/reports/tr31/
+  // Other_ID_Start
+  EXPECT_TRUE(IdentifierPart::Is(0x2118));
+  EXPECT_TRUE(IdentifierPart::Is(0x212E));
+  EXPECT_TRUE(IdentifierPart::Is(0x309B));
+  EXPECT_TRUE(IdentifierPart::Is(0x309C));
+
+  // Other_ID_Continue
+  EXPECT_TRUE(IdentifierPart::Is(0x00B7));
+  EXPECT_TRUE(IdentifierPart::Is(0x0387));
+  EXPECT_TRUE(IdentifierPart::Is(0x1369));
+  EXPECT_TRUE(IdentifierPart::Is(0x1370));
+  EXPECT_TRUE(IdentifierPart::Is(0x1371));
+  EXPECT_TRUE(IdentifierPart::Is(0x19DA));
+
+  // Issue 2892:
+  // \u2E2F has the Pattern_Syntax property, excluding it from both ID_Start
+  // and ID_Continue.
+  EXPECT_FALSE(IdentifierPart::Is(0x2E2F));
+}
+
+}  // namespace internal
+}  // namespace v8
--- a/test/unittests/unittests.gyp
+++ b/test/unittests/unittests.gyp
@ -58,6 +58,7 @@
        'run-all-unittests.cc',
        'test-utils.h',
        'test-utils.cc',
+        'unicode/unicode-predicates-unittest.cc',
      ],
      'conditions': [
        ['v8_target_arch=="arm"', {
--- a/tools/gyp/v8.gyp
+++ b/tools/gyp/v8.gyp
@ -808,6 +808,8 @@
        '../../src/unicode-inl.h',
        '../../src/unicode.cc',
        '../../src/unicode.h',
+        '../../src/unicode-decoder.cc',
+        '../../src/unicode-decoder.h',
        '../../src/unique.h',
        '../../src/uri.h',
        '../../src/utils-inl.h',
@ -1694,6 +1696,12 @@
        '../../src/mksnapshot.cc',
      ],
      'conditions': [
+        ['v8_enable_i18n_support==1', {
+          'dependencies': [
+            '<(icu_gyp_path):icui18n',
+            '<(icu_gyp_path):icuuc',
+          ]
+        }],
        ['want_separate_host_toolset==1', {
          'toolsets': ['host'],
        }, {