Allow identifier code points from supplementary multilingual planes.
ES5.1 section 6 ("Source Text"): "Throughout the rest of this document, the phrase “code unit” and the word “character” will be used to refer to a 16-bit unsigned value used to represent a single 16-bit unit of text."

This changed in ES6 draft section 10.1 ("Source Text"): "The ECMAScript code is expressed using Unicode, version 5.1 or later. ECMAScript source text is a sequence of code points. All Unicode code point values from U+0000 to U+10FFFF, including surrogate code points, may occur in source text where permitted by the ECMAScript grammars."

This patch reflects this spec change.

BUG=v8:3617
LOG=Y
R=jochen@chromium.org

Review URL: https://codereview.chromium.org/640193002

git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@24510 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
bb117b4dde
commit
0dd69ec439
1
BUILD.gn
1
BUILD.gn
@ -453,6 +453,7 @@ source_set("v8_base") {
|
||||
"src/bytecodes-irregexp.h",
|
||||
"src/cached-powers.cc",
|
||||
"src/cached-powers.h",
|
||||
"src/char-predicates.cc",
|
||||
"src/char-predicates-inl.h",
|
||||
"src/char-predicates.h",
|
||||
"src/checks.cc",
|
||||
|
42
src/char-predicates.cc
Normal file
42
src/char-predicates.cc
Normal file
@ -0,0 +1,42 @@
|
||||
// Copyright 2011 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "src/char-predicates.h"
|
||||
|
||||
#ifdef V8_I18N_SUPPORT
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/urename.h"
|
||||
#endif // V8_I18N_SUPPORT
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Returns whether the code point |c| may start an identifier.
// Only valid for code points above U+FFFF (checked in debug builds).
bool SupplementaryPlanes::IsIDStart(uc32 c) {
  DCHECK(c > 0xFFFF);
#ifdef V8_I18N_SUPPORT
  // ICU's u_isIDStart() does not filter out code points that carry the
  // 'Pattern_Syntax' or 'Pattern_White_Space' properties, so delegating to it
  // is only correct for code points in the SMPs, which do not have those
  // properties.
  const bool starts_identifier = u_isIDStart(c) != 0;
  return starts_identifier;
#else
  // Fallback for builds without ICU: rejecting everything is incorrect, but
  // there is nothing else to consult here.
  return false;
#endif  // V8_I18N_SUPPORT
}
|
||||
|
||||
|
||||
// Returns whether the code point |c| may appear inside an identifier.
// Only valid for code points above U+FFFF (checked in debug builds).
bool SupplementaryPlanes::IsIDPart(uc32 c) {
  DCHECK(c > 0xFFFF);
#ifdef V8_I18N_SUPPORT
  // ICU's u_isIDPart() does not filter out code points that carry the
  // 'Pattern_Syntax' or 'Pattern_White_Space' properties, so delegating to it
  // is only correct for code points in the SMPs, which do not have those
  // properties.
  const bool continues_identifier = u_isIDPart(c) != 0;
  return continues_identifier;
#else
  // Fallback for builds without ICU: rejecting everything is incorrect, but
  // there is nothing else to consult here.
  return false;
#endif  // V8_I18N_SUPPORT
}
|
||||
}
|
||||
} // namespace v8::internal
|
@ -22,13 +22,24 @@ inline bool IsBinaryDigit(uc32 c);
|
||||
inline bool IsRegExpWord(uc32 c);
|
||||
inline bool IsRegExpNewline(uc32 c);
|
||||
|
||||
|
||||
struct SupplementaryPlanes {
|
||||
static bool IsIDStart(uc32 c);
|
||||
static bool IsIDPart(uc32 c);
|
||||
};
|
||||
|
||||
|
||||
// ES6 draft section 11.6
|
||||
// This includes '_', '$' and '\', and ID_Start according to
|
||||
// http://www.unicode.org/reports/tr31/, which consists of categories
|
||||
// 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
|
||||
// 'Pattern_Syntax' or 'Pattern_White_Space'.
|
||||
// For code points in the SMPs, we can resort to ICU (if available).
|
||||
struct IdentifierStart {
|
||||
static inline bool Is(uc32 c) { return unibrow::ID_Start::Is(c); }
|
||||
static inline bool Is(uc32 c) {
|
||||
if (c > 0xFFFF) return SupplementaryPlanes::IsIDStart(c);
|
||||
return unibrow::ID_Start::Is(c);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -37,8 +48,10 @@ struct IdentifierStart {
|
||||
// http://www.unicode.org/reports/tr31/, which consists of ID_Start,
|
||||
// the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
|
||||
// 'Pattern_Syntax' or 'Pattern_White_Space'.
|
||||
// For code points in the SMPs, we can resort to ICU (if available).
|
||||
struct IdentifierPart {
|
||||
static inline bool Is(uc32 c) {
|
||||
if (c > 0xFFFF) return SupplementaryPlanes::IsIDPart(c);
|
||||
return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
|
||||
}
|
||||
};
|
||||
@ -49,6 +62,7 @@ struct IdentifierPart {
|
||||
// \u180e stops being one as of Unicode 6.3.0, but ES6 adheres to Unicode 5.1,
|
||||
// so it is also included.
|
||||
// Further included are \u0009, \u000b, \u0020, \u00a0, \u000c, and \ufeff.
|
||||
// There are no category 'Zs' code points in the SMPs.
|
||||
struct WhiteSpace {
|
||||
static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
|
||||
};
|
||||
|
@ -212,9 +212,17 @@ class LiteralBuffer {
|
||||
}
|
||||
ConvertToTwoByte();
|
||||
}
|
||||
DCHECK(code_unit < 0x10000u);
|
||||
*reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
|
||||
position_ += kUC16Size;
|
||||
if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
||||
*reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
|
||||
position_ += kUC16Size;
|
||||
} else {
|
||||
*reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
|
||||
unibrow::Utf16::LeadSurrogate(code_unit);
|
||||
position_ += kUC16Size;
|
||||
*reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
|
||||
unibrow::Utf16::TrailSurrogate(code_unit);
|
||||
position_ += kUC16Size;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the is_one_byte_ flag. NOTE(review): presumably true until the
// buffer has been widened via ConvertToTwoByte() -- confirm against the class.
bool is_one_byte() const { return is_one_byte_; }
|
||||
@ -519,9 +527,25 @@ class Scanner {
|
||||
}
|
||||
|
||||
// Low-level scanning support.
|
||||
void Advance() { c0_ = source_->Advance(); }
|
||||
void Advance() {
|
||||
c0_ = source_->Advance();
|
||||
if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
|
||||
uc32 c1 = source_->Advance();
|
||||
if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
|
||||
source_->PushBack(c1);
|
||||
} else {
|
||||
c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the current character c0_ to the source stream and makes |ch| the
// new current character.
//
// The value being un-read is c0_, so the decision between one and two code
// units must look at c0_, not at |ch|. Advance() may have combined a
// surrogate pair into a single code point above kMaxNonSurrogateCharCode; in
// that case both halves go back, trail surrogate first and then the lead
// (NOTE(review): this assumes the stream's PushBack is last-in-first-out).
// Testing |ch| instead would push a single out-of-range unit when c0_ is a
// combined pair and |ch| is not, and would push two bogus surrogates in the
// opposite case.
void PushBack(uc32 ch) {
  if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
    source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
  } else {
    source_->PushBack(c0_);
  }
  c0_ = ch;
}
|
||||
|
||||
|
43
test/intl/general/smp-identifier.js
Normal file
43
test/intl/general/smp-identifier.js
Normal file
@ -0,0 +1,43 @@
|
||||
// Copyright 2014 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// Encodes the supplementary code point |c| as a string made of its UTF-16
// lead and trail surrogate code units.
function toSurrogatePair(c) {
  var offset = c - 0x10000;
  var lead = ((offset >>> 10) & 0x3FF) | 0xD800;
  var trail = (c & 0x3FF) | 0xDC00;
  return String.fromCharCode(lead, trail);
}
|
||||
|
||||
// Declares a variable whose name consists solely of code point |c| (written
// as a surrogate pair) and checks that this parses exactly when
// |is_id_start| is true.
function testIdStart(c, is_id_start) {
  var source = "var " + toSurrogatePair(c);
  print(source);
  if (!is_id_start) {
    assertThrows(source);
    return;
  }
  assertDoesNotThrow(source);
}
|
||||
|
||||
// Declares a variable named "v" followed by code point |c| (written as a
// surrogate pair) and checks that this parses exactly when |is_id_part| is
// true, i.e. when |c| is allowed in a non-initial identifier position.
//
// The second parameter was previously called is_id_start, a copy-paste
// leftover from testIdStart; it is positional, so callers are unaffected.
function testIdPart(c, is_id_part) {
  var source = "var v" + toSurrogatePair(c);
  print(source);
  if (is_id_part) {
    assertDoesNotThrow(source);
  } else {
    assertThrows(source);
  }
}
|
||||
|
||||
// Each row: [code points, valid as identifier start, valid as identifier part].
[
  // Valid both at the start of and inside an identifier.
  [[0x10403, 0x1043C, 0x16F9C, 0x10048, 0x1014D], true, true],
  // Valid inside an identifier only.
  [[0x101FD, 0x11002, 0x104A9], false, true],
  // Not valid anywhere in an identifier.
  [[0x10111, 0x1F4A9], false, false]
].forEach(function(row) {
  var is_id_start = row[1];
  var is_id_part = row[2];
  row[0].forEach(function(c) {
    testIdStart(c, is_id_start);
    testIdPart(c, is_id_part);
  });
});
|
7
test/mjsunit/parse-surrogates.js
Normal file
7
test/mjsunit/parse-surrogates.js
Normal file
@ -0,0 +1,7 @@
|
||||
// Copyright 2014 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// The parser must raise a SyntaxError for sources in which the lead surrogate
// \uD801 is followed by \uABCD, which is not a trail surrogate.
[
  "var \uD801\uABCD;",
  "'\\u000\uD801\uABCD'"
].forEach(function(source) {
  assertThrows(source, SyntaxError);
});
|
@ -86,5 +86,36 @@ TEST(UnicodePredicatesTest, IdentifierPart) {
|
||||
EXPECT_FALSE(IdentifierPart::Is(0x2E2F));
|
||||
}
|
||||
|
||||
|
||||
#ifdef V8_I18N_SUPPORT
// Checks IdentifierStart / IdentifierPart on code points above U+FFFF.
// Only built with V8_I18N_SUPPORT, the same macro that guards the ICU-backed
// supplementary-plane lookup.
TEST(UnicodePredicatesTest, SupplementaryPlaneIdentifiers) {
  // Both ID_Start and ID_Continue.
  static const uc32 kStartAndPart[] = {
      0x10403,  // Category Lu
      0x1043C,  // Category Ll
      0x16F9C,  // Category Lm
      0x10048,  // Category Lo
      0x1014D,  // Category Nl
  };
  for (size_t i = 0; i < sizeof(kStartAndPart) / sizeof(kStartAndPart[0]);
       ++i) {
    EXPECT_TRUE(IdentifierStart::Is(kStartAndPart[i]));
    EXPECT_TRUE(IdentifierPart::Is(kStartAndPart[i]));
  }

  // Only ID_Continue.
  static const uc32 kPartOnly[] = {
      0x101FD,  // Category Mn
      0x11002,  // Category Mc
      0x104A9,  // Category Nd
  };
  for (size_t i = 0; i < sizeof(kPartOnly) / sizeof(kPartOnly[0]); ++i) {
    EXPECT_FALSE(IdentifierStart::Is(kPartOnly[i]));
    EXPECT_TRUE(IdentifierPart::Is(kPartOnly[i]));
  }

  // Neither.
  static const uc32 kNeither[] = {
      0x10111,  // Category No
      0x1F4A9,  // Category So
  };
  for (size_t i = 0; i < sizeof(kNeither) / sizeof(kNeither[0]); ++i) {
    EXPECT_FALSE(IdentifierStart::Is(kNeither[i]));
    EXPECT_FALSE(IdentifierPart::Is(kNeither[i]));
  }
}
#endif  // V8_I18N_SUPPORT
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
@ -364,6 +364,7 @@
|
||||
'../../src/bytecodes-irregexp.h',
|
||||
'../../src/cached-powers.cc',
|
||||
'../../src/cached-powers.h',
|
||||
'../../src/char-predicates.cc',
|
||||
'../../src/char-predicates-inl.h',
|
||||
'../../src/char-predicates.h',
|
||||
'../../src/checks.cc',
|
||||
|
Loading…
Reference in New Issue
Block a user