Update unicode to 7.0.0.

And do not use code points with PATTERN_* property for identifier start.
Maintain that \u180E is a white space character.

BUG=v8:2892
LOG=Y
R=dpino@igalia.com, mathias@qiwi.be

Review URL: https://codereview.chromium.org/638643002

git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@24473 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
yangguo@chromium.org 2014-10-08 14:55:03 +00:00
parent c8b1c3e784
commit 8659e50723
15 changed files with 2934 additions and 1153 deletions

View File

@ -895,6 +895,8 @@ source_set("v8_base") {
"src/unicode-inl.h",
"src/unicode.cc",
"src/unicode.h",
"src/unicode-decoder.cc",
"src/unicode-decoder.h",
"src/unique.h",
"src/uri.h",
"src/utils-inl.h",

View File

@ -22,42 +22,40 @@ inline bool IsBinaryDigit(uc32 c);
inline bool IsRegExpWord(uc32 c);
inline bool IsRegExpNewline(uc32 c);
// ES6 draft section 11.6
// This includes '_', '$' and '\', and ID_Start according to
// http://www.unicode.org/reports/tr31/, which consists of categories
// 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
// 'Pattern_Syntax' or 'Pattern_White_Space'.
struct IdentifierStart {
static inline bool Is(uc32 c) {
switch (c) {
case '$': case '_': case '\\': return true;
default: return unibrow::Letter::Is(c);
}
}
static inline bool Is(uc32 c) { return unibrow::ID_Start::Is(c); }
};
// ES6 draft section 11.6
// This includes \u200c and \u200d, and ID_Continue according to
// http://www.unicode.org/reports/tr31/, which consists of ID_Start,
// the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
// 'Pattern_Syntax' or 'Pattern_White_Space'.
struct IdentifierPart {
static inline bool Is(uc32 c) {
return IdentifierStart::Is(c)
|| unibrow::Number::Is(c)
|| c == 0x200C // U+200C is Zero-Width Non-Joiner.
|| c == 0x200D // U+200D is Zero-Width Joiner.
|| unibrow::CombiningMark::Is(c)
|| unibrow::ConnectorPunctuation::Is(c);
return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
}
};
// WhiteSpace according to ECMA-262 5.1, 7.2.
// ES6 draft section 11.2
// This includes all code points of Unicode category 'Zs'.
// \u180e stops being one as of Unicode 6.3.0, but ES6 adheres to Unicode 5.1,
// so it is also included.
// Further included are \u0009, \u000b, \u0020, \u00a0, \u000c, and \ufeff.
struct WhiteSpace {
static inline bool Is(uc32 c) {
return c == 0x0009 || // <TAB>
c == 0x000B || // <VT>
c == 0x000C || // <FF>
c == 0xFEFF || // <BOM>
// \u0020 and \u00A0 are included in unibrow::WhiteSpace.
unibrow::WhiteSpace::Is(c);
}
static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
};
// WhiteSpace and LineTerminator according to ECMA-262 5.1, 7.2 and 7.3.
// WhiteSpace and LineTerminator according to ES6 draft section 11.2 and 11.3
// LineTerminator consists of \u000a, \u000d, \u2028, and \u2029.
struct WhiteSpaceOrLineTerminator {
static inline bool Is(uc32 c) {
return WhiteSpace::Is(c) || unibrow::LineTerminator::Is(c);

View File

@ -20,6 +20,7 @@
#include "src/regexp-stack.h"
#include "src/runtime/runtime.h"
#include "src/string-search.h"
#include "src/unicode-decoder.h"
#ifndef V8_INTERPRETED_REGEXP
#if V8_TARGET_ARCH_IA32

View File

@ -20,6 +20,7 @@
#include "src/property-details.h"
#include "src/smart-pointers.h"
#include "src/unicode-inl.h"
#include "src/unicode-decoder.h"
#include "src/zone.h"
#if V8_TARGET_ARCH_ARM

View File

@ -15,6 +15,7 @@
#include "src/list.h"
#include "src/token.h"
#include "src/unicode-inl.h"
#include "src/unicode-decoder.h"
#include "src/utils.h"
namespace v8 {

78
src/unicode-decoder.cc Normal file
View File

@ -0,0 +1,78 @@
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/unicode-inl.h"
#include "src/unicode-decoder.h"
#include <stdio.h>
#include <stdlib.h>
namespace unibrow {
// Scans the whole UTF-8 input once, recording its length in UTF-16 code units
// in utf16_length_ and caching the leading code units in |buffer|.
//
// buffer:        destination for the cached UTF-16 code units.
// buffer_length: capacity of |buffer| in UTF-16 code units.
// stream:        start of the UTF-8 input.
// stream_length: number of input bytes.
//
// When the input does not fit, unbuffered_start_ is left pointing at the first
// input byte whose character was not cached. If the character that overflowed
// needed a surrogate pair while only one slot was free, that slot stays empty
// and last_byte_of_buffer_unused_ is set.
void Utf8DecoderBase::Reset(uint16_t* buffer, unsigned buffer_length,
const uint8_t* stream, unsigned stream_length) {
// Assume everything will fit in the buffer and stream won't be needed.
last_byte_of_buffer_unused_ = false;
unbuffered_start_ = NULL;
bool writing_to_buffer = true;
// Loop until stream is read, writing to buffer as long as buffer has space.
unsigned utf16_length = 0;
while (stream_length != 0) {
// cursor is filled in by ValueOf; the code below advances the stream by it.
unsigned cursor = 0;
uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
DCHECK(cursor > 0 && cursor <= stream_length);
stream += cursor;
stream_length -= cursor;
// Characters above kMaxNonSurrogateCharCode take two UTF-16 code units.
bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
utf16_length += is_two_characters ? 2 : 1;
// Don't need to write to the buffer, but still need utf16_length.
if (!writing_to_buffer) continue;
// Write out the characters to the buffer.
// utf16_length already counts the current character, so "<=" (not "<")
// is the test for "this character still fits".
if (utf16_length <= buffer_length) {
if (is_two_characters) {
*buffer++ = Utf16::LeadSurrogate(character);
*buffer++ = Utf16::TrailSurrogate(character);
} else {
*buffer++ = character;
}
if (utf16_length == buffer_length) {
// Just wrote last character of buffer; anything that follows is
// decoded later, starting from the current stream position.
writing_to_buffer = false;
unbuffered_start_ = stream;
}
continue;
}
// Have gone over buffer. Writing stops as soon as the buffer is exactly
// full, so this is only reached when one slot was free and the current
// character needs two.
// Last char of buffer is unused, set cursor back.
DCHECK(is_two_characters);
writing_to_buffer = false;
last_byte_of_buffer_unused_ = true;
// stream was already advanced; step back to the start of this character.
unbuffered_start_ = stream - cursor;
}
utf16_length_ = utf16_length;
}
// Decodes UTF-8 starting at |stream| and writes |data_length| UTF-16 code
// units to |data|.
//
// stream:      first byte of the input that was not cached by Reset().
// data:        output buffer with room for at least |data_length| code units.
// data_length: number of UTF-16 code units to produce.
//
// Callers are expected not to cut a surrogate pair in half. Should that
// happen anyway, debug builds fail the DCHECK before anything is written,
// and release builds emit only the lead surrogate and stop. Previously the
// trail surrogate was written past the end of |data| and the unsigned
// counter wrapped around, so the loop kept writing.
void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream, uint16_t* data,
                                     unsigned data_length) {
  while (data_length != 0) {
    unsigned cursor = 0;
    // There's a total lack of bounds checking for stream
    // as it was already done in Reset.
    uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
    stream += cursor;
    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
      // Check for room *before* writing the pair.
      DCHECK(data_length > 1);
      if (data_length < 2) {
        // Only one slot left: truncate at the code-unit level, which is what
        // a plain copy of the first |length| cached code units would do too.
        *data = Utf16::LeadSurrogate(character);
        return;
      }
      *data++ = Utf16::LeadSurrogate(character);
      *data++ = Utf16::TrailSurrogate(character);
      data_length -= 2;
    } else {
      *data++ = static_cast<uint16_t>(character);
      data_length -= 1;
    }
  }
}
} // namespace unibrow

121
src/unicode-decoder.h Normal file
View File

@ -0,0 +1,121 @@
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_UNICODE_DECODER_H_
#define V8_UNICODE_DECODER_H_
#include <sys/types.h>
#include "src/globals.h"
namespace unibrow {
// Buffer-size-independent part of the UTF-8 to UTF-16 decoder. It tracks the
// total UTF-16 length of the input and where decoding must resume once the
// cache supplied by the subclass has been used up.
class Utf8DecoderBase {
public:
// Initialization done in subclass.
// Creates an empty decoder (no input, UTF-16 length 0).
inline Utf8DecoderBase();
// Immediately runs Reset() with the given buffer and input.
inline Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length,
const uint8_t* stream, unsigned stream_length);
// Length of the most recently scanned input, in UTF-16 code units.
inline unsigned Utf16Length() const { return utf16_length_; }
protected:
// This reads all characters and sets the utf16_length_.
// The first buffer_length utf16 chars are cached in the buffer.
void Reset(uint16_t* buffer, unsigned buffer_length, const uint8_t* stream,
unsigned stream_length);
// Decodes from |stream| and writes |length| UTF-16 code units to |data|;
// used for the part of the input that did not fit in the cache.
static void WriteUtf16Slow(const uint8_t* stream, uint16_t* data,
unsigned length);
// First input byte that was not cached, or NULL if everything fit.
const uint8_t* unbuffered_start_;
// Total length of the input in UTF-16 code units.
unsigned utf16_length_;
// True when the final cache slot was left empty because the next character
// needed a surrogate pair (despite the name, the unit is a UTF-16 code unit).
bool last_byte_of_buffer_unused_;
private:
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
};
// UTF-8 decoder that owns an inline cache of kBufferSize UTF-16 code units.
template <unsigned kBufferSize>
class Utf8Decoder : public Utf8DecoderBase {
public:
// Creates an empty decoder; buffer_ is left uninitialized.
inline Utf8Decoder() {}
// Scans |length| bytes of |stream| immediately.
inline Utf8Decoder(const char* stream, unsigned length);
// Rescans with new input, replacing any previous state.
inline void Reset(const char* stream, unsigned length);
// Copies at most |length| UTF-16 code units to |data| and returns the number
// copied, which is |length| capped at Utf16Length().
inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
private:
// Cache for the leading UTF-16 code units of the input.
uint16_t buffer_[kBufferSize];
};
// Builds an empty decoder: nothing left to decode and a UTF-16 length of 0.
Utf8DecoderBase::Utf8DecoderBase()
    : unbuffered_start_(NULL),
      utf16_length_(0),
      last_byte_of_buffer_unused_(false) {
}
// Builds a decoder and scans |stream| straight away. No member initializers
// are listed because Reset() assigns every data member.
Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
                                 unsigned buffer_length,
                                 const uint8_t* stream,
                                 unsigned stream_length) {
  Reset(buffer, buffer_length, stream, stream_length);
}
// Scans |length| bytes of |stream| on construction, using the inline buffer_
// as the cache handed to the base class.
template <unsigned kBufferSize>
Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
    : Utf8DecoderBase(buffer_,
                      kBufferSize,
                      reinterpret_cast<const uint8_t*>(stream),
                      length) {
}
// Rescans with new input. The char-based stream is reinterpreted as bytes and
// forwarded to the base class together with the inline cache.
template <unsigned kBufferSize>
void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(stream);
  Utf8DecoderBase::Reset(buffer_, kBufferSize, bytes, length);
}
// Copies at most |length| UTF-16 code units of the scanned input to |data|.
// Returns the number of code units produced, i.e. |length| capped at the
// total UTF-16 length of the input.
template <unsigned kBufferSize>
unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
                                              unsigned length) const {
  DCHECK(length > 0);
  // Never produce more than the input contains.
  if (utf16_length_ < length) length = utf16_length_;

  // Number of valid code units held in buffer_; the final slot is empty when
  // a surrogate pair did not fit.
  unsigned cached = kBufferSize;
  if (last_byte_of_buffer_unused_) cached = kBufferSize - 1;

  // Fast path: bulk-copy whatever part of the request the cache covers.
  const bool served_from_cache = length <= cached;
  const unsigned copy_count = served_from_cache ? length : cached;
  v8::internal::MemCopy(data, buffer_, copy_count * sizeof(uint16_t));
  if (served_from_cache) return length;

  // Slow path: decode the remainder directly from the original input.
  DCHECK(unbuffered_start_ != NULL);
  WriteUtf16Slow(unbuffered_start_, data + cached, length - cached);
  return length;
}
// Constants and helpers for the Latin-1 character range.
class Latin1 {
public:
// Largest code unit that is still a Latin-1 character.
static const unsigned kMaxChar = 0xff;
// Returns 0 if the character does not convert to a single Latin-1 character
// or if the character does not convert back to Latin-1 via the inverse
// operation (upper to lower, etc).
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
// Maps a code unit above Latin1::kMaxChar to its Latin-1 counterpart, or
// returns 0 when there is none.
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
  DCHECK(c > Latin1::kMaxChar);
  // These are equivalent characters in Unicode: U+039C and U+03BC both
  // correspond to U+00B5.
  if (c == 0x39c || c == 0x3bc) return 0xb5;
  // U+0178 is the uppercase form of the Latin-1 character U+00FF, but lies
  // outside of Latin-1 itself.
  if (c == 0x178) return 0xff;
  return 0;
}
} // namespace unibrow
#endif // V8_UNICODE_DECODER_H_

View File

@ -57,22 +57,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
DCHECK(c > Latin1::kMaxChar);
switch (c) {
// This are equivalent characters in unicode.
case 0x39c:
case 0x3bc:
return 0xb5;
// This is an uppercase of a Latin-1 character
// outside of Latin-1.
case 0x178:
return 0xff;
}
return 0;
}
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) {
@ -153,53 +137,6 @@ unsigned Utf8::Length(uchar c, int previous) {
}
}
Utf8DecoderBase::Utf8DecoderBase()
: unbuffered_start_(NULL),
utf16_length_(0),
last_byte_of_buffer_unused_(false) {}
Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length) {
Reset(buffer, buffer_length, stream, stream_length);
}
template<unsigned kBufferSize>
Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
: Utf8DecoderBase(buffer_,
kBufferSize,
reinterpret_cast<const uint8_t*>(stream),
length) {
}
template<unsigned kBufferSize>
void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
Utf8DecoderBase::Reset(buffer_,
kBufferSize,
reinterpret_cast<const uint8_t*>(stream),
length);
}
template <unsigned kBufferSize>
unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
unsigned length) const {
DCHECK(length > 0);
if (length > utf16_length_) length = utf16_length_;
// memcpy everything in buffer.
unsigned buffer_length =
last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
if (length <= buffer_length) return length;
DCHECK(unbuffered_start_ != NULL);
// Copy the rest the slow way.
WriteUtf16Slow(unbuffered_start_,
data + buffer_length,
length - buffer_length);
return length;
}
} // namespace unibrow
#endif // V8_UNICODE_INL_H_

File diff suppressed because it is too large Load Diff

View File

@ -44,6 +44,7 @@ class Predicate {
CacheEntry entries_[kSize];
};
// A cache used in case conversion. It caches the value for characters
// that either have no mapping or map to a single character independent
// of context. Characters that map to more than one character or that
@ -70,6 +71,7 @@ class Mapping {
CacheEntry entries_[kSize];
};
class UnicodeData {
private:
friend class Test;
@ -77,6 +79,7 @@ class UnicodeData {
static const uchar kMaxCodePoint;
};
class Utf16 {
public:
static inline bool IsSurrogatePair(int lead, int trail) {
@ -113,14 +116,6 @@ class Utf16 {
}
};
class Latin1 {
public:
static const unsigned kMaxChar = 0xff;
// Returns 0 if character does not convert to single latin-1 character
// or if the character doesn't not convert back to latin-1 via inverse
// operation (upper to lower, etc).
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
class Utf8 {
public:
@ -155,45 +150,6 @@ class Utf8 {
unsigned* cursor);
};
class Utf8DecoderBase {
public:
// Initialization done in subclass.
inline Utf8DecoderBase();
inline Utf8DecoderBase(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length);
inline unsigned Utf16Length() const { return utf16_length_; }
protected:
// This reads all characters and sets the utf16_length_.
// The first buffer_length utf16 chars are cached in the buffer.
void Reset(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length);
static void WriteUtf16Slow(const uint8_t* stream,
uint16_t* data,
unsigned length);
const uint8_t* unbuffered_start_;
unsigned utf16_length_;
bool last_byte_of_buffer_unused_;
private:
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
};
template <unsigned kBufferSize>
class Utf8Decoder : public Utf8DecoderBase {
public:
inline Utf8Decoder() {}
inline Utf8Decoder(const char* stream, unsigned length);
inline void Reset(const char* stream, unsigned length);
inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
private:
uint16_t buffer_[kBufferSize];
};
struct Uppercase {
static bool Is(uchar c);
};
@ -203,7 +159,10 @@ struct Lowercase {
struct Letter {
static bool Is(uchar c);
};
struct Number {
struct ID_Start {
static bool Is(uchar c);
};
struct ID_Continue {
static bool Is(uchar c);
};
struct WhiteSpace {
@ -212,12 +171,6 @@ struct WhiteSpace {
struct LineTerminator {
static bool Is(uchar c);
};
struct CombiningMark {
static bool Is(uchar c);
};
struct ConnectorPunctuation {
static bool Is(uchar c);
};
struct ToLowercase {
static const int kMaxWidth = 3;
static const bool kIsToLower = true;

View File

@ -37,6 +37,7 @@
#include "src/api.h"
#include "src/factory.h"
#include "src/objects.h"
#include "src/unicode-decoder.h"
#include "test/cctest/cctest.h"
// Adapted from http://en.wikipedia.org/wiki/Multiply-with-carry

View File

@ -35,3 +35,22 @@ assertEquals(87, y);
assertTrue(!z && typeof z == 'undefined');
if (false) { var z; }
assertTrue(!z && typeof z == 'undefined');
assertThrows("var \u2E2F;", SyntaxError);
assertThrows("var \\u2E2F;", SyntaxError);
assertDoesNotThrow("var \u2118;");
assertDoesNotThrow("var \\u2118;");
assertDoesNotThrow("var \u212E;");
assertDoesNotThrow("var \\u212E;");
assertDoesNotThrow("var \u309B;");
assertDoesNotThrow("var \\u309B;");
assertDoesNotThrow("var \u309C;");
assertDoesNotThrow("var \\u309C;");
assertDoesNotThrow("var $\u00B7;");
assertDoesNotThrow("var $\u0387;");
assertDoesNotThrow("var $\u1369;");
assertDoesNotThrow("var $\u1370;");
assertDoesNotThrow("var $\u1371;");
assertDoesNotThrow("var $\u19DA;");

View File

@ -0,0 +1,90 @@
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/char-predicates.h"
#include "src/unicode.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace v8 {
namespace internal {
TEST(UnicodePredicatesTest, WhiteSpace) {
  // JS recognizes all white spaces in Unicode 5.1, so \u180E has to be
  // accepted even though it stopped being a white space in Unicode 6.3.0.
  static const uc32 kWhiteSpaces[] = {0x0009, 0x000B, 0x000C, ' ',
                                      0x00A0, 0x180E, 0xFEFF};
  const size_t count = sizeof(kWhiteSpaces) / sizeof(kWhiteSpaces[0]);
  for (size_t i = 0; i < count; ++i) {
    EXPECT_TRUE(WhiteSpace::Is(kWhiteSpaces[i]))
        << "code point " << kWhiteSpaces[i];
  }
}
TEST(UnicodePredicatesTest, WhiteSpaceOrLineTerminator) {
  // JS recognizes all white spaces in Unicode 5.1, so \u180E has to be
  // accepted even though it stopped being a white space in Unicode 6.3.0.
  static const uc32 kAccepted[] = {
      // White spaces
      0x0009, 0x000B, 0x000C, ' ', 0x00A0, 0x180E, 0xFEFF,
      // Line terminators
      0x000A, 0x000D, 0x2028, 0x2029};
  const size_t count = sizeof(kAccepted) / sizeof(kAccepted[0]);
  for (size_t i = 0; i < count; ++i) {
    EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(kAccepted[i]))
        << "code point " << kAccepted[i];
  }
}
// Checks which code points may begin an identifier.
TEST(UnicodePredicatesTest, IdentifierStart) {
  // ASCII characters that are accepted in addition to letters.
  EXPECT_TRUE(IdentifierStart::Is('$'));
  EXPECT_TRUE(IdentifierStart::Is('_'));
  EXPECT_TRUE(IdentifierStart::Is('\\'));

  // http://www.unicode.org/reports/tr31/
  // Other_ID_Start
  EXPECT_TRUE(IdentifierStart::Is(0x2118));
  EXPECT_TRUE(IdentifierStart::Is(0x212E));
  EXPECT_TRUE(IdentifierStart::Is(0x309B));
  EXPECT_TRUE(IdentifierStart::Is(0x309C));

  // Issue 2892:
  // \u2E2F has the Pattern_Syntax property, excluding it from ID_Start.
  // Check the predicate under test as well as the underlying table, so the
  // regression stays covered if IdentifierStart ever stops being a plain
  // forward to unibrow::ID_Start.
  EXPECT_FALSE(IdentifierStart::Is(0x2E2F));
  EXPECT_FALSE(unibrow::ID_Start::Is(0x2E2F));
}
TEST(UnicodePredicatesTest, IdentifierPart) {
  // Code points that must be accepted inside an identifier.
  static const uc32 kAccepted[] = {
      // ASCII specials
      '$', '_', '\\',
      // Zero-width non-joiner and zero-width joiner
      0x200C, 0x200D,
      // http://www.unicode.org/reports/tr31/
      // Other_ID_Start
      0x2118, 0x212E, 0x309B, 0x309C,
      // Other_ID_Continue
      0x00B7, 0x0387, 0x1369, 0x1370, 0x1371, 0x19DA};
  const size_t count = sizeof(kAccepted) / sizeof(kAccepted[0]);
  for (size_t i = 0; i < count; ++i) {
    EXPECT_TRUE(IdentifierPart::Is(kAccepted[i]))
        << "code point " << kAccepted[i];
  }

  // Issue 2892:
  // \u2E2F has the Pattern_Syntax property, so it must not be accepted as
  // part of an identifier.
  EXPECT_FALSE(IdentifierPart::Is(0x2E2F));
}
} // namespace internal
} // namespace v8

View File

@ -58,6 +58,7 @@
'run-all-unittests.cc',
'test-utils.h',
'test-utils.cc',
'unicode/unicode-predicates-unittest.cc',
],
'conditions': [
['v8_target_arch=="arm"', {

View File

@ -808,6 +808,8 @@
'../../src/unicode-inl.h',
'../../src/unicode.cc',
'../../src/unicode.h',
'../../src/unicode-decoder.cc',
'../../src/unicode-decoder.h',
'../../src/unique.h',
'../../src/uri.h',
'../../src/utils-inl.h',
@ -1694,6 +1696,12 @@
'../../src/mksnapshot.cc',
],
'conditions': [
['v8_enable_i18n_support==1', {
'dependencies': [
'<(icu_gyp_path):icui18n',
'<(icu_gyp_path):icuuc',
]
}],
['want_separate_host_toolset==1', {
'toolsets': ['host'],
}, {