Fix inconsistencies wrt whitespaces.
\u0085 (NEL) is now considered a whitespace in accordance with http://www.unicode.org/Public/6.3.0/ucd/PropList.txt R=mstarzinger@chromium.org BUG=v8:3109 LOG=Y Review URL: https://codereview.chromium.org/146983007 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@19196 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
5f8105af3c
commit
d0f57e1195
@ -497,6 +497,8 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
|
||||
__ b(ls, &success);
|
||||
// \u00a0 (NBSP).
|
||||
__ cmp(r0, Operand(0x00a0 - '\t'));
|
||||
// \u0085 (NEL).
|
||||
__ cmp(r0, Operand(0x0085 - '\t'), ne);
|
||||
BranchOrBacktrack(ne, on_no_match);
|
||||
__ bind(&success);
|
||||
return true;
|
||||
|
@ -66,6 +66,14 @@ struct IdentifierPart {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct WhiteSpace {
|
||||
static inline bool Is(uc32 c) {
|
||||
return unibrow::WhiteSpace::Is(c) ||
|
||||
c == 0xFEFF; // BYTE ORDER MARK is a white space in ECMA-262 5.1, 7.2.
|
||||
}
|
||||
};
|
||||
|
||||
} } // namespace v8::internal
|
||||
|
||||
#endif // V8_CHAR_PREDICATES_H_
|
||||
|
@ -526,6 +526,9 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
|
||||
__ j(below_equal, &success, Label::kNear);
|
||||
// \u00a0 (NBSP).
|
||||
__ cmp(eax, 0x00a0 - '\t');
|
||||
__ j(equal, &success, Label::kNear);
|
||||
// \u0085 (NEL).
|
||||
__ cmp(eax, 0x0085 - '\t');
|
||||
BranchOrBacktrack(not_equal, on_no_match);
|
||||
__ bind(&success);
|
||||
return true;
|
||||
|
@ -3597,9 +3597,10 @@ class AlternativeGenerationList {
|
||||
|
||||
|
||||
// The '2' variant has inclusive from and exclusive to.
|
||||
static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
|
||||
0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
|
||||
0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
|
||||
static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
|
||||
0x0085, 0x0086, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F,
|
||||
0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060,
|
||||
0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
|
||||
static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
|
||||
|
||||
static const int kWordRanges[] = {
|
||||
|
@ -6105,8 +6105,10 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
|
||||
// Fast check for a junk value. A valid string may start from a
|
||||
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
|
||||
// the 'I' character ('Infinity'). All of that have codes not greater than
|
||||
// '9' except 'I' and NBSP.
|
||||
if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
|
||||
// '9' except 'I', NBSP and NEL.
|
||||
if (data[start_pos] != 'I' &&
|
||||
data[start_pos] != 0xa0 &&
|
||||
data[start_pos] != 0x85) {
|
||||
return isolate->heap()->nan_value();
|
||||
}
|
||||
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
|
||||
@ -6541,11 +6543,6 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToUpperCase) {
|
||||
}
|
||||
|
||||
|
||||
// Returns true for characters accepted by unibrow::WhiteSpace, and also
// for the code points 0x200b and 0xfeff.
static inline bool IsTrimWhiteSpace(unibrow::uchar c) {
  if (unibrow::WhiteSpace::Is(c)) return true;
  if (c == 0x200b) return true;
  return c == 0xfeff;
}
|
||||
|
||||
|
||||
RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
|
||||
HandleScope scope(isolate);
|
||||
ASSERT(args.length() == 3);
|
||||
@ -6558,15 +6555,17 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
|
||||
int length = string->length();
|
||||
|
||||
int left = 0;
|
||||
UnicodeCache* unicode_cache = isolate->unicode_cache();
|
||||
if (trimLeft) {
|
||||
while (left < length && IsTrimWhiteSpace(string->Get(left))) {
|
||||
while (left < length && unicode_cache->IsWhiteSpace(string->Get(left))) {
|
||||
left++;
|
||||
}
|
||||
}
|
||||
|
||||
int right = length;
|
||||
if (trimRight) {
|
||||
while (right > left && IsTrimWhiteSpace(string->Get(right - 1))) {
|
||||
while (right > left &&
|
||||
unicode_cache->IsWhiteSpace(string->Get(right - 1))) {
|
||||
right--;
|
||||
}
|
||||
}
|
||||
|
@ -144,7 +144,7 @@ class UnicodeCache {
|
||||
unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
|
||||
unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
|
||||
unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
|
||||
unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
|
||||
unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace;
|
||||
StaticResource<Utf8Decoder> utf8_decoder_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
|
||||
|
@ -552,6 +552,9 @@ bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type,
|
||||
__ j(below_equal, &success, Label::kNear);
|
||||
// \u00a0 (NBSP).
|
||||
__ cmpl(rax, Immediate(0x00a0 - '\t'));
|
||||
__ j(equal, &success, Label::kNear);
|
||||
// \u0085 (NEL).
|
||||
__ cmpl(rax, Immediate(0x0085 - '\t'));
|
||||
BranchOrBacktrack(not_equal, on_no_match);
|
||||
__ bind(&success);
|
||||
return true;
|
||||
|
@ -445,21 +445,7 @@ static bool NotDigit(uc16 c) {
|
||||
|
||||
|
||||
static bool IsWhiteSpace(uc16 c) {
|
||||
switch (c) {
|
||||
case 0x09:
|
||||
case 0x0A:
|
||||
case 0x0B:
|
||||
case 0x0C:
|
||||
case 0x0d:
|
||||
case 0x20:
|
||||
case 0xA0:
|
||||
case 0x2028:
|
||||
case 0x2029:
|
||||
case 0xFEFF:
|
||||
return true;
|
||||
default:
|
||||
return unibrow::Space::Is(c);
|
||||
}
|
||||
return v8::internal::WhiteSpace::Is(c);
|
||||
}
|
||||
|
||||
|
||||
|
3
test/mjsunit/third_party/string-trim.js
vendored
3
test/mjsunit/third_party/string-trim.js
vendored
@ -66,7 +66,8 @@ var whitespace = [
|
||||
{s : '\u3000', t : 'IDEOGRAPHIC SPACE'},
|
||||
{s : '\u2028', t : 'LINE SEPARATOR'},
|
||||
{s : '\u2029', t : 'PARAGRAPH SEPARATOR'},
|
||||
{s : '\u200B', t : 'ZERO WIDTH SPACE (category Cf)'}
|
||||
// \u200B is not a whitespace character according to Unicode 6.3.0.
|
||||
// {s : '\u200B', t : 'ZERO WIDTH SPACE (category Cf)'}
|
||||
];
|
||||
|
||||
for (var i = 0; i < whitespace.length; i++) {
|
||||
|
134
test/mjsunit/whitespaces.js
Normal file
134
test/mjsunit/whitespaces.js
Normal file
@ -0,0 +1,134 @@
|
||||
// Copyright 2014 the V8 project authors. All rights reserved.
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following
|
||||
// disclaimer in the documentation and/or other materials provided
|
||||
// with the distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
var whitespaces = [
|
||||
// Whitespaces defined in ECMA-262 5.1, 7.2
|
||||
0x0009, // Tab TAB
|
||||
0x000B, // Vertical Tab VT
|
||||
0x000C, // Form Feed FF
|
||||
0x0020, // Space SP
|
||||
0x00A0, // No-break space NBSP
|
||||
0xFEFF, // Byte Order Mark BOM
|
||||
// Unicode whitespaces
|
||||
0x000A, // Line Feed LF
|
||||
0x000D, // Carriage Return CR
|
||||
0x0085, // Next Line NEL
|
||||
0x1680, // Ogham Space Mark
|
||||
0x180E, // Mongolian Vowel Separator
|
||||
0x2000, // EN QUAD
|
||||
0x2001, // EM QUAD
|
||||
0x2002, // EN SPACE
|
||||
0x2003, // EM SPACE
|
||||
0x2004, // THREE-PER-EM SPACE
|
||||
0x2005, // FOUR-PER-EM SPACE
|
||||
0x2006, // SIX-PER-EM SPACE
|
||||
0x2007, // FIGURE SPACE
|
||||
0x2008, // PUNCTUATION SPACE
|
||||
0x2009, // THIN SPACE
|
||||
0x200A, // HAIR SPACE
|
||||
0x2028, // LINE SEPARATOR
|
||||
0x2029, // PARAGRAPH SEPARATOR
|
||||
0x202F, // NARROW NO-BREAK SPACE
|
||||
0x205F, // MEDIUM MATHEMATICAL SPACE
|
||||
0x3000, // IDEOGRAPHIC SPACE
|
||||
];
|
||||
|
||||
// Add single twobyte char to force twobyte representation.
|
||||
// Interestingly, snowman is not "white" space :)
|
||||
var twobyte = "\u2603";
|
||||
var onebyte = "\u007E";
|
||||
var twobytespace = "\u2000";
|
||||
var onebytespace = "\u0020";
|
||||
|
||||
// Returns true when the first UTF-16 code unit of |c| is listed in the
// |whitespaces| table.
function is_whitespace(c) {
  var code_unit = c.charCodeAt(0);
  return whitespaces.indexOf(code_unit) >= 0;
}
|
||||
|
||||
// Checks the regexp classes \s and \S against a two-character string:
// str[0] is the character under test, str[1] is a non-whitespace postfix.
function test_regexp(str) {
  var test_char = str[0];
  var postfix = str[1];
  var space_match = str.match(/\s/);
  var non_space_match = str.match(/\S/);
  if (!is_whitespace(test_char)) {
    // A non-whitespace character is the first \S hit, and \s finds nothing.
    assertEquals(test_char, non_space_match[0]);
    assertNull(space_match);
    return;
  }
  // A whitespace character is the first \s hit; \S skips it and lands on
  // the postfix.
  assertEquals(test_char, space_match[0]);
  assertEquals(postfix, non_space_match[0]);
}
|
||||
|
||||
// Checks String.prototype.trim: three copies of |c| before |infix| and one
// after must be stripped exactly when |c| is a whitespace character.
function test_trim(c, infix) {
  var str = c + c + c + infix + c;
  var expected = is_whitespace(c) ? infix : str;
  assertEquals(expected, str.trim());
}
|
||||
|
||||
// Checks that parseInt skips leading copies of |c| exactly when |c| is a
// whitespace character. |postfix| is appended after the digits "123".
function test_parseInt(c, postfix) {
  // Skip if prefix is a digit. Both bounds are compared as strings: the
  // numeric bound `c <= 9` converts |c| to a number, and a whitespace
  // string converts to 0, so whitespace characters at or above "0" (NBSP,
  // NEL, ...) were skipped as if they were digits.
  if (c >= "0" && c <= "9") return;
  var str = c + c + "123" + postfix;
  if (is_whitespace(c)) {
    assertEquals(123, parseInt(str));
  } else {
    assertEquals(NaN, parseInt(str));
  }
}
|
||||
|
||||
// Checks that source text consisting of a quoted string literal padded on
// both sides with two copies of |c| evaluates to |content|. Only runs for
// whitespace characters.
function test_eval(c, content) {
  if (is_whitespace(c)) {
    var padding = c + c;
    var str = padding + "'" + content + "'" + padding;
    assertEquals(content, eval(str));
  }
}
|
||||
|
||||
// Checks string-to-number conversion: "123" wrapped in |c| and followed by
// |postfix| must convert to 123 exactly when |c| is a whitespace character,
// and to NaN otherwise.
function test_stringtonumber(c, postfix) {
  // Skip if prefix is a digit. Both bounds are compared as strings: the
  // numeric bound `c <= 9` converts |c| to a number, and a whitespace
  // string converts to 0, so whitespace characters at or above "0" (NBSP,
  // NEL, ...) were skipped as if they were digits.
  if (c >= "0" && c <= "9") return;
  var result = 1 + Number(c + "123" + c + postfix);
  if (is_whitespace(c)) {
    assertEquals(124, result);
  } else {
    assertEquals(NaN, result);
  }
}
|
||||
|
||||
// Run every check for each UTF-16 code unit, pairing it with both a
// one-byte and a two-byte companion string.
for (var i = 0; i < 0x10000; i++) {
  // Declared with var: assigning to an undeclared name creates an implicit
  // global and throws a ReferenceError in strict mode.
  var c = String.fromCharCode(i);
  test_regexp(c + onebyte);
  test_regexp(c + twobyte);
  test_trim(c, onebyte + "trim");
  test_trim(c, twobyte + "trim");
  test_parseInt(c, onebyte);
  test_parseInt(c, twobyte);
  test_eval(c, onebyte);
  test_eval(c, twobyte);
  test_stringtonumber(c, onebytespace);
  test_stringtonumber(c, twobytespace);
}
|
Loading…
Reference in New Issue
Block a user