Fix inconsistencies wrt whitespaces.

\u0085 (NEL) is now considered whitespace, in accordance with http://www.unicode.org/Public/6.3.0/ucd/PropList.txt R=mstarzinger@chromium.org BUG=v8:3109 LOG=Y Review URL: https://codereview.chromium.org/146983007 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@19196 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2014-02-07 12:34:45 +00:00 · 2014-02-07 12:34:45 +00:00 · d0f57e1195
commit d0f57e1195
parent 5f8105af3c
10 changed files with 166 additions and 29 deletions
--- a/src/arm/regexp-macro-assembler-arm.cc
+++ b/src/arm/regexp-macro-assembler-arm.cc
@ -497,6 +497,8 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
      __ b(ls, &success);
      // \u00a0 (NBSP).
      __ cmp(r0, Operand(0x00a0 - '\t'));
      // \u0085 (NEL).
      __ cmp(r0, Operand(0x0085 - '\t'), ne);
      BranchOrBacktrack(ne, on_no_match);
      __ bind(&success);
      return true;
--- a/src/char-predicates.h
+++ b/src/char-predicates.h
@ -66,6 +66,14 @@ struct IdentifierPart {
  }
 };
 // White space predicate: accepts every character unibrow::WhiteSpace
 // accepts and, in addition, U+FEFF.
 struct WhiteSpace {
  static inline bool Is(uc32 c) {
    if (unibrow::WhiteSpace::Is(c)) return true;
    // BYTE ORDER MARK is a white space in ECMA-262 5.1, 7.2.
    return c == 0xFEFF;
  }
 };
 } }  // namespace v8::internal
 #endif  // V8_CHAR_PREDICATES_H_
--- a/src/ia32/regexp-macro-assembler-ia32.cc
+++ b/src/ia32/regexp-macro-assembler-ia32.cc
@ -526,6 +526,9 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
      __ j(below_equal, &success, Label::kNear);
      // \u00a0 (NBSP).
      __ cmp(eax, 0x00a0 - '\t');
      __ j(equal, &success, Label::kNear);
      // \u0085 (NEL).
      __ cmp(eax, 0x0085 - '\t');
      BranchOrBacktrack(not_equal, on_no_match);
      __ bind(&success);
      return true;
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@ -3597,9 +3597,10 @@ class AlternativeGenerationList {
 // The '2' variant has inclusive from and exclusive to.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
+static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
-    0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
+    0x0085, 0x0086, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F,
-    0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
+    0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060,
    0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
 static const int kWordRanges[] = {
--- a/src/runtime.cc
+++ b/src/runtime.cc
@ -6105,8 +6105,10 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
      // Fast check for a junk value. A valid string may start from a
      // whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
      // the 'I' character ('Infinity'). All of that have codes not greater than
-      // '9' except 'I' and &nbsp;.
+      // '9' except 'I', NBSP and NEL.
-      if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
+      if (data[start_pos] != 'I' &&
          data[start_pos] != 0xa0 &&
          data[start_pos] != 0x85) {
        return isolate->heap()->nan_value();
      }
    } else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
@ -6541,11 +6543,6 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToUpperCase) {
 }
 // Returns true when |c| is accepted by unibrow::WhiteSpace, or is one of
 // the two extra code points 0x200b and 0xfeff.
 static inline bool IsTrimWhiteSpace(unibrow::uchar c) {
  if (unibrow::WhiteSpace::Is(c)) return true;
  return c == 0x200b || c == 0xfeff;
 }
 RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
  HandleScope scope(isolate);
  ASSERT(args.length() == 3);
@ -6558,15 +6555,17 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
  int length = string->length();
  int left = 0;
  UnicodeCache* unicode_cache = isolate->unicode_cache();
  if (trimLeft) {
-    while (left < length && IsTrimWhiteSpace(string->Get(left))) {
+    while (left < length && unicode_cache->IsWhiteSpace(string->Get(left))) {
      left++;
    }
  }
  int right = length;
  if (trimRight) {
-    while (right > left && IsTrimWhiteSpace(string->Get(right - 1))) {
+    while (right > left &&
           unicode_cache->IsWhiteSpace(string->Get(right - 1))) {
      right--;
    }
  }
--- a/src/scanner.h
+++ b/src/scanner.h
@ -144,7 +144,7 @@ class UnicodeCache {
  unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
  unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
  unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
-  unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
+  unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace;
  StaticResource<Utf8Decoder> utf8_decoder_;
  DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
--- a/src/x64/regexp-macro-assembler-x64.cc
+++ b/src/x64/regexp-macro-assembler-x64.cc
@ -552,6 +552,9 @@ bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type,
      __ j(below_equal, &success, Label::kNear);
      // \u00a0 (NBSP).
      __ cmpl(rax, Immediate(0x00a0 - '\t'));
      __ j(equal, &success, Label::kNear);
      // \u0085 (NEL).
      __ cmpl(rax, Immediate(0x0085 - '\t'));
      BranchOrBacktrack(not_equal, on_no_match);
      __ bind(&success);
      return true;
--- a/test/cctest/test-regexp.cc
+++ b/test/cctest/test-regexp.cc
@ -445,21 +445,7 @@ static bool NotDigit(uc16 c) {
 static bool IsWhiteSpace(uc16 c) {
-  switch (c) {
+  return v8::internal::WhiteSpace::Is(c);
    case 0x09:
    case 0x0A:
    case 0x0B:
    case 0x0C:
    case 0x0d:
    case 0x20:
    case 0xA0:
    case 0x2028:
    case 0x2029:
    case 0xFEFF:
      return true;
    default:
      return unibrow::Space::Is(c);
  }
 }
--- a/test/mjsunit/third_party/string-trim.js
+++ b/test/mjsunit/third_party/string-trim.js
@ -66,7 +66,8 @@ var whitespace      = [
  {s : '\u3000', t : 'IDEOGRAPHIC SPACE'},
  {s : '\u2028', t : 'LINE SEPARATOR'},
  {s : '\u2029', t : 'PARAGRAPH SEPARATOR'},
-  {s : '\u200B', t : 'ZERO WIDTH SPACE (category Cf)'}
+  // \u200B is not a whitespace character according to Unicode 6.3.0.
  // {s : '\u200B', t : 'ZERO WIDTH SPACE (category Cf)'}
 ];
 for (var i = 0; i < whitespace.length; i++) {
--- a/test/mjsunit/whitespaces.js
+++ b/test/mjsunit/whitespaces.js
@ -0,0 +1,134 @@
 // Copyright 2014 the V8 project authors. All rights reserved.
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 //       notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 //       copyright notice, this list of conditions and the following
 //       disclaimer in the documentation and/or other materials provided
 //       with the distribution.
 //     * Neither the name of Google Inc. nor the names of its
 //       contributors may be used to endorse or promote products derived
 //       from this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // Char codes that this test treats as whitespace. is_whitespace() looks the
 // first char code of its argument up in this list, so every check below
 // derives its expectation from it.
 var whitespaces = [
  // Whitespaces defined in ECMA-262 5.1, 7.2
  0x0009,  // Tab              TAB
  0x000B,  // Vertical Tab     VT
  0x000C,  // Form Feed        FF
  0x0020,  // Space            SP
  0x00A0,  // No-break space   NBSP
  0xFEFF,  // Byte Order Mark  BOM
  // Unicode whitespaces
  0x000A,  // Line Feed        LF
  0x000D,  // Carriage Return  CR
  0x0085,  // Next Line        NEL
  0x1680,  // Ogham Space Mark
  0x180E,  // Mongolian Vowel Separator
  0x2000,  // EN QUAD
  0x2001,  // EM QUAD
  0x2002,  // EN SPACE
  0x2003,  // EM SPACE
  0x2004,  // THREE-PER-EM SPACE
  0x2005,  // FOUR-PER-EM SPACE
  0x2006,  // SIX-PER-EM SPACE
  0x2007,  // FIGURE SPACE
  0x2008,  // PUNCTUATION SPACE
  0x2009,  // THIN SPACE
  0x200A,  // HAIR SPACE
  0x2028,  // LINE SEPARATOR
  0x2029,  // PARAGRAPH SEPARATOR
  0x202F,  // NARROW NO-BREAK SPACE
  0x205F,  // MEDIUM MATHEMATICAL SPACE
  0x3000,  // IDEOGRAPHIC SPACE
 ];
 // Filler characters combined with the character under test by the driver
 // loop at the bottom of this file.
 // Add single twobyte char to force twobyte representation.
 // Interestingly, snowman is not "white" space :)
 var twobyte = "\u2603";
 // One-byte counterpart of the filler above (U+007E).
 var onebyte = "\u007E";
 // Whitespace fillers passed as the postfix to test_stringtonumber:
 // U+2000 (EN QUAD, two-byte) and U+0020 (Space, one-byte).
 var twobytespace = "\u2000";
 var onebytespace = "\u0020";
 // Returns true when the first char code of |c| is listed in |whitespaces|.
 function is_whitespace(c) {
  var code = c.charCodeAt(0);
  return whitespaces.indexOf(code) !== -1;
 }
 // Checks \s and \S against a two-character string: the character under test
 // followed by a non-whitespace postfix.
 function test_regexp(str) {
  var space_hit = str.match(/\s/);
  var nonspace_hit = str.match(/\S/);
  var head = str[0];
  var tail = str[1];
  if (!is_whitespace(head)) {
    // A non-whitespace head is the first \S match and \s finds nothing.
    assertEquals(head, nonspace_hit[0]);
    assertNull(space_hit);
    return;
  }
  // A whitespace head is the first \s match; \S skips it and hits the postfix.
  assertEquals(head, space_hit[0]);
  assertEquals(tail, nonspace_hit[0]);
 }
 // Surrounds |infix| with copies of |c| and checks that trim() removes them
 // exactly when |c| is whitespace.
 function test_trim(c, infix) {
  var padded = c + c + c + infix + c;
  var expected = is_whitespace(c) ? infix : padded;
  assertEquals(expected, padded.trim());
 }
 // Checks that parseInt() skips leading copies of |c| exactly when |c| is
 // whitespace: the input is |c| twice, then "123", then |postfix|.
 function test_parseInt(c, postfix) {
  // Skip if prefix is a digit. Both bounds must be strings: comparing against
  // the number 9 converts |c| to a number, and a whitespace-only string
  // converts to 0, which would skip whitespace characters above "0" as well.
  if (c >= "0" && c <= "9") return;
  var str = c + c + "123" + postfix;
  if (is_whitespace(c)) {
    assertEquals(123, parseInt(str));
  } else {
    assertEquals(NaN, parseInt(str));
  }
 }
 // For whitespace |c| only: evaluates a single-quoted string literal padded
 // on both sides with two copies of |c| and expects |content| back.
 function test_eval(c, content) {
  if (is_whitespace(c)) {
    var pad = c + c;
    var source = pad + "'" + content + "'" + pad;
    assertEquals(content, eval(source));
  }
 }
 // Checks that Number() ignores |c| around "123" exactly when |c| is
 // whitespace: the input is |c|, "123", |c|, then |postfix|. Adding 1 to the
 // conversion makes the expected value 124.
 function test_stringtonumber(c, postfix) {
  // Skip if prefix is a digit. Both bounds must be strings: comparing against
  // the number 9 converts |c| to a number, and a whitespace-only string
  // converts to 0, which would skip whitespace characters above "0" as well.
  if (c >= "0" && c <= "9") return;
  var result = 1 + Number(c + "123" + c + postfix);
  if (is_whitespace(c)) {
    assertEquals(124, result);
  } else {
    assertEquals(NaN, result);
  }
 }
 // Run every check for each 16-bit char code (0x0000-0xFFFF), pairing the
 // character under test with both the one-byte and the two-byte filler.
 for (var i = 0; i < 0x10000; i++) {
  // Declared with var so the loop does not create an implicit global.
  var c = String.fromCharCode(i);
  test_regexp(c + onebyte);
  test_regexp(c + twobyte);
  test_trim(c, onebyte + "trim");
  test_trim(c, twobyte + "trim");
  test_parseInt(c, onebyte);
  test_parseInt(c, twobyte);
  test_eval(c, onebyte);
  test_eval(c, twobyte);
  test_stringtonumber(c, onebytespace);
  test_stringtonumber(c, twobytespace);
 }