[regexp] fix /\W/ui wrt \u017f and \u212a.

R=jgruber@chromium.org BUG=v8:5080 Review-Url: https://codereview.chromium.org/2725583002 Cr-Commit-Position: refs/heads/master@{#43496}
2017-02-28 10:28:42 -08:00 · 2017-02-28 10:28:42 -08:00 · a813525a07
commit a813525a07
parent 0db5bc23e9
5 changed files with 138 additions and 26 deletions
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@ -1019,6 +1019,11 @@ class RegExpCompiler {
  // True if the JSRegExp::kIgnoreCase bit is set in flags_.
  inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
  // True if the JSRegExp::kUnicode bit is set in flags_.
  inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
  // True only when the unicode and ignore_case flags are set together. In
  // that combination the closure over case equivalents has to be computed
  // with ICU.
  inline bool needs_unicode_case_equivalents() {
    if (!unicode()) return false;
    return ignore_case();
  }
  // Returns the stored one_byte_ value.
  inline bool one_byte() { return one_byte_; }
  // Returns the stored optimize_ value.
  inline bool optimize() { return optimize_; }
  // Overwrites optimize_ with |value|.
  inline void set_optimize(bool value) { optimize_ = value; }
@ -5108,13 +5113,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
 }
-
+void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
                               ZoneList<CharacterRange>* ranges) {
 #ifdef V8_I18N_SUPPORT
  // Use ICU to compute the case fold closure over the ranges.
  DCHECK(compiler->unicode());
  DCHECK(compiler->ignore_case());
  icu::UnicodeSet set;
  for (int i = 0; i < ranges->length(); i++) {
    set.add(ranges->at(i).from(), ranges->at(i).to());
@ -5125,18 +5126,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
  // Those are represented as strings in the set. Remove them so that
  // we end up with only simple and common case mappings.
  set.removeAllStrings();
  Zone* zone = compiler->zone();
  for (int i = 0; i < set.getRangeCount(); i++) {
    ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
                zone);
  }
  // No errors, and everything we collected has been ranges.
 #else
  // Fallback if ICU is not included.
  CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
                                     ranges, compiler->one_byte());
 #endif  // V8_I18N_SUPPORT
  CharacterRange::Canonicalize(ranges);
 #endif  // V8_I18N_SUPPORT
 }
@ -5145,8 +5141,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
  set_.Canonicalize();
  Zone* zone = compiler->zone();
  ZoneList<CharacterRange>* ranges = this->ranges(zone);
-  if (compiler->unicode() && compiler->ignore_case()) {
+  if (compiler->needs_unicode_case_equivalents()) {
-    AddUnicodeCaseEquivalents(compiler, ranges);
+    AddUnicodeCaseEquivalents(ranges, zone);
  }
  if (compiler->unicode() && !compiler->one_byte()) {
    if (is_negated()) {
@ -5619,6 +5615,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
  }
 }
 namespace {
 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
 //
 // Builds the node graph for a word-boundary (\b) or non-boundary (\B)
 // assertion out of a lookbehind and a lookahead over the word-character set.
 // Only valid when both the unicode and ignore_case flags are set (see the
 // DCHECK): the word set used here is the one closed over Unicode case
 // equivalents.
 //
 // NOTE(review): the "\W" sides are not built from a negated range list; the
 // same word_range is reused and only the boolean passed as the first Builder
 // argument changes. Presumably that flag selects a positive vs. negative
 // look-around, which would make the non-word side behave like (?<!\w) /
 // (?!\w) and therefore also succeed at the start/end of input - confirm
 // against RegExpLookaround::Builder.
 //
 // compiler:   supplies the zone and the two look-around registers.
 // on_success: continuation reached once the assertion holds.
 // type:       RegExpAssertion::BOUNDARY or RegExpAssertion::NON_BOUNDARY.
 // Returns a ChoiceNode with two alternatives (word on the left / no word on
 // the left).
 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
                                          RegExpNode* on_success,
                                          RegExpAssertion::AssertionType type) {
  DCHECK(compiler->needs_unicode_case_equivalents());
  Zone* zone = compiler->zone();
  // \w ranges with the case-equivalent closure requested (third argument).
  ZoneList<CharacterRange>* word_range =
      new (zone) ZoneList<CharacterRange>(2, zone);
  CharacterRange::AddClassEscape('w', word_range, true, zone);
  // Both look-arounds below are given the same pair of registers.
  int stack_register = compiler->UnicodeLookaroundStackRegister();
  int position_register = compiler->UnicodeLookaroundPositionRegister();
  ChoiceNode* result = new (zone) ChoiceNode(2, zone);
  // Add two choices. The (non-)boundary could start with a word or
  // a non-word-character.
  for (int i = 0; i < 2; i++) {
    // First iteration: a word character is expected on the left.
    bool lookbehind_for_word = i == 0;
    // For a boundary the two sides must differ, for a non-boundary they must
    // agree; the XOR yields exactly that.
    bool lookahead_for_word =
        (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
    // Look to the left.
    RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
                                         stack_register, position_register);
    // Third argument is true here and false for the lookahead below;
    // presumably it selects reading backwards - confirm.
    RegExpNode* backward = TextNode::CreateForCharacterRanges(
        zone, word_range, true, lookbehind.on_match_success());
    // Look to the right.
    // The finished lookbehind is passed in the same argument position where
    // the lookbehind received on_success, so the lookahead chains into the
    // lookbehind, which in turn chains into on_success.
    RegExpLookaround::Builder lookahead(lookahead_for_word,
                                        lookbehind.ForMatch(backward),
                                        stack_register, position_register);
    RegExpNode* forward = TextNode::CreateForCharacterRanges(
        zone, word_range, false, lookahead.on_match_success());
    result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
  }
  return result;
 }
 }  // anonymous namespace
 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
                                    RegExpNode* on_success) {
@ -5631,9 +5663,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
    case START_OF_INPUT:
      return AssertionNode::AtStart(on_success);
    case BOUNDARY:
-      return AssertionNode::AtBoundary(on_success);
+      return compiler->needs_unicode_case_equivalents()
                 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
                 : AssertionNode::AtBoundary(on_success);
    case NON_BOUNDARY:
-      return AssertionNode::AtNonBoundary(on_success);
+      return compiler->needs_unicode_case_equivalents()
                 ? BoundaryAssertionAsLookaround(compiler, on_success,
                                                 NON_BOUNDARY)
                 : AssertionNode::AtNonBoundary(on_success);
    case END_OF_INPUT:
      return AssertionNode::AtEnd(on_success);
    case END_OF_LINE: {
@ -5647,7 +5684,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
      // Create a newline atom.
      ZoneList<CharacterRange>* newline_ranges =
          new(zone) ZoneList<CharacterRange>(3, zone);
-      CharacterRange::AddClassEscape('n', newline_ranges, zone);
+      CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
      RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
      TextNode* newline_matcher = new (zone) TextNode(
          newline_atom, false, ActionNode::PositiveSubmatchSuccess(
@ -5821,9 +5858,30 @@ static void AddClassNegated(const int *elmv,
  ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
 }
 void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                                    bool add_unicode_case_equivalents,
                                    Zone* zone) {
  // Only \w and \W are affected by the case-equivalent closure. Any other
  // escape, or a caller that did not ask for the closure, is served by the
  // three-argument overload.
  const bool is_word_escape = (type == 'w' || type == 'W');
  if (!add_unicode_case_equivalents || !is_word_escape) {
    AddClassEscape(type, ranges, zone);
    return;
  }
  // See #sec-runtime-semantics-wordcharacters-abstract-operation
  // With unicode and ignore_case the word-character set has to be closed over
  // case-equivalent characters first; negation for \W is applied to the
  // closed set, not to the plain one.
  ZoneList<CharacterRange>* word_closure =
      new (zone) ZoneList<CharacterRange>(2, zone);
  AddClass(kWordRanges, kWordRangeCount, word_closure, zone);
  AddUnicodeCaseEquivalents(word_closure, zone);
  if (type == 'w') {
    ranges->AddAll(*word_closure, zone);
    return;
  }
  // type == 'W': append the complement of the closed word set.
  ZoneList<CharacterRange>* complement =
      new (zone) ZoneList<CharacterRange>(2, zone);
  CharacterRange::Negate(word_closure, complement, zone);
  ranges->AddAll(*complement, zone);
 }
-void CharacterRange::AddClassEscape(uc16 type,
+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                                    ZoneList<CharacterRange>* ranges,
                                    Zone* zone) {
  switch (type) {
    case 's':
@ -5965,7 +6023,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
 // Returns the character ranges of this set, building them on first use:
 // while ranges_ is still NULL, a fresh list is allocated in |zone| and filled
 // from the class escape stored in standard_set_type_. Later calls return the
 // same list unchanged.
 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
  if (ranges_ == NULL) {
    ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
-    CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
+    CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
  }
  return ranges_;
 }
--- a/src/regexp/regexp-ast.h
+++ b/src/regexp/regexp-ast.h
@ -81,6 +81,9 @@ class CharacterRange {
  CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
  static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                             Zone* zone);
  // Adds the ranges for the class escape |type| to |ranges|. When
  // |add_unicode_case_equivalents| is true and |type| is 'w' or 'W', the word
  // character set is first closed over Unicode case equivalents (and, for
  // 'W', negated afterwards). Every other combination behaves exactly like
  // the three-argument overload.
  static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                             bool add_unicode_case_equivalents, Zone* zone);
  static Vector<const int> GetWordBounds();
  static inline CharacterRange Singleton(uc32 value) {
    return CharacterRange(value, value);
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -270,7 +270,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
        // everything except \x0a, \x0d, \u2028 and \u2029
        ZoneList<CharacterRange>* ranges =
            new (zone()) ZoneList<CharacterRange>(2, zone());
-        CharacterRange::AddClassEscape('.', ranges, zone());
+        CharacterRange::AddClassEscape('.', ranges, false, zone());
        RegExpCharacterClass* cc =
            new (zone()) RegExpCharacterClass(ranges, false);
        builder->AddCharacterClass(cc);
@ -377,7 +377,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
            Advance(2);
            ZoneList<CharacterRange>* ranges =
                new (zone()) ZoneList<CharacterRange>(2, zone());
-            CharacterRange::AddClassEscape(c, ranges, zone());
+            CharacterRange::AddClassEscape(c, ranges,
                                           unicode() && ignore_case(), zone());
            RegExpCharacterClass* cc =
                new (zone()) RegExpCharacterClass(ranges, false);
            builder->AddCharacterClass(cc);
@ -1389,9 +1390,11 @@ static const uc16 kNoCharClass = 0;
 // escape (i.e., 's' means whitespace, from '\s').
 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
                                    uc16 char_class, CharacterRange range,
                                    bool add_unicode_case_equivalents,
                                    Zone* zone) {
  if (char_class != kNoCharClass) {
-    CharacterRange::AddClassEscape(char_class, ranges, zone);
+    CharacterRange::AddClassEscape(char_class, ranges,
                                   add_unicode_case_equivalents, zone);
  } else {
    ranges->Add(range, zone);
  }
@ -1431,6 +1434,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
  }
  ZoneList<CharacterRange>* ranges =
      new (zone()) ZoneList<CharacterRange>(2, zone());
  bool add_unicode_case_equivalents = unicode() && ignore_case();
  while (has_more() && current() != ']') {
    bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
    if (parsed_property) continue;
@ -1443,7 +1447,8 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
        // following code report an error.
        break;
      } else if (current() == ']') {
-        AddRangeOrEscape(ranges, char_class, first, zone());
+        AddRangeOrEscape(ranges, char_class, first,
                         add_unicode_case_equivalents, zone());
        ranges->Add(CharacterRange::Singleton('-'), zone());
        break;
      }
@ -1455,9 +1460,11 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
          // ES2015 21.2.2.15.1 step 1.
          return ReportError(CStrVector(kRangeInvalid));
        }
-        AddRangeOrEscape(ranges, char_class, first, zone());
+        AddRangeOrEscape(ranges, char_class, first,
                         add_unicode_case_equivalents, zone());
        ranges->Add(CharacterRange::Singleton('-'), zone());
-        AddRangeOrEscape(ranges, char_class_2, next, zone());
+        AddRangeOrEscape(ranges, char_class_2, next,
                         add_unicode_case_equivalents, zone());
        continue;
      }
      // ES2015 21.2.2.15.1 step 6.
@ -1466,7 +1473,8 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
      }
      ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
    } else {
-      AddRangeOrEscape(ranges, char_class, first, zone());
+      AddRangeOrEscape(ranges, char_class, first, add_unicode_case_equivalents,
                       zone());
    }
  }
  if (!has_more()) {
--- a/test/mjsunit/es7/regexp-ui-word.js
+++ b/test/mjsunit/es7/regexp-ui-word.js
@ -0,0 +1,42 @@
 // Copyright 2017 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 // U+017F (LATIN SMALL LETTER LONG S) and U+212A (KELVIN SIGN) are non-ASCII
 // characters that are case-insensitive equivalents of 's' and 'k'. With both
 // the i and u flags they have to be treated as word characters.

 // \w and \W used as stand-alone class escapes.
 assertTrue(/\w/iu.test('\u017F'));
 assertTrue(/\w/iu.test('\u212A'));
 assertFalse(/\W/iu.test('\u017F'));
 assertFalse(/\W/iu.test('\u212A'));
 assertFalse(/\W/iu.test('s'));
 assertFalse(/\W/iu.test('S'));
 assertFalse(/\W/iu.test('K'));
 assertFalse(/\W/iu.test('k'));
 // The same expectations when the escape appears inside a character class.
 assertTrue(/[\w]/iu.test('\u017F'));
 assertTrue(/[\w]/iu.test('\u212A'));
 assertFalse(/[\W]/iu.test('\u017F'));
 assertFalse(/[\W]/iu.test('\u212A'));
 assertFalse(/[\W]/iu.test('s'));
 assertFalse(/[\W]/iu.test('S'));
 assertFalse(/[\W]/iu.test('K'));
 assertFalse(/[\W]/iu.test('k'));
 // Word boundaries: a one-character string made of a word character has a
 // boundary on each side and no position where \B holds.
 assertTrue(/\b/iu.test('\u017F'));
 assertTrue(/\b/iu.test('\u212A'));
 assertTrue(/\b/iu.test('s'));
 assertTrue(/\b/iu.test('S'));
 assertFalse(/\B/iu.test('\u017F'));
 assertFalse(/\B/iu.test('\u212A'));
 assertFalse(/\B/iu.test('s'));
 assertFalse(/\B/iu.test('S'));
 assertFalse(/\B/iu.test('K'));
 assertFalse(/\B/iu.test('k'));
 // Contrast /i with /iu. Without the u flag the two characters do not count
 // as word characters, so the first boundary is found right after 'd'; with
 // the u flag they do, so the boundary moves behind them.
 assertEquals(["abcd", "d"], /a.*?(.)\b/i.exec('abcd\u017F cd'));
 assertEquals(["abcd", "d"], /a.*?(.)\b/i.exec('abcd\u212A cd'));
 assertEquals(["abcd\u017F", "\u017F"], /a.*?(.)\b/iu.exec('abcd\u017F cd'));
 assertEquals(["abcd\u212A", "\u212A"], /a.*?(.)\b/iu.exec('abcd\u212A cd'));
 // Likewise for \B: without u the first non-boundary lies between the
 // character and the following space; with u it lies between 'a' and the
 // character itself.
 assertEquals(["a\u017F ", " "], /a.*?\B(.)/i.exec('a\u017F '));
 assertEquals(["a\u212A ", " "], /a.*?\B(.)/i.exec('a\u212A '));
 assertEquals(["a\u017F", "\u017F"], /a.*?\B(.)/iu.exec('a\u017F '));
 assertEquals(["a\u212A", "\u212A"], /a.*?\B(.)/iu.exec('a\u212A '));
--- a/test/mjsunit/mjsunit.status
+++ b/test/mjsunit/mjsunit.status
@ -156,6 +156,7 @@
  'es6/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]],
  'es6/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]],
  'regress/regress-5036': [PASS, ['no_i18n == True', FAIL]],
  'es7/regexp-ui-word': [PASS, ['no_i18n == True', FAIL]],
  # desugaring regexp property class relies on ICU.
  'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]],