[regexp] fix /\W/ui wrt \u017f and \u212a.

R=jgruber@chromium.org
BUG=v8:5080

Review-Url: https://codereview.chromium.org/2725583002
Cr-Commit-Position: refs/heads/master@{#43496}
This commit is contained in:
yangguo 2017-02-28 10:28:42 -08:00 committed by Commit bot
parent 0db5bc23e9
commit a813525a07
5 changed files with 138 additions and 26 deletions

View File

@ -1019,6 +1019,11 @@ class RegExpCompiler {
inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
// Reports whether this regexp was compiled with both the unicode and the
// ignore_case flag. When that is the case, the closure over case equivalents
// has to be computed with ICU.
inline bool needs_unicode_case_equivalents() {
  if (!unicode()) return false;
  return ignore_case();
}
inline bool one_byte() { return one_byte_; } inline bool one_byte() { return one_byte_; }
inline bool optimize() { return optimize_; } inline bool optimize() { return optimize_; }
inline void set_optimize(bool value) { optimize_ = value; } inline void set_optimize(bool value) { optimize_ = value; }
@ -5108,13 +5113,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
return TextNode::CreateForCharacterRanges(zone, range, false, on_success); return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
} }
void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
ZoneList<CharacterRange>* ranges) {
#ifdef V8_I18N_SUPPORT #ifdef V8_I18N_SUPPORT
// Use ICU to compute the case fold closure over the ranges. // Use ICU to compute the case fold closure over the ranges.
DCHECK(compiler->unicode());
DCHECK(compiler->ignore_case());
icu::UnicodeSet set; icu::UnicodeSet set;
for (int i = 0; i < ranges->length(); i++) { for (int i = 0; i < ranges->length(); i++) {
set.add(ranges->at(i).from(), ranges->at(i).to()); set.add(ranges->at(i).from(), ranges->at(i).to());
@ -5125,18 +5126,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
// Those are represented as strings in the set. Remove them so that // Those are represented as strings in the set. Remove them so that
// we end up with only simple and common case mappings. // we end up with only simple and common case mappings.
set.removeAllStrings(); set.removeAllStrings();
Zone* zone = compiler->zone();
for (int i = 0; i < set.getRangeCount(); i++) { for (int i = 0; i < set.getRangeCount(); i++) {
ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
zone); zone);
} }
// No errors and everything we collected have been ranges. // No errors and everything we collected have been ranges.
#else
// Fallback if ICU is not included.
CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
ranges, compiler->one_byte());
#endif // V8_I18N_SUPPORT
CharacterRange::Canonicalize(ranges); CharacterRange::Canonicalize(ranges);
#endif // V8_I18N_SUPPORT
} }
@ -5145,8 +5141,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
set_.Canonicalize(); set_.Canonicalize();
Zone* zone = compiler->zone(); Zone* zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone); ZoneList<CharacterRange>* ranges = this->ranges(zone);
if (compiler->unicode() && compiler->ignore_case()) { if (compiler->needs_unicode_case_equivalents()) {
AddUnicodeCaseEquivalents(compiler, ranges); AddUnicodeCaseEquivalents(ranges, zone);
} }
if (compiler->unicode() && !compiler->one_byte()) { if (compiler->unicode() && !compiler->one_byte()) {
if (is_negated()) { if (is_negated()) {
@ -5619,6 +5615,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
} }
} }
namespace {
// Builds the node graph for a \b or \B assertion out of lookarounds instead
// of a dedicated assertion node.
//
// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
//
// compiler:   supplies the zone and the lookaround registers; it is expected
//             to have both the unicode and ignore_case flags set (DCHECKed).
// on_success: node to continue with once the assertion holds.
// type:       RegExpAssertion::BOUNDARY for \b; any other value is treated
//             as the non-boundary case (callers pass NON_BOUNDARY for \B).
// Returns a two-way ChoiceNode, one alternative per disjunct above.
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpAssertion::AssertionType type) {
DCHECK(compiler->needs_unicode_case_equivalents());
Zone* zone = compiler->zone();
// \w ranges with Unicode case equivalents requested (third argument true).
// The same list is shared by every lookaround created below.
ZoneList<CharacterRange>* word_range =
new (zone) ZoneList<CharacterRange>(2, zone);
CharacterRange::AddClassEscape('w', word_range, true, zone);
// One stack/position register pair is fetched from the compiler and passed
// to all four lookarounds.
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
// Add two choices. The (non-)boundary could start with a word or
// a non-word-character.
for (int i = 0; i < 2; i++) {
// First alternative: a word character to the left; second: a non-word one.
bool lookbehind_for_word = i == 0;
// For \b the right side must differ from the left side (XOR yields the
// opposite); for \B it must be the same kind.
bool lookahead_for_word =
(type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
// Look to the left.
// NOTE(review): the Builder's first argument presumably selects a positive
// (true) versus negative (false) lookaround, which is how \W is expressed
// with the \w range list -- confirm against RegExpLookaround::Builder.
RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
stack_register, position_register);
// NOTE(review): the third argument (true here, false below) presumably
// requests reading backward -- confirm against
// TextNode::CreateForCharacterRanges.
RegExpNode* backward = TextNode::CreateForCharacterRanges(
zone, word_range, true, lookbehind.on_match_success());
// Look to the right.
// The lookahead's continuation is the completed lookbehind, which in turn
// continues with on_success; the lookahead is therefore the entry node of
// this alternative.
RegExpLookaround::Builder lookahead(lookahead_for_word,
lookbehind.ForMatch(backward),
stack_register, position_register);
RegExpNode* forward = TextNode::CreateForCharacterRanges(
zone, word_range, false, lookahead.on_match_success());
result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
}
return result;
}
} // anonymous namespace
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) { RegExpNode* on_success) {
@ -5631,9 +5663,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
case START_OF_INPUT: case START_OF_INPUT:
return AssertionNode::AtStart(on_success); return AssertionNode::AtStart(on_success);
case BOUNDARY: case BOUNDARY:
return AssertionNode::AtBoundary(on_success); return compiler->needs_unicode_case_equivalents()
? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
: AssertionNode::AtBoundary(on_success);
case NON_BOUNDARY: case NON_BOUNDARY:
return AssertionNode::AtNonBoundary(on_success); return compiler->needs_unicode_case_equivalents()
? BoundaryAssertionAsLookaround(compiler, on_success,
NON_BOUNDARY)
: AssertionNode::AtNonBoundary(on_success);
case END_OF_INPUT: case END_OF_INPUT:
return AssertionNode::AtEnd(on_success); return AssertionNode::AtEnd(on_success);
case END_OF_LINE: { case END_OF_LINE: {
@ -5647,7 +5684,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
// Create a newline atom. // Create a newline atom.
ZoneList<CharacterRange>* newline_ranges = ZoneList<CharacterRange>* newline_ranges =
new(zone) ZoneList<CharacterRange>(3, zone); new(zone) ZoneList<CharacterRange>(3, zone);
CharacterRange::AddClassEscape('n', newline_ranges, zone); CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
TextNode* newline_matcher = new (zone) TextNode( TextNode* newline_matcher = new (zone) TextNode(
newline_atom, false, ActionNode::PositiveSubmatchSuccess( newline_atom, false, ActionNode::PositiveSubmatchSuccess(
@ -5821,9 +5858,30 @@ static void AddClassNegated(const int *elmv,
ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
} }
// Appends the ranges of the class escape |type| to |ranges|.
//
// When |add_unicode_case_equivalents| is set and the escape is \w or \W, the
// word-character set is first closed over its Unicode case equivalents and
// only then (for \W) negated; see
// #sec-runtime-semantics-wordcharacters-abstract-operation. Every other
// combination is forwarded unchanged to the three-argument overload.
void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                                    bool add_unicode_case_equivalents,
                                    Zone* zone) {
  const bool is_word_escape = type == 'w' || type == 'W';
  if (!add_unicode_case_equivalents || !is_word_escape) {
    AddClassEscape(type, ranges, zone);
    return;
  }

  // Start from the plain word ranges and extend them with the case-equivalent
  // characters. This has to happen before any negation.
  ZoneList<CharacterRange>* word_closure =
      new (zone) ZoneList<CharacterRange>(2, zone);
  AddClass(kWordRanges, kWordRangeCount, word_closure, zone);
  AddUnicodeCaseEquivalents(word_closure, zone);

  if (type == 'w') {
    ranges->AddAll(*word_closure, zone);
    return;
  }

  // \W: the complement of the case-closed word set.
  ZoneList<CharacterRange>* complement =
      new (zone) ZoneList<CharacterRange>(2, zone);
  CharacterRange::Negate(word_closure, complement, zone);
  ranges->AddAll(*complement, zone);
}
void CharacterRange::AddClassEscape(uc16 type, void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
ZoneList<CharacterRange>* ranges,
Zone* zone) { Zone* zone) {
switch (type) { switch (type) {
case 's': case 's':
@ -5965,7 +6023,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
if (ranges_ == NULL) { if (ranges_ == NULL) {
ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
} }
return ranges_; return ranges_;
} }

View File

@ -81,6 +81,9 @@ class CharacterRange {
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
Zone* zone); Zone* zone);
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
bool add_unicode_case_equivalents, Zone* zone);
static Vector<const int> GetWordBounds(); static Vector<const int> GetWordBounds();
static inline CharacterRange Singleton(uc32 value) { static inline CharacterRange Singleton(uc32 value) {
return CharacterRange(value, value); return CharacterRange(value, value);

View File

@ -270,7 +270,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// everything except \x0a, \x0d, \u2028 and \u2029 // everything except \x0a, \x0d, \u2028 and \u2029
ZoneList<CharacterRange>* ranges = ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone()); new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape('.', ranges, zone()); CharacterRange::AddClassEscape('.', ranges, false, zone());
RegExpCharacterClass* cc = RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, false); new (zone()) RegExpCharacterClass(ranges, false);
builder->AddCharacterClass(cc); builder->AddCharacterClass(cc);
@ -377,7 +377,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2); Advance(2);
ZoneList<CharacterRange>* ranges = ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone()); new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges, zone()); CharacterRange::AddClassEscape(c, ranges,
unicode() && ignore_case(), zone());
RegExpCharacterClass* cc = RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, false); new (zone()) RegExpCharacterClass(ranges, false);
builder->AddCharacterClass(cc); builder->AddCharacterClass(cc);
@ -1389,9 +1390,11 @@ static const uc16 kNoCharClass = 0;
// escape (i.e., 's' means whitespace, from '\s'). // escape (i.e., 's' means whitespace, from '\s').
static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
uc16 char_class, CharacterRange range, uc16 char_class, CharacterRange range,
bool add_unicode_case_equivalents,
Zone* zone) { Zone* zone) {
if (char_class != kNoCharClass) { if (char_class != kNoCharClass) {
CharacterRange::AddClassEscape(char_class, ranges, zone); CharacterRange::AddClassEscape(char_class, ranges,
add_unicode_case_equivalents, zone);
} else { } else {
ranges->Add(range, zone); ranges->Add(range, zone);
} }
@ -1431,6 +1434,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
} }
ZoneList<CharacterRange>* ranges = ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone()); new (zone()) ZoneList<CharacterRange>(2, zone());
bool add_unicode_case_equivalents = unicode() && ignore_case();
while (has_more() && current() != ']') { while (has_more() && current() != ']') {
bool parsed_property = ParseClassProperty(ranges CHECK_FAILED); bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
if (parsed_property) continue; if (parsed_property) continue;
@ -1443,7 +1447,8 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
// following code report an error. // following code report an error.
break; break;
} else if (current() == ']') { } else if (current() == ']') {
AddRangeOrEscape(ranges, char_class, first, zone()); AddRangeOrEscape(ranges, char_class, first,
add_unicode_case_equivalents, zone());
ranges->Add(CharacterRange::Singleton('-'), zone()); ranges->Add(CharacterRange::Singleton('-'), zone());
break; break;
} }
@ -1455,9 +1460,11 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
// ES2015 21.2.2.15.1 step 1. // ES2015 21.2.2.15.1 step 1.
return ReportError(CStrVector(kRangeInvalid)); return ReportError(CStrVector(kRangeInvalid));
} }
AddRangeOrEscape(ranges, char_class, first, zone()); AddRangeOrEscape(ranges, char_class, first,
add_unicode_case_equivalents, zone());
ranges->Add(CharacterRange::Singleton('-'), zone()); ranges->Add(CharacterRange::Singleton('-'), zone());
AddRangeOrEscape(ranges, char_class_2, next, zone()); AddRangeOrEscape(ranges, char_class_2, next,
add_unicode_case_equivalents, zone());
continue; continue;
} }
// ES2015 21.2.2.15.1 step 6. // ES2015 21.2.2.15.1 step 6.
@ -1466,7 +1473,8 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
} }
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
} else { } else {
AddRangeOrEscape(ranges, char_class, first, zone()); AddRangeOrEscape(ranges, char_class, first, add_unicode_case_equivalents,
zone());
} }
} }
if (!has_more()) { if (!has_more()) {

View File

@ -0,0 +1,42 @@
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Checks how \w, \W, \b and \B treat U+017F (LATIN SMALL LETTER LONG S) and
// U+212A (KELVIN SIGN) when both the ignoreCase and unicode flags are set.

// \w / \W as stand-alone escapes: with /iu both code points are word
// characters, and the ASCII letters s, S, K, k must remain word characters.
assertTrue(/\w/iu.test('\u017F'));
assertTrue(/\w/iu.test('\u212A'));
assertFalse(/\W/iu.test('\u017F'));
assertFalse(/\W/iu.test('\u212A'));
assertFalse(/\W/iu.test('s'));
assertFalse(/\W/iu.test('S'));
assertFalse(/\W/iu.test('K'));
assertFalse(/\W/iu.test('k'));
// The same expectations when the escapes appear inside a character class.
assertTrue(/[\w]/iu.test('\u017F'));
assertTrue(/[\w]/iu.test('\u212A'));
assertFalse(/[\W]/iu.test('\u017F'));
assertFalse(/[\W]/iu.test('\u212A'));
assertFalse(/[\W]/iu.test('s'));
assertFalse(/[\W]/iu.test('S'));
assertFalse(/[\W]/iu.test('K'));
assertFalse(/[\W]/iu.test('k'));
// \b: a single word character has a word boundary on either side.
assertTrue(/\b/iu.test('\u017F'));
assertTrue(/\b/iu.test('\u212A'));
assertTrue(/\b/iu.test('s'));
assertTrue(/\b/iu.test('S'));
// \B: a single word character offers no non-boundary position.
assertFalse(/\B/iu.test('\u017F'));
assertFalse(/\B/iu.test('\u212A'));
assertFalse(/\B/iu.test('s'));
assertFalse(/\B/iu.test('S'));
assertFalse(/\B/iu.test('K'));
assertFalse(/\B/iu.test('k'));
// \b inside a longer match. Without the u flag the special code point is a
// non-word character, so the first boundary follows 'd'; with /iu it is a
// word character and the boundary moves behind it.
assertEquals(["abcd", "d"], /a.*?(.)\b/i.exec('abcd\u017F cd'));
assertEquals(["abcd", "d"], /a.*?(.)\b/i.exec('abcd\u212A cd'));
assertEquals(["abcd\u017F", "\u017F"], /a.*?(.)\b/iu.exec('abcd\u017F cd'));
assertEquals(["abcd\u212A", "\u212A"], /a.*?(.)\b/iu.exec('abcd\u212A cd'));
// \B inside a longer match. Without the u flag the first non-boundary lies
// between the special code point and the space; with /iu it lies between 'a'
// and the special code point.
assertEquals(["a\u017F ", " "], /a.*?\B(.)/i.exec('a\u017F '));
assertEquals(["a\u212A ", " "], /a.*?\B(.)/i.exec('a\u212A '));
assertEquals(["a\u017F", "\u017F"], /a.*?\B(.)/iu.exec('a\u017F '));
assertEquals(["a\u212A", "\u212A"], /a.*?\B(.)/iu.exec('a\u212A '));

View File

@ -156,6 +156,7 @@
'es6/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]], 'es6/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]],
'es6/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]], 'es6/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]],
'regress/regress-5036': [PASS, ['no_i18n == True', FAIL]], 'regress/regress-5036': [PASS, ['no_i18n == True', FAIL]],
'es7/regexp-ui-word': [PASS, ['no_i18n == True', FAIL]],
# desugaring regexp property class relies on ICU. # desugaring regexp property class relies on ICU.
'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]], 'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]],