From 3246d26b71d0c4d86705e3994b0c328c846a3fd4 Mon Sep 17 00:00:00 2001 From: yangguo Date: Mon, 25 Jan 2016 02:45:57 -0800 Subject: [PATCH] [regexp] step back if starting unicode regexp within surrogate pair. See https://github.com/tc39/ecma262/issues/128 R=erik.corry@gmail.com, littledan@chromium.org BUG=v8:2952 LOG=N Review URL: https://codereview.chromium.org/1608693003 Cr-Commit-Position: refs/heads/master@{#33488} --- src/regexp/jsregexp.cc | 140 +++++++++++++----- src/regexp/regexp-ast.h | 2 +- .../harmony/unicode-regexp-last-index.js | 104 +++++++++++++ 3 files changed, 210 insertions(+), 36 deletions(-) create mode 100644 test/mjsunit/harmony/unicode-regexp-last-index.js diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc index 3559bcd111..6235c25c77 100644 --- a/src/regexp/jsregexp.cc +++ b/src/regexp/jsregexp.cc @@ -3957,6 +3957,11 @@ void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { int choice_count = alternatives_->length(); + if (choice_count == 1 && alternatives_->at(0).guards() == NULL) { + alternatives_->at(0).node()->Emit(compiler, trace); + return; + } + AssertGuardsMentionRegisters(trace); LimitResult limit_result = LimitVersions(compiler, trace); @@ -5040,22 +5045,21 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, if (lead_surrogates == nullptr) return; Zone* zone = compiler->zone(); // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). - ZoneList<CharacterRange>* trail_surrogates = - new (zone) ZoneList<CharacterRange>(1, zone); - trail_surrogates->Add( - CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), zone); + ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); - RegExpNode* match = - compiler->read_backward() - // Reading backward. Assert that reading forward, there is no trail - // surrogate, and then backward match the lead surrogate. - ? 
NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, trail_surrogates, lead_surrogates, on_success, true) - // Reading forward. Forwrad match the lead surrogate and assert that - // no - // trail surrogate follows. - : MatchAndNegativeLookaroundInReadDirection( - compiler, lead_surrogates, trail_surrogates, on_success, false); + RegExpNode* match; + if (compiler->read_backward()) { + // Reading backward. Assert that reading forward, there is no trail + // surrogate, and then backward match the lead surrogate. + match = NegativeLookaroundAgainstReadDirectionAndMatch( + compiler, trail_surrogates, lead_surrogates, on_success, true); + } else { + // Reading forward. Forward match the lead surrogate and assert that + // no trail surrogate follows. + match = MatchAndNegativeLookaroundInReadDirection( + compiler, lead_surrogates, trail_surrogates, on_success, false); + } result->AddAlternative(GuardedAlternative(match)); } @@ -5067,25 +5071,55 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, if (trail_surrogates == nullptr) return; Zone* zone = compiler->zone(); // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 - ZoneList<CharacterRange>* lead_surrogates = - new (zone) ZoneList<CharacterRange>(1, zone); - lead_surrogates->Add( - CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), zone); + ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); - RegExpNode* match = - compiler->read_backward() - // Reading backward. Backward match the trail surrogate and assert - // that no lead surrogate precedes it. - ? MatchAndNegativeLookaroundInReadDirection( - compiler, trail_surrogates, lead_surrogates, on_success, true) - // Reading forward. Assert that reading backward, there is no lead - // surrogate, and then forward match the trail surrogate. 
- : NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, lead_surrogates, trail_surrogates, on_success, false); + RegExpNode* match; + if (compiler->read_backward()) { + // Reading backward. Backward match the trail surrogate and assert that no + // lead surrogate precedes it. + match = MatchAndNegativeLookaroundInReadDirection( + compiler, trail_surrogates, lead_surrogates, on_success, true); + } else { + // Reading forward. Assert that reading backward, there is no lead + // surrogate, and then forward match the trail surrogate. + match = NegativeLookaroundAgainstReadDirectionAndMatch( + compiler, lead_surrogates, trail_surrogates, on_success, false); + } result->AddAlternative(GuardedAlternative(match)); } +void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result, + RegExpNode* on_success) { + // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. + DCHECK(!compiler->read_backward()); + Zone* zone = compiler->zone(); + // Advancing can either consume a BMP character or a trail surrogate. + ZoneList<CharacterRange>* bmp_and_trail = + new (zone) ZoneList<CharacterRange>(2, zone); + bmp_and_trail->Add(CharacterRange::Range(0, kLeadSurrogateStart - 1), zone); + bmp_and_trail->Add( + CharacterRange::Range(kLeadSurrogateEnd + 1, kNonBmpStart - 1), zone); + result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( + zone, bmp_and_trail, false, on_success))); + + // Or it could consume a lead optionally followed by a trail surrogate. 
+ ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); + ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); + ChoiceNode* optional_trail = new (zone) ChoiceNode(2, zone); + optional_trail->AddAlternative( + GuardedAlternative(TextNode::CreateForCharacterRanges( + zone, trail_surrogates, false, on_success))); + optional_trail->AddAlternative(GuardedAlternative(on_success)); + RegExpNode* optional_pair = TextNode::CreateForCharacterRanges( + zone, lead_surrogates, false, optional_trail); + result->AddAlternative(GuardedAlternative(optional_pair)); +} + + RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { set_.Canonicalize(); @@ -5102,12 +5136,16 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, // No matches possible. return new (zone) EndNode(EndNode::BACKTRACK, zone); } - UnicodeRangeSplitter splitter(zone, ranges); - ChoiceNode* result = new (compiler->zone()) ChoiceNode(2, compiler->zone()); - AddBmpCharacters(compiler, result, on_success, &splitter); - AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); - AddLoneLeadSurrogates(compiler, result, on_success, &splitter); - AddLoneTrailSurrogates(compiler, result, on_success, &splitter); + ChoiceNode* result = new (zone) ChoiceNode(2, zone); + if (standard_type() == '*') { + AddUnanchoredAdvance(compiler, result, on_success); + } else { + UnicodeRangeSplitter splitter(zone, ranges); + AddBmpCharacters(compiler, result, on_success, &splitter); + AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); + AddLoneLeadSurrogates(compiler, result, on_success, &splitter); + AddLoneTrailSurrogates(compiler, result, on_success, &splitter); + } return result; } else { return new (zone) TextNode(this, compiler->read_backward(), on_success); @@ -6513,6 +6551,36 @@ void 
DispatchTableConstructor::VisitAction(ActionNode* that) { } +RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, + RegExpNode* on_success) { + // If the regexp matching starts within a surrogate pair, step back + // to the lead surrogate and start matching from there. + DCHECK(!compiler->read_backward()); + Zone* zone = compiler->zone(); + ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); + ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); + + ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); + + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + RegExpNode* step_back = TextNode::CreateForCharacterRanges( + zone, lead_surrogates, true, on_success); + RegExpLookaround::Builder builder(true, step_back, stack_register, + position_register); + RegExpNode* match_trail = TextNode::CreateForCharacterRanges( + zone, trail_surrogates, false, builder.on_match_success()); + + optional_step_back->AddAlternative( + GuardedAlternative(builder.ForMatch(match_trail))); + optional_step_back->AddAlternative(GuardedAlternative(on_success)); + + return optional_step_back; +} + + RegExpEngine::CompilationResult RegExpEngine::Compile( Isolate* isolate, Zone* zone, RegExpCompileData* data, JSRegExp::Flags flags, Handle<String> pattern, @@ -6575,6 +6643,8 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( if (node != NULL) { node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); } + } else if (compiler.unicode() && (is_global || is_sticky)) { + node = OptionallyStepBackToLeadSurrogate(&compiler, node); } if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h index ed91a82d49..5e6f8f45e7 100644 --- a/src/regexp/regexp-ast.h +++ 
b/src/regexp/regexp-ast.h @@ -311,7 +311,7 @@ class RegExpCharacterClass final : public RegExpTree { // d : ASCII digit // D : non-ASCII digit // . : non-unicode non-newline - // * : All characters + // * : All characters, for advancing unanchored regexp uc16 standard_type() { return set_.standard_set_type(); } ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } bool is_negated() { return is_negated_; } diff --git a/test/mjsunit/harmony/unicode-regexp-last-index.js b/test/mjsunit/harmony/unicode-regexp-last-index.js new file mode 100644 index 0000000000..4a075d4380 --- /dev/null +++ b/test/mjsunit/harmony/unicode-regexp-last-index.js @@ -0,0 +1,104 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Flags: --harmony-unicode-regexps --harmony-regexp-lookbehind + +var r = /./ug; +assertEquals(["\ud800\udc00"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +r.lastIndex = 1; +assertEquals(["\ud800\udc00"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +assertEquals(["\ud801\udc01"], r.exec("\ud800\udc00\ud801\udc01")); +r.lastIndex = 3; +assertEquals(["\ud801\udc01"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(4, r.lastIndex); +r.lastIndex = 4; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); +r.lastIndex = 5; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); + +r.lastIndex = 3; +assertEquals(["\ud802"], r.exec("\ud800\udc00\ud801\ud802")); +r.lastIndex = 4; +assertNull(r.exec("\ud800\udc00\ud801\ud802")); + +r = /./g; +assertEquals(["\ud800"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(1, r.lastIndex); +assertEquals(["\udc00"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +assertEquals(["\ud801"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(3, r.lastIndex); +assertEquals(["\udc01"], 
r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(4, r.lastIndex); +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); +r.lastIndex = 1; +assertEquals(["\udc00"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); + +// ------------------------ + +r = /^./ug; +assertEquals(["\ud800\udc00"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +r.lastIndex = 1; +assertEquals(["\ud800\udc00"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); +r.lastIndex = 3; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); +r.lastIndex = 4; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); +r.lastIndex = 5; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); + +r = /^./g; +assertEquals(["\ud800"], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(1, r.lastIndex); +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); +r.lastIndex = 3; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(0, r.lastIndex); + +//------------------------ + +r = /(?:(^.)|.)/ug; +assertEquals(["\ud800\udc00", "\ud800\udc00"], + r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +r.lastIndex = 1; +assertEquals(["\ud800\udc00", "\ud800\udc00"], + r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +assertEquals(["\ud801\udc01", undefined], r.exec("\ud800\udc00\ud801\udc01")); +r.lastIndex = 3; +assertEquals(["\ud801\udc01", undefined], r.exec("\ud800\udc00\ud801\udc01")); +r.lastIndex = 4; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); +r.lastIndex = 5; +assertNull(r.exec("\ud800\udc00\ud801\udc01")); + +r.lastIndex = 3; +assertEquals(["\ud802", undefined], r.exec("\ud800\udc00\ud801\ud802")); +r.lastIndex = 4; +assertNull(r.exec("\ud800\udc00\ud801\ud802")); + +r = /(?:(^.)|.)/g; 
+assertEquals(["\ud800", "\ud800"], + r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(1, r.lastIndex); +assertEquals(["\udc00", undefined], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(2, r.lastIndex); +r.lastIndex = 3; +assertEquals(["\udc01", undefined], r.exec("\ud800\udc00\ud801\udc01")); +assertEquals(4, r.lastIndex);