[regexp] step back if starting unicode regexp within surrogate pair.
See https://github.com/tc39/ecma262/issues/128

R=erik.corry@gmail.com, littledan@chromium.org
BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1608693003

Cr-Commit-Position: refs/heads/master@{#33488}
This commit is contained in:
parent
88f9995d5c
commit
3246d26b71
@ -3957,6 +3957,11 @@ void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
|
|||||||
void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
|
void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
|
||||||
int choice_count = alternatives_->length();
|
int choice_count = alternatives_->length();
|
||||||
|
|
||||||
|
if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
|
||||||
|
alternatives_->at(0).node()->Emit(compiler, trace);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
AssertGuardsMentionRegisters(trace);
|
AssertGuardsMentionRegisters(trace);
|
||||||
|
|
||||||
LimitResult limit_result = LimitVersions(compiler, trace);
|
LimitResult limit_result = LimitVersions(compiler, trace);
|
||||||
@ -5040,22 +5045,21 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
|
|||||||
if (lead_surrogates == nullptr) return;
|
if (lead_surrogates == nullptr) return;
|
||||||
Zone* zone = compiler->zone();
|
Zone* zone = compiler->zone();
|
||||||
// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
|
// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
|
||||||
ZoneList<CharacterRange>* trail_surrogates =
|
ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
|
||||||
new (zone) ZoneList<CharacterRange>(1, zone);
|
zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
|
||||||
trail_surrogates->Add(
|
|
||||||
CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), zone);
|
|
||||||
|
|
||||||
RegExpNode* match =
|
RegExpNode* match;
|
||||||
compiler->read_backward()
|
if (compiler->read_backward()) {
|
||||||
// Reading backward. Assert that reading forward, there is no trail
|
// Reading backward. Assert that reading forward, there is no trail
|
||||||
// surrogate, and then backward match the lead surrogate.
|
// surrogate, and then backward match the lead surrogate.
|
||||||
? NegativeLookaroundAgainstReadDirectionAndMatch(
|
match = NegativeLookaroundAgainstReadDirectionAndMatch(
|
||||||
compiler, trail_surrogates, lead_surrogates, on_success, true)
|
compiler, trail_surrogates, lead_surrogates, on_success, true);
|
||||||
// Reading forward. Forwrad match the lead surrogate and assert that
|
} else {
|
||||||
// no
|
// Reading forward. Forward match the lead surrogate and assert that
|
||||||
// trail surrogate follows.
|
// no trail surrogate follows.
|
||||||
: MatchAndNegativeLookaroundInReadDirection(
|
match = MatchAndNegativeLookaroundInReadDirection(
|
||||||
compiler, lead_surrogates, trail_surrogates, on_success, false);
|
compiler, lead_surrogates, trail_surrogates, on_success, false);
|
||||||
|
}
|
||||||
result->AddAlternative(GuardedAlternative(match));
|
result->AddAlternative(GuardedAlternative(match));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5067,25 +5071,55 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
|
|||||||
if (trail_surrogates == nullptr) return;
|
if (trail_surrogates == nullptr) return;
|
||||||
Zone* zone = compiler->zone();
|
Zone* zone = compiler->zone();
|
||||||
// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
|
// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
|
||||||
ZoneList<CharacterRange>* lead_surrogates =
|
ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
|
||||||
new (zone) ZoneList<CharacterRange>(1, zone);
|
zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
|
||||||
lead_surrogates->Add(
|
|
||||||
CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), zone);
|
|
||||||
|
|
||||||
RegExpNode* match =
|
RegExpNode* match;
|
||||||
compiler->read_backward()
|
if (compiler->read_backward()) {
|
||||||
// Reading backward. Backward match the trail surrogate and assert
|
// Reading backward. Backward match the trail surrogate and assert that no
|
||||||
// that no lead surrogate precedes it.
|
// lead surrogate precedes it.
|
||||||
? MatchAndNegativeLookaroundInReadDirection(
|
match = MatchAndNegativeLookaroundInReadDirection(
|
||||||
compiler, trail_surrogates, lead_surrogates, on_success, true)
|
compiler, trail_surrogates, lead_surrogates, on_success, true);
|
||||||
// Reading forward. Assert that reading backward, there is no lead
|
} else {
|
||||||
// surrogate, and then forward match the trail surrogate.
|
// Reading forward. Assert that reading backward, there is no lead
|
||||||
: NegativeLookaroundAgainstReadDirectionAndMatch(
|
// surrogate, and then forward match the trail surrogate.
|
||||||
compiler, lead_surrogates, trail_surrogates, on_success, false);
|
match = NegativeLookaroundAgainstReadDirectionAndMatch(
|
||||||
|
compiler, lead_surrogates, trail_surrogates, on_success, false);
|
||||||
|
}
|
||||||
result->AddAlternative(GuardedAlternative(match));
|
result->AddAlternative(GuardedAlternative(match));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Adds to |result| the alternatives that advance the subject by one step when
// an unanchored unicode regexp moves on to its next start position. This
// implements ES2015 21.2.5.2.3, AdvanceStringIndex, and is only valid when
// matching forward.
//
//   compiler   - supplies the zone and the (forward) read direction.
//   result     - choice node receiving the two advance alternatives.
//   on_success - continuation reached once the advance has been consumed.
void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result,
                          RegExpNode* on_success) {
  DCHECK(!compiler->read_backward());
  Zone* zone = compiler->zone();

  // First alternative: a single code unit that is either a BMP character or
  // a trail surrogate, i.e. anything below kNonBmpStart that is not a lead
  // surrogate.
  ZoneList<CharacterRange>* non_lead_units =
      new (zone) ZoneList<CharacterRange>(2, zone);
  non_lead_units->Add(CharacterRange::Range(0, kLeadSurrogateStart - 1), zone);
  non_lead_units->Add(
      CharacterRange::Range(kLeadSurrogateEnd + 1, kNonBmpStart - 1), zone);
  RegExpNode* single_unit = TextNode::CreateForCharacterRanges(
      zone, non_lead_units, false, on_success);
  result->AddAlternative(GuardedAlternative(single_unit));

  // Second alternative: a lead surrogate, optionally followed by a trail
  // surrogate.
  ZoneList<CharacterRange>* leads = CharacterRange::List(
      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
  ZoneList<CharacterRange>* trails = CharacterRange::List(
      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));

  // The trail is tried first; if it is absent we continue directly with
  // on_success.
  ChoiceNode* maybe_trail = new (zone) ChoiceNode(2, zone);
  RegExpNode* consume_trail =
      TextNode::CreateForCharacterRanges(zone, trails, false, on_success);
  maybe_trail->AddAlternative(GuardedAlternative(consume_trail));
  maybe_trail->AddAlternative(GuardedAlternative(on_success));

  RegExpNode* lead_then_maybe_trail =
      TextNode::CreateForCharacterRanges(zone, leads, false, maybe_trail);
  result->AddAlternative(GuardedAlternative(lead_then_maybe_trail));
}
|
||||||
|
|
||||||
|
|
||||||
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
|
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
|
||||||
RegExpNode* on_success) {
|
RegExpNode* on_success) {
|
||||||
set_.Canonicalize();
|
set_.Canonicalize();
|
||||||
@ -5102,12 +5136,16 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
|
|||||||
// No matches possible.
|
// No matches possible.
|
||||||
return new (zone) EndNode(EndNode::BACKTRACK, zone);
|
return new (zone) EndNode(EndNode::BACKTRACK, zone);
|
||||||
}
|
}
|
||||||
UnicodeRangeSplitter splitter(zone, ranges);
|
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
|
||||||
ChoiceNode* result = new (compiler->zone()) ChoiceNode(2, compiler->zone());
|
if (standard_type() == '*') {
|
||||||
AddBmpCharacters(compiler, result, on_success, &splitter);
|
AddUnanchoredAdvance(compiler, result, on_success);
|
||||||
AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
|
} else {
|
||||||
AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
|
UnicodeRangeSplitter splitter(zone, ranges);
|
||||||
AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
|
AddBmpCharacters(compiler, result, on_success, &splitter);
|
||||||
|
AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
|
||||||
|
AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
|
||||||
|
AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
} else {
|
} else {
|
||||||
return new (zone) TextNode(this, compiler->read_backward(), on_success);
|
return new (zone) TextNode(this, compiler->read_backward(), on_success);
|
||||||
@ -6513,6 +6551,36 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Builds a prefix for forward-matching unicode regexps: if matching would
// start in the middle of a surrogate pair, step back to the lead surrogate and
// start matching from there; otherwise continue with |on_success| unchanged.
//
//   compiler   - supplies the zone and the unicode lookaround registers.
//   on_success - the node for the rest of the regexp.
//
// Returns a two-way choice node: the first alternative performs the step
// back, the second is |on_success| itself.
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
                                              RegExpNode* on_success) {
  DCHECK(!compiler->read_backward());
  Zone* zone = compiler->zone();

  ZoneList<CharacterRange>* leads = CharacterRange::List(
      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
  ZoneList<CharacterRange>* trails = CharacterRange::List(
      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));

  ChoiceNode* choice = new (zone) ChoiceNode(2, zone);

  // Fetch the registers in separate statements so the stack register is
  // always requested before the position register.
  int lookaround_stack_reg = compiler->UnicodeLookaroundStackRegister();
  int lookaround_position_reg = compiler->UnicodeLookaroundPositionRegister();

  // After the lookaround succeeds, consume one lead surrogate reading
  // backward, then carry on with the rest of the regexp.
  RegExpNode* back_over_lead =
      TextNode::CreateForCharacterRanges(zone, leads, true, on_success);

  // NOTE(review): the leading `true` is presumably the "positive lookaround"
  // flag of RegExpLookaround::Builder -- confirm against its declaration.
  RegExpLookaround::Builder lookaround(true, back_over_lead,
                                       lookaround_stack_reg,
                                       lookaround_position_reg);

  // Body of the lookaround: a trail surrogate at the current position,
  // read forward.
  RegExpNode* trail_ahead = TextNode::CreateForCharacterRanges(
      zone, trails, false, lookaround.on_match_success());

  // Try the step back first; fall through to the plain continuation if the
  // step-back alternative fails.
  choice->AddAlternative(GuardedAlternative(lookaround.ForMatch(trail_ahead)));
  choice->AddAlternative(GuardedAlternative(on_success));

  return choice;
}
|
||||||
|
|
||||||
|
|
||||||
RegExpEngine::CompilationResult RegExpEngine::Compile(
|
RegExpEngine::CompilationResult RegExpEngine::Compile(
|
||||||
Isolate* isolate, Zone* zone, RegExpCompileData* data,
|
Isolate* isolate, Zone* zone, RegExpCompileData* data,
|
||||||
JSRegExp::Flags flags, Handle<String> pattern,
|
JSRegExp::Flags flags, Handle<String> pattern,
|
||||||
@ -6575,6 +6643,8 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
|
|||||||
if (node != NULL) {
|
if (node != NULL) {
|
||||||
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
|
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
|
||||||
}
|
}
|
||||||
|
} else if (compiler.unicode() && (is_global || is_sticky)) {
|
||||||
|
node = OptionallyStepBackToLeadSurrogate(&compiler, node);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
|
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
|
||||||
|
@ -311,7 +311,7 @@ class RegExpCharacterClass final : public RegExpTree {
|
|||||||
// d : ASCII digit
|
// d : ASCII digit
|
||||||
// D : non-ASCII digit
|
// D : non-ASCII digit
|
||||||
// . : non-unicode non-newline
|
// . : non-unicode non-newline
|
||||||
// * : All characters
|
// * : All characters, for advancing unanchored regexp
|
||||||
uc16 standard_type() { return set_.standard_set_type(); }
|
uc16 standard_type() { return set_.standard_set_type(); }
|
||||||
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
||||||
bool is_negated() { return is_negated_; }
|
bool is_negated() { return is_negated_; }
|
||||||
|
104
test/mjsunit/harmony/unicode-regexp-last-index.js
Normal file
104
test/mjsunit/harmony/unicode-regexp-last-index.js
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file.
|
||||||
|
|
||||||
|
// Flags: --harmony-unicode-regexps --harmony-regexp-lookbehind
|
||||||
|
|
||||||
|
// Subject made of two complete surrogate pairs (four code units).
var twoPairs = "\ud800\udc00\ud801\udc01";
// Subject made of one surrogate pair followed by two lone lead surrogates.
var pairThenTwoLeads = "\ud800\udc00\ud801\ud802";

// Unanchored /u: a lastIndex inside a surrogate pair matches the whole pair.
var r = /./ug;
assertEquals(["\ud800\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
r.lastIndex = 1;
assertEquals(["\ud800\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
assertEquals(["\ud801\udc01"], r.exec(twoPairs));
r.lastIndex = 3;
assertEquals(["\ud801\udc01"], r.exec(twoPairs));
assertEquals(4, r.lastIndex);
// At or past the end of the subject there is nothing to match and lastIndex
// is reset.
r.lastIndex = 4;
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);
r.lastIndex = 5;
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);

// A lead surrogate preceded by another lead surrogate is matched on its own.
r.lastIndex = 3;
assertEquals(["\ud802"], r.exec(pairThenTwoLeads));
r.lastIndex = 4;
assertNull(r.exec(pairThenTwoLeads));

// Unanchored, no /u: every code unit is matched individually.
r = /./g;
assertEquals(["\ud800"], r.exec(twoPairs));
assertEquals(1, r.lastIndex);
assertEquals(["\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
assertEquals(["\ud801"], r.exec(twoPairs));
assertEquals(3, r.lastIndex);
assertEquals(["\udc01"], r.exec(twoPairs));
assertEquals(4, r.lastIndex);
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);
r.lastIndex = 1;
assertEquals(["\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);

// ------------------------

// Anchored /u: lastIndex 1 lies inside the first pair, so the match still
// begins at index 0; every later start position fails.
r = /^./ug;
assertEquals(["\ud800\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
r.lastIndex = 1;
assertEquals(["\ud800\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);
r.lastIndex = 3;
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);
r.lastIndex = 4;
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);
r.lastIndex = 5;
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);

// Anchored, no /u: only the very first code unit can match.
r = /^./g;
assertEquals(["\ud800"], r.exec(twoPairs));
assertEquals(1, r.lastIndex);
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);
r.lastIndex = 3;
assertNull(r.exec(twoPairs));
assertEquals(0, r.lastIndex);

//------------------------

// Anchored and unanchored alternatives combined, /u: the capture group is
// only set when the anchored branch matched.
r = /(?:(^.)|.)/ug;
assertEquals(["\ud800\udc00", "\ud800\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
r.lastIndex = 1;
assertEquals(["\ud800\udc00", "\ud800\udc00"], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
assertEquals(["\ud801\udc01", undefined], r.exec(twoPairs));
r.lastIndex = 3;
assertEquals(["\ud801\udc01", undefined], r.exec(twoPairs));
r.lastIndex = 4;
assertNull(r.exec(twoPairs));
r.lastIndex = 5;
assertNull(r.exec(twoPairs));

r.lastIndex = 3;
assertEquals(["\ud802", undefined], r.exec(pairThenTwoLeads));
r.lastIndex = 4;
assertNull(r.exec(pairThenTwoLeads));

// Same pattern without /u: matching proceeds one code unit at a time.
r = /(?:(^.)|.)/g;
assertEquals(["\ud800", "\ud800"], r.exec(twoPairs));
assertEquals(1, r.lastIndex);
assertEquals(["\udc00", undefined], r.exec(twoPairs));
assertEquals(2, r.lastIndex);
r.lastIndex = 3;
assertEquals(["\udc01", undefined], r.exec(twoPairs));
assertEquals(4, r.lastIndex);
|
Loading…
Reference in New Issue
Block a user