RegExp: Add the ability to switch flags on and off within the regexp

R=yangguo@chromium.org

This is a reupload of https://chromium-review.googlesource.com/c/v8/v8/+/571746
with a different user, since the other one was not allowed to commit to V8 any
more.

Bug: 
Change-Id: I6171afd44e514f6c934390faab6f9bee3953ac77
Reviewed-on: https://chromium-review.googlesource.com/752522
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#49098}
This commit is contained in:
Erik 2017-11-02 23:11:14 +01:00 committed by Commit Bot
parent 113527b6bc
commit 68212c80c3
9 changed files with 816 additions and 333 deletions

View File

@ -959,6 +959,7 @@ DEFINE_BOOL(serialization_statistics, false,
// Regexp
DEFINE_BOOL(regexp_optimization, true, "generate optimized regexp code")
DEFINE_BOOL(regexp_mode_modifiers, false, "enable inline flags in regexp.")
// Testing flags test/cctest/test-{flags,api,serialization}.cc
DEFINE_BOOL(testing_bool_flag, true, "testing_bool_flag")

View File

@ -132,17 +132,17 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
bool has_been_compiled = false;
if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
!(flags & JSRegExp::kSticky) &&
if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
pattern->length() <= kPatternTooShortForBoyerMoore) {
// Parse-tree is a single atom that is equal to the pattern.
AtomCompile(re, pattern, flags, pattern);
has_been_compiled = true;
} else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
} else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
parse_result.capture_count == 0) {
RegExpAtom* atom = parse_result.tree->AsAtom();
Vector<const uc16> atom_pattern = atom->data();
if (atom_pattern.length() <= kPatternTooShortForBoyerMoore) {
if (!IgnoreCase(atom->flags()) &&
atom_pattern.length() <= kPatternTooShortForBoyerMoore) {
Handle<String> atom_string;
ASSIGN_RETURN_ON_EXCEPTION(
isolate, atom_string,
@ -622,7 +622,7 @@ RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
}
}
DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
DCHECK(IsGlobal(regexp->GetFlags()));
if (!interpreted) {
register_array_size_ =
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
@ -653,8 +653,7 @@ RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
}
int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
last_index + 1 < subject_->length() &&
if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
// Advance over the surrogate pair.
@ -916,7 +915,7 @@ class FrequencyCollator {
class RegExpCompiler {
public:
RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
JSRegExp::Flags flags, bool is_one_byte);
bool is_one_byte);
int AllocateRegister() {
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
@ -968,13 +967,6 @@ class RegExpCompiler {
void SetRegExpTooBig() { reg_exp_too_big_ = true; }
inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
// Both unicode and ignore_case flags are set. We need to use ICU to find
// the closure over case equivalents.
inline bool needs_unicode_case_equivalents() {
return unicode() && ignore_case();
}
inline bool one_byte() { return one_byte_; }
inline bool optimize() { return optimize_; }
inline void set_optimize(bool value) { optimize_ = value; }
@ -1004,7 +996,6 @@ class RegExpCompiler {
std::vector<RegExpNode*>* work_list_;
int recursion_depth_;
RegExpMacroAssembler* macro_assembler_;
JSRegExp::Flags flags_;
bool one_byte_;
bool reg_exp_too_big_;
bool limiting_recursion_;
@ -1036,13 +1027,12 @@ static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
// Attempts to compile the regexp using an Irregexp code generator. Returns
// a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
JSRegExp::Flags flags, bool one_byte)
bool one_byte)
: next_register_(2 * (capture_count + 1)),
unicode_lookaround_stack_register_(kNoRegister),
unicode_lookaround_position_register_(kNoRegister),
work_list_(nullptr),
recursion_depth_(0),
flags_(flags),
one_byte_(one_byte),
reg_exp_too_big_(false),
limiting_recursion_(false),
@ -2503,7 +2493,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
QuickCheckDetails::Position* pos =
details->positions(characters_filled_in);
uc16 c = quarks[i];
if (compiler->ignore_case()) {
if (elm.atom()->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c,
compiler->one_byte(), chars);
@ -2711,18 +2701,16 @@ class VisitMarker {
NodeInfo* info_;
};
RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
DCHECK(!info()->visited);
VisitMarker marker(info());
return FilterSuccessor(depth - 1, ignore_case);
return FilterSuccessor(depth - 1);
}
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
RegExpNode* next = on_success_->FilterOneByte(depth - 1);
if (next == nullptr) return set_replacement(nullptr);
on_success_ = next;
return set_replacement(this);
@ -2745,8 +2733,7 @@ static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
return false;
}
RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
RegExpNode* TextNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
DCHECK(!info()->visited);
@ -2759,7 +2746,7 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
for (int j = 0; j < quarks.length(); j++) {
uint16_t c = quarks[j];
if (c <= String::kMaxOneByteCharCode) continue;
if (!ignore_case) return set_replacement(nullptr);
if (!IgnoreCase(elm.atom()->flags())) return set_replacement(nullptr);
// Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range.
uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
@ -2781,42 +2768,41 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
ranges->at(0).from() == 0 &&
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
continue;
return set_replacement(nullptr);
}
} else {
if (range_count == 0 ||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
continue;
return set_replacement(nullptr);
}
}
}
}
return FilterSuccessor(depth - 1, ignore_case);
return FilterSuccessor(depth - 1);
}
RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
{
VisitMarker marker(info());
RegExpNode* continue_replacement =
continue_node_->FilterOneByte(depth - 1, ignore_case);
RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
// If we can't continue after the loop then there is no sense in doing the
// loop.
if (continue_replacement == nullptr) return set_replacement(nullptr);
}
return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
return ChoiceNode::FilterOneByte(depth - 1);
}
RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
RegExpNode* ChoiceNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@ -2836,8 +2822,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
RegExpNode* survivor = nullptr;
for (int i = 0; i < choice_count; i++) {
GuardedAlternative alternative = alternatives_->at(i);
RegExpNode* replacement =
alternative.node()->FilterOneByte(depth - 1, ignore_case);
RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
if (replacement != nullptr) {
alternatives_->at(i).set_node(replacement);
@ -2857,7 +2842,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
for (int i = 0; i < choice_count; i++) {
RegExpNode* replacement =
alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
alternatives_->at(i).node()->FilterOneByte(depth - 1);
if (replacement != nullptr) {
alternatives_->at(i).set_node(replacement);
new_alternatives->Add(alternatives_->at(i), zone());
@ -2867,9 +2852,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
return this;
}
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
bool ignore_case) {
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@ -2877,12 +2860,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
RegExpNode* node = alternatives_->at(1).node();
RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
RegExpNode* replacement = node->FilterOneByte(depth - 1);
if (replacement == nullptr) return set_replacement(nullptr);
alternatives_->at(1).set_node(replacement);
RegExpNode* neg_node = alternatives_->at(0).node();
RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
// If the negative lookahead is always going to fail then
// we don't need to check it.
if (neg_replacement == nullptr) return set_replacement(replacement);
@ -3199,6 +3182,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
TextElement elm = elements()->at(i);
int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
if (elm.text_type() == TextElement::ATOM) {
if (SkipPass(pass, elm.atom()->ignore_case())) continue;
Vector<const uc16> quarks = elm.atom()->data();
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
if (first_element_checked && i == 0 && j == 0) continue;
@ -3254,9 +3238,7 @@ int TextNode::Length() {
return elm.cp_offset() + elm.length();
}
bool TextNode::SkipPass(int int_pass, bool ignore_case) {
TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
if (ignore_case) {
return pass == SIMPLE_CHARACTER_MATCH;
} else {
@ -3264,32 +3246,33 @@ bool TextNode::SkipPass(int int_pass, bool ignore_case) {
}
}
TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success) {
RegExpNode* on_success,
JSRegExp::Flags flags) {
DCHECK_NOT_NULL(ranges);
ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
zone);
elms->Add(
TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, flags)),
zone);
return new (zone) TextNode(elms, read_backward, on_success);
}
TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success) {
RegExpNode* on_success,
JSRegExp::Flags flags) {
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
elms->Add(
TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
zone);
elms->Add(
TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
zone);
elms->Add(TextElement::CharClass(
new (zone) RegExpCharacterClass(lead_ranges, flags)),
zone);
elms->Add(TextElement::CharClass(
new (zone) RegExpCharacterClass(trail_ranges, flags)),
zone);
return new (zone) TextNode(elms, read_backward, on_success);
}
@ -3323,27 +3306,15 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
// check that now.
if (trace->characters_preloaded() == 1) {
for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
if (!SkipPass(pass, compiler->ignore_case())) {
TextEmitPass(compiler,
static_cast<TextEmitPassType>(pass),
true,
trace,
false,
&bound_checked_to);
}
TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
false, &bound_checked_to);
}
first_elt_done = true;
}
for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
if (!SkipPass(pass, compiler->ignore_case())) {
TextEmitPass(compiler,
static_cast<TextEmitPassType>(pass),
false,
trace,
first_elt_done,
&bound_checked_to);
}
TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
first_elt_done, &bound_checked_to);
}
Trace successor_trace(*trace);
@ -3386,11 +3357,15 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
TextElement elm = elements()->at(i);
if (elm.text_type() == TextElement::CHAR_CLASS) {
RegExpCharacterClass* cc = elm.char_class();
// None of the standard character classes is different in the case
// independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
if (IgnoreCase(cc->flags()) &&
!NeedsUnicodeCaseEquivalents(cc->flags())) {
// None of the standard character classes is different in the case
// independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
is_one_byte);
}
}
}
}
@ -4353,9 +4328,9 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
RecursionCheck rc(compiler);
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (compiler->ignore_case()) {
if (IgnoreCase(flags_)) {
assembler->CheckNotBackReferenceIgnoreCase(
start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
} else {
assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack());
@ -4364,7 +4339,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
// Check that the back reference does not end inside a surrogate pair.
if (compiler->unicode() && !compiler->one_byte()) {
if (IsUnicode(flags_) && !compiler->one_byte()) {
assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
}
on_success()->Emit(compiler, trace);
@ -4887,24 +4862,24 @@ void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
(*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
}
void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
ZoneList<CharacterRange>* bmp = splitter->bmp();
if (bmp == nullptr) return;
JSRegExp::Flags default_flags = JSRegExp::Flags();
result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
compiler->zone(), bmp, compiler->read_backward(), on_success)));
compiler->zone(), bmp, compiler->read_backward(), on_success,
default_flags)));
}
void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success,
UnicodeRangeSplitter* splitter) {
ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
if (non_bmp == nullptr) return;
DCHECK(compiler->unicode());
DCHECK(!compiler->one_byte());
Zone* zone = compiler->zone();
JSRegExp::Flags default_flags = JSRegExp::Flags();
CharacterRange::Canonicalize(non_bmp);
for (int i = 0; i < non_bmp->length(); i++) {
// Match surrogate pair.
@ -4924,7 +4899,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Singleton(from_l),
CharacterRange::Range(from_t, to_t), compiler->read_backward(),
on_success)));
on_success, default_flags)));
} else {
if (from_t != kTrailSurrogateStart) {
// Add [from_l][from_t-\udfff]
@ -4932,7 +4907,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Singleton(from_l),
CharacterRange::Range(from_t, kTrailSurrogateEnd),
compiler->read_backward(), on_success)));
compiler->read_backward(), on_success, default_flags)));
from_l++;
}
if (to_t != kTrailSurrogateEnd) {
@ -4941,7 +4916,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Singleton(to_l),
CharacterRange::Range(kTrailSurrogateStart, to_t),
compiler->read_backward(), on_success)));
compiler->read_backward(), on_success, default_flags)));
to_l--;
}
if (from_l <= to_l) {
@ -4950,49 +4925,47 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Range(from_l, to_l),
CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
compiler->read_backward(), on_success)));
compiler->read_backward(), on_success, default_flags)));
}
}
}
}
RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
ZoneList<CharacterRange>* match, RegExpNode* on_success,
bool read_backward) {
ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
JSRegExp::Flags flags) {
Zone* zone = compiler->zone();
RegExpNode* match_node = TextNode::CreateForCharacterRanges(
zone, match, read_backward, on_success);
zone, match, read_backward, on_success, flags);
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
RegExpLookaround::Builder lookaround(false, match_node, stack_register,
position_register);
RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
zone, lookbehind, !read_backward, lookaround.on_match_success());
zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
return lookaround.ForMatch(negative_match);
}
RegExpNode* MatchAndNegativeLookaroundInReadDirection(
RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
bool read_backward) {
bool read_backward, JSRegExp::Flags flags) {
Zone* zone = compiler->zone();
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
RegExpLookaround::Builder lookaround(false, on_success, stack_register,
position_register);
RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
zone, lookahead, read_backward, lookaround.on_match_success());
zone, lookahead, read_backward, lookaround.on_match_success(), flags);
return TextNode::CreateForCharacterRanges(
zone, match, read_backward, lookaround.ForMatch(negative_match));
zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
}
void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success,
UnicodeRangeSplitter* splitter) {
JSRegExp::Flags default_flags = JSRegExp::Flags();
ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
if (lead_surrogates == nullptr) return;
Zone* zone = compiler->zone();
@ -5005,20 +4978,22 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
// Reading backward. Assert that reading forward, there is no trail
// surrogate, and then backward match the lead surrogate.
match = NegativeLookaroundAgainstReadDirectionAndMatch(
compiler, trail_surrogates, lead_surrogates, on_success, true);
compiler, trail_surrogates, lead_surrogates, on_success, true,
default_flags);
} else {
// Reading forward. Forward match the lead surrogate and assert that
// no trail surrogate follows.
match = MatchAndNegativeLookaroundInReadDirection(
compiler, lead_surrogates, trail_surrogates, on_success, false);
compiler, lead_surrogates, trail_surrogates, on_success, false,
default_flags);
}
result->AddAlternative(GuardedAlternative(match));
}
void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success,
UnicodeRangeSplitter* splitter) {
JSRegExp::Flags default_flags = JSRegExp::Flags();
ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
if (trail_surrogates == nullptr) return;
Zone* zone = compiler->zone();
@ -5031,12 +5006,14 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
// Reading backward. Backward match the trail surrogate and assert that no
// lead surrogate precedes it.
match = MatchAndNegativeLookaroundInReadDirection(
compiler, trail_surrogates, lead_surrogates, on_success, true);
compiler, trail_surrogates, lead_surrogates, on_success, true,
default_flags);
} else {
// Reading forward. Assert that reading backward, there is no lead
// surrogate, and then forward match the trail surrogate.
match = NegativeLookaroundAgainstReadDirectionAndMatch(
compiler, lead_surrogates, trail_surrogates, on_success, false);
compiler, lead_surrogates, trail_surrogates, on_success, false,
default_flags);
}
result->AddAlternative(GuardedAlternative(match));
}
@ -5052,7 +5029,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
// the associated trail surrogate.
ZoneList<CharacterRange>* range = CharacterRange::List(
zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
JSRegExp::Flags default_flags = JSRegExp::Flags();
return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
default_flags);
}
void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
@ -5093,10 +5072,10 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
set_.Canonicalize();
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone);
if (compiler->needs_unicode_case_equivalents()) {
if (NeedsUnicodeCaseEquivalents(flags_)) {
AddUnicodeCaseEquivalents(ranges, zone);
}
if (compiler->unicode() && !compiler->one_byte() &&
if (IsUnicode(flags_) && !compiler->one_byte() &&
!contains_split_surrogate()) {
if (is_negated()) {
ZoneList<CharacterRange>* negated =
@ -5105,9 +5084,10 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ranges = negated;
}
if (ranges->length() == 0) {
JSRegExp::Flags default_flags = JSRegExp::Flags();
ranges->Add(CharacterRange::Everything(), zone);
RegExpCharacterClass* fail =
new (zone) RegExpCharacterClass(ranges, NEGATED);
new (zone) RegExpCharacterClass(ranges, default_flags, NEGATED);
return new (zone) TextNode(fail, compiler->read_backward(), on_success);
}
if (standard_type() == '*') {
@ -5182,10 +5162,12 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
// i is length or it is the index of an atom.
if (i == length) break;
int first_atom = i;
JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
i++;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
if (alternative->AsAtom()->flags() != flags) break;
i++;
}
// Sort atoms to get ones with common prefixes together.
@ -5197,7 +5179,7 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
DCHECK_LT(first_atom, alternatives->length());
DCHECK_LE(i, alternatives->length());
DCHECK_LE(first_atom, i);
if (compiler->ignore_case()) {
if (IgnoreCase(flags)) {
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
auto compare_closure =
@ -5229,7 +5211,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
i++;
continue;
}
RegExpAtom* atom = alternative->AsAtom();
RegExpAtom* const atom = alternative->AsAtom();
JSRegExp::Flags flags = atom->flags();
unibrow::uchar common_prefix = atom->data().at(0);
int first_with_prefix = i;
int prefix_length = atom->length();
@ -5237,10 +5220,11 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
while (i < length) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
atom = alternative->AsAtom();
RegExpAtom* const atom = alternative->AsAtom();
if (atom->flags() != flags) break;
unibrow::uchar new_prefix = atom->data().at(0);
if (new_prefix != common_prefix) {
if (!compiler->ignore_case()) break;
if (!IgnoreCase(flags)) break;
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
new_prefix = Canonical(canonicalize, new_prefix);
@ -5257,7 +5241,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
// common prefix if the terms were similar or presorted in the input.
// Find out how long the common prefix is.
int run_length = i - first_with_prefix;
atom = alternatives->at(first_with_prefix)->AsAtom();
RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
for (int j = 1; j < run_length && prefix_length > 1; j++) {
RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom();
@ -5268,8 +5252,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
}
}
}
RegExpAtom* prefix =
new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
RegExpAtom* prefix = new (zone)
RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
pair->Add(prefix, zone);
ZoneList<RegExpTree*>* suffixes =
@ -5282,7 +5266,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
suffixes->Add(new (zone) RegExpEmpty(), zone);
} else {
RegExpTree* suffix = new (zone) RegExpAtom(
old_atom->data().SubVector(prefix_length, old_atom->length()));
old_atom->data().SubVector(prefix_length, old_atom->length()),
flags);
suffixes->Add(suffix, zone);
}
}
@ -5305,7 +5290,6 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
Zone* zone = compiler->zone();
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
const bool unicode = compiler->unicode();
int write_posn = 0;
int i = 0;
@ -5316,24 +5300,28 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
i++;
continue;
}
RegExpAtom* atom = alternative->AsAtom();
RegExpAtom* const atom = alternative->AsAtom();
if (atom->length() != 1) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
DCHECK_IMPLIES(unicode,
JSRegExp::Flags flags = atom->flags();
DCHECK_IMPLIES(IsUnicode(flags),
!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
bool contains_trail_surrogate =
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
int first_in_run = i;
i++;
// Find a run of single-character atom alternatives that have identical
// flags (case independence and unicode-ness).
while (i < length) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
atom = alternative->AsAtom();
RegExpAtom* const atom = alternative->AsAtom();
if (atom->length() != 1) break;
DCHECK_IMPLIES(unicode,
if (atom->flags() != flags) break;
DCHECK_IMPLIES(IsUnicode(flags),
!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
contains_trail_surrogate |=
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
@ -5349,12 +5337,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
DCHECK_EQ(old_atom->length(), 1);
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
}
RegExpCharacterClass::Flags flags;
if (unicode && contains_trail_surrogate) {
flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
RegExpCharacterClass::CharacterClassFlags character_class_flags;
if (IsUnicode(flags) && contains_trail_surrogate) {
character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
}
alternatives->at(write_posn++) =
new (zone) RegExpCharacterClass(ranges, flags);
new (zone) RegExpCharacterClass(ranges, flags, character_class_flags);
} else {
// Just copy any trivial alternatives.
for (int j = first_in_run; j < i; j++) {
@ -5586,8 +5574,9 @@ namespace {
// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpAssertion::AssertionType type) {
DCHECK(compiler->needs_unicode_case_equivalents());
RegExpAssertion::AssertionType type,
JSRegExp::Flags flags) {
DCHECK(NeedsUnicodeCaseEquivalents(flags));
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* word_range =
new (zone) ZoneList<CharacterRange>(2, zone);
@ -5605,13 +5594,13 @@ RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
stack_register, position_register);
RegExpNode* backward = TextNode::CreateForCharacterRanges(
zone, word_range, true, lookbehind.on_match_success());
zone, word_range, true, lookbehind.on_match_success(), flags);
// Look to the right.
RegExpLookaround::Builder lookahead(lookahead_for_word,
lookbehind.ForMatch(backward),
stack_register, position_register);
RegExpNode* forward = TextNode::CreateForCharacterRanges(
zone, word_range, false, lookahead.on_match_success());
zone, word_range, false, lookahead.on_match_success(), flags);
result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
}
return result;
@ -5629,13 +5618,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
case START_OF_INPUT:
return AssertionNode::AtStart(on_success);
case BOUNDARY:
return compiler->needs_unicode_case_equivalents()
? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
return NeedsUnicodeCaseEquivalents(flags_)
? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
flags_)
: AssertionNode::AtBoundary(on_success);
case NON_BOUNDARY:
return compiler->needs_unicode_case_equivalents()
return NeedsUnicodeCaseEquivalents(flags_)
? BoundaryAssertionAsLookaround(compiler, on_success,
NON_BOUNDARY)
NON_BOUNDARY, flags_)
: AssertionNode::AtNonBoundary(on_success);
case END_OF_INPUT:
return AssertionNode::AtEnd(on_success);
@ -5651,7 +5641,9 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ZoneList<CharacterRange>* newline_ranges =
new(zone) ZoneList<CharacterRange>(3, zone);
CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
JSRegExp::Flags default_flags = JSRegExp::Flags();
RegExpCharacterClass* newline_atom =
new (zone) RegExpCharacterClass('n', default_flags);
TextNode* newline_matcher = new (zone) TextNode(
newline_atom, false, ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register,
@ -5681,7 +5673,7 @@ RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return new (compiler->zone())
BackReferenceNode(RegExpCapture::StartRegister(index()),
RegExpCapture::EndRegister(index()),
RegExpCapture::EndRegister(index()), flags_,
compiler->read_backward(), on_success);
}
@ -6337,9 +6329,7 @@ void TextNode::CalculateOffsets() {
void Analysis::VisitText(TextNode* that) {
if (ignore_case()) {
that->MakeCaseIndependent(isolate(), is_one_byte_);
}
that->MakeCaseIndependent(isolate(), is_one_byte_);
EnsureAnalyzed(that->on_success());
if (!has_failed()) {
that->CalculateOffsets();
@ -6450,7 +6440,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
return;
}
uc16 character = atom->data()[j];
if (bm->compiler()->ignore_case()) {
if (IgnoreCase(atom->flags())) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(
isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
@ -6602,9 +6592,9 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) {
target->Accept(this);
}
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
RegExpNode* on_success) {
RegExpNode* on_success,
JSRegExp::Flags flags) {
// If the regexp matching starts within a surrogate pair, step back
// to the lead surrogate and start matching from there.
DCHECK(!compiler->read_backward());
@ -6619,11 +6609,11 @@ RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
RegExpNode* step_back = TextNode::CreateForCharacterRanges(
zone, lead_surrogates, true, on_success);
zone, lead_surrogates, true, on_success, flags);
RegExpLookaround::Builder builder(true, step_back, stack_register,
position_register);
RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
zone, trail_surrogates, false, builder.on_match_success());
zone, trail_surrogates, false, builder.on_match_success(), flags);
optional_step_back->AddAlternative(
GuardedAlternative(builder.ForMatch(match_trail)));
@ -6640,12 +6630,10 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
return IrregexpRegExpTooBig(isolate);
}
bool ignore_case = flags & JSRegExp::kIgnoreCase;
bool is_sticky = flags & JSRegExp::kSticky;
bool is_global = flags & JSRegExp::kGlobal;
bool is_unicode = flags & JSRegExp::kUnicode;
RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
is_one_byte);
bool is_sticky = IsSticky(flags);
bool is_global = IsGlobal(flags);
bool is_unicode = IsUnicode(flags);
RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
@ -6673,9 +6661,11 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
if (!is_start_anchored && !is_sticky) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
JSRegExp::Flags default_flags = JSRegExp::Flags();
RegExpNode* loop_node = RegExpQuantifier::ToNode(
0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
&compiler, captured_body, data->contains_anchor);
0, RegExpTree::kInfinity, false,
new (zone) RegExpCharacterClass('*', default_flags), &compiler,
captured_body, data->contains_anchor);
if (data->contains_anchor) {
// Unroll loop once, to take care of the case that might start
@ -6683,26 +6673,27 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
new (zone) RegExpCharacterClass('*'), false, loop_node)));
new (zone) RegExpCharacterClass('*', default_flags), false,
loop_node)));
node = first_step_node;
} else {
node = loop_node;
}
}
if (is_one_byte) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
if (node != nullptr) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
}
} else if (compiler.unicode() && (is_global || is_sticky)) {
node = OptionallyStepBackToLeadSurrogate(&compiler, node);
} else if (is_unicode && (is_global || is_sticky)) {
node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
}
if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node;
Analysis analysis(isolate, flags, is_one_byte);
Analysis analysis(isolate, is_one_byte);
analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) {
const char* error_message = analysis.error_message();

View File

@ -21,6 +21,36 @@ class RegExpNode;
class RegExpTree;
class BoyerMooreLookahead;
inline bool IgnoreCase(JSRegExp::Flags flags) {
return (flags & JSRegExp::kIgnoreCase) != 0;
}
inline bool IsUnicode(JSRegExp::Flags flags) {
return (flags & JSRegExp::kUnicode) != 0;
}
inline bool IsSticky(JSRegExp::Flags flags) {
return (flags & JSRegExp::kSticky) != 0;
}
inline bool IsGlobal(JSRegExp::Flags flags) {
return (flags & JSRegExp::kGlobal) != 0;
}
inline bool DotAll(JSRegExp::Flags flags) {
return (flags & JSRegExp::kDotAll) != 0;
}
inline bool Multiline(JSRegExp::Flags flags) {
return (flags & JSRegExp::kMultiline) != 0;
}
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
// Both unicode and ignore_case flags are set. We need to use ICU to find
// the closure over case equivalents.
return IsUnicode(flags) && IgnoreCase(flags);
}
class RegExpImpl {
public:
// Whether V8 is compiled with native regexp support or not.
@ -495,9 +525,7 @@ class RegExpNode: public ZoneObject {
// If we know that the input is one-byte then there are some nodes that can
// never match. This method returns a node that can be substituted for
// itself, or nullptr if the node can never match.
virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
return this;
}
virtual RegExpNode* FilterOneByte(int depth) { return this; }
// Helper for FilterOneByte.
RegExpNode* replacement() {
DCHECK(info()->replacement_calculated);
@ -569,7 +597,7 @@ class SeqRegExpNode: public RegExpNode {
: RegExpNode(on_success->zone()), on_success_(on_success) { }
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
virtual RegExpNode* FilterOneByte(int depth);
virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) {
on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
@ -577,7 +605,7 @@ class SeqRegExpNode: public RegExpNode {
}
protected:
RegExpNode* FilterSuccessor(int depth, bool ignore_case);
RegExpNode* FilterSuccessor(int depth);
private:
RegExpNode* on_success_;
@ -682,13 +710,15 @@ class TextNode: public SeqRegExpNode {
static TextNode* CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success);
RegExpNode* on_success,
JSRegExp::Flags flags);
// Create TextNode for a surrogate pair with a range given for the
// lead and the trail surrogate each.
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success);
RegExpNode* on_success,
JSRegExp::Flags flags);
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
@ -705,7 +735,7 @@ class TextNode: public SeqRegExpNode {
virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start);
void CalculateOffsets();
virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
virtual RegExpNode* FilterOneByte(int depth);
private:
enum TextEmitPassType {
@ -715,7 +745,7 @@ class TextNode: public SeqRegExpNode {
CASE_CHARACTER_MATCH, // Case-independent single character check.
CHARACTER_CLASS_MATCH // Character class.
};
static bool SkipPass(int pass, bool ignore_case);
static bool SkipPass(TextEmitPassType pass, bool ignore_case);
static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
static const int kLastPass = CHARACTER_CLASS_MATCH;
void TextEmitPass(RegExpCompiler* compiler,
@ -779,11 +809,12 @@ class AssertionNode: public SeqRegExpNode {
class BackReferenceNode: public SeqRegExpNode {
public:
BackReferenceNode(int start_reg, int end_reg, bool read_backward,
RegExpNode* on_success)
BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
bool read_backward, RegExpNode* on_success)
: SeqRegExpNode(on_success),
start_reg_(start_reg),
end_reg_(end_reg),
flags_(flags),
read_backward_(read_backward) {}
virtual void Accept(NodeVisitor* visitor);
int start_register() { return start_reg_; }
@ -805,6 +836,7 @@ class BackReferenceNode: public SeqRegExpNode {
private:
int start_reg_;
int end_reg_;
JSRegExp::Flags flags_;
bool read_backward_;
};
@ -929,7 +961,7 @@ class ChoiceNode: public RegExpNode {
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
return true;
}
virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
virtual RegExpNode* FilterOneByte(int depth);
virtual bool read_backward() { return false; }
protected:
@ -1001,7 +1033,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
return !is_first;
}
virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
virtual RegExpNode* FilterOneByte(int depth);
};
@ -1028,7 +1060,7 @@ class LoopChoiceNode: public ChoiceNode {
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
virtual bool read_backward() { return read_backward_; }
virtual void Accept(NodeVisitor* visitor);
virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
virtual RegExpNode* FilterOneByte(int depth);
private:
// AddAlternative is made private for loop nodes because alternatives
@ -1435,11 +1467,8 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
// +-------+ +------------+
class Analysis: public NodeVisitor {
public:
Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
: isolate_(isolate),
flags_(flags),
is_one_byte_(is_one_byte),
error_message_(nullptr) {}
Analysis(Isolate* isolate, bool is_one_byte)
: isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
void EnsureAnalyzed(RegExpNode* node);
#define DECLARE_VISIT(Type) \
@ -1459,12 +1488,8 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
Isolate* isolate() const { return isolate_; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
private:
Isolate* isolate_;
JSRegExp::Flags flags_;
bool is_one_byte_;
const char* error_message_;

View File

@ -6,6 +6,7 @@
#define V8_REGEXP_REGEXP_AST_H_
#include "src/objects.h"
#include "src/objects/js-regexp.h"
#include "src/objects/string.h"
#include "src/utils.h"
#include "src/zone/zone-containers.h"
@ -144,7 +145,7 @@ class CharacterSet final BASE_EMBEDDED {
explicit CharacterSet(ZoneList<CharacterRange>* ranges)
: ranges_(ranges), standard_set_type_(0) {}
ZoneList<CharacterRange>* ranges(Zone* zone);
uc16 standard_set_type() { return standard_set_type_; }
uc16 standard_set_type() const { return standard_set_type_; }
void set_standard_set_type(uc16 special_set_type) {
standard_set_type_ = special_set_type;
}
@ -274,7 +275,8 @@ class RegExpAssertion final : public RegExpTree {
BOUNDARY,
NON_BOUNDARY
};
explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
: assertion_type_(type), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAssertion* AsAssertion() override;
@ -286,7 +288,8 @@ class RegExpAssertion final : public RegExpTree {
AssertionType assertion_type() { return assertion_type_; }
private:
AssertionType assertion_type_;
const AssertionType assertion_type_;
const JSRegExp::Flags flags_;
};
@ -300,12 +303,18 @@ class RegExpCharacterClass final : public RegExpTree {
NEGATED = 1 << 0,
CONTAINS_SPLIT_SURROGATE = 1 << 1,
};
typedef base::Flags<Flag> Flags;
typedef base::Flags<Flag> CharacterClassFlags;
explicit RegExpCharacterClass(ZoneList<CharacterRange>* ranges,
Flags flags = Flags())
: set_(ranges), flags_(flags) {}
explicit RegExpCharacterClass(uc16 type) : set_(type), flags_(0) {}
RegExpCharacterClass(
ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
CharacterClassFlags character_class_flags = CharacterClassFlags())
: set_(ranges),
flags_(flags),
character_class_flags_(character_class_flags) {}
RegExpCharacterClass(uc16 type, JSRegExp::Flags flags)
: set_(type),
flags_(flags),
character_class_flags_(CharacterClassFlags()) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpCharacterClass* AsCharacterClass() override;
@ -332,22 +341,25 @@ class RegExpCharacterClass final : public RegExpTree {
// D : non-ASCII digit
// . : non-newline
// * : All characters, for advancing unanchored regexp
uc16 standard_type() { return set_.standard_set_type(); }
uc16 standard_type() const { return set_.standard_set_type(); }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
bool is_negated() const { return (flags_ & NEGATED) != 0; }
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
JSRegExp::Flags flags() const { return flags_; }
bool contains_split_surrogate() const {
return (flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
}
private:
CharacterSet set_;
const Flags flags_;
const JSRegExp::Flags flags_;
const CharacterClassFlags character_class_flags_;
};
class RegExpAtom final : public RegExpTree {
public:
explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
: data_(data), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAtom* AsAtom() override;
@ -358,9 +370,12 @@ class RegExpAtom final : public RegExpTree {
void AppendToText(RegExpText* text, Zone* zone) override;
Vector<const uc16> data() { return data_; }
int length() { return data_.length(); }
JSRegExp::Flags flags() const { return flags_; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
private:
Vector<const uc16> data_;
const JSRegExp::Flags flags_;
};
@ -532,9 +547,10 @@ class RegExpLookaround final : public RegExpTree {
class RegExpBackReference final : public RegExpTree {
public:
RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
explicit RegExpBackReference(RegExpCapture* capture)
: capture_(capture), name_(nullptr) {}
explicit RegExpBackReference(JSRegExp::Flags flags)
: capture_(nullptr), name_(nullptr), flags_(flags) {}
RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags)
: capture_(capture), name_(nullptr), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpBackReference* AsBackReference() override;
@ -552,6 +568,7 @@ class RegExpBackReference final : public RegExpTree {
private:
RegExpCapture* capture_;
const ZoneVector<uc16>* name_;
const JSRegExp::Flags flags_;
};

View File

@ -31,10 +31,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
named_back_references_(nullptr),
in_(in),
current_(kEndMarker),
dotall_(flags & JSRegExp::kDotAll),
ignore_case_(flags & JSRegExp::kIgnoreCase),
multiline_(flags & JSRegExp::kMultiline),
unicode_(flags & JSRegExp::kUnicode),
top_level_flags_(flags),
next_pos_(0),
captures_started_(0),
capture_count_(0),
@ -44,7 +41,6 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
is_scanned_for_captures_(false),
has_named_captures_(false),
failed_(false) {
DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
Advance();
}
@ -183,7 +179,7 @@ RegExpTree* RegExpParser::ParsePattern() {
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
0, nullptr, ignore_case(), unicode(), zone());
0, nullptr, top_level_flags_, zone());
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@ -253,12 +249,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("Nothing to repeat"));
case '^': {
Advance();
if (multiline()) {
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
if (builder->multiline()) {
builder->AddAssertion(new (zone()) RegExpAssertion(
RegExpAssertion::START_OF_LINE, builder->flags()));
} else {
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
builder->AddAssertion(new (zone()) RegExpAssertion(
RegExpAssertion::START_OF_INPUT, builder->flags()));
set_contains_anchor();
}
continue;
@ -266,9 +262,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '$': {
Advance();
RegExpAssertion::AssertionType assertion_type =
multiline() ? RegExpAssertion::END_OF_LINE
: RegExpAssertion::END_OF_INPUT;
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
builder->multiline() ? RegExpAssertion::END_OF_LINE
: RegExpAssertion::END_OF_INPUT;
builder->AddAssertion(
new (zone()) RegExpAssertion(assertion_type, builder->flags()));
continue;
}
case '.': {
@ -276,7 +273,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
if (dotall()) {
if (builder->dotall()) {
// Everything.
DCHECK(FLAG_harmony_regexp_dotall);
CharacterRange::AddClassEscape('*', ranges, false, zone());
@ -285,78 +282,18 @@ RegExpTree* RegExpParser::ParseDisjunction() {
CharacterRange::AddClassEscape('.', ranges, false, zone());
}
RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, builder->flags());
builder->AddCharacterClass(cc);
break;
}
case '(': {
SubexpressionType subexpr_type = CAPTURE;
RegExpLookaround::Type lookaround_type = state->lookaround_type();
bool is_named_capture = false;
Advance();
if (current() == '?') {
switch (Next()) {
case ':':
subexpr_type = GROUPING;
Advance(2);
break;
case '=':
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = POSITIVE_LOOKAROUND;
Advance(2);
break;
case '!':
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = NEGATIVE_LOOKAROUND;
Advance(2);
break;
case '<':
Advance();
if (FLAG_harmony_regexp_lookbehind) {
if (Next() == '=') {
subexpr_type = POSITIVE_LOOKAROUND;
lookaround_type = RegExpLookaround::LOOKBEHIND;
Advance(2);
break;
} else if (Next() == '!') {
subexpr_type = NEGATIVE_LOOKAROUND;
lookaround_type = RegExpLookaround::LOOKBEHIND;
Advance(2);
break;
}
}
if (FLAG_harmony_regexp_named_captures) {
has_named_captures_ = true;
is_named_capture = true;
Advance();
break;
}
// Fall through.
default:
return ReportError(CStrVector("Invalid group"));
}
}
const ZoneVector<uc16>* capture_name = nullptr;
if (subexpr_type == CAPTURE) {
if (captures_started_ >= kMaxCaptures) {
return ReportError(CStrVector("Too many captures"));
}
captures_started_++;
if (is_named_capture) {
capture_name = ParseCaptureGroupName(CHECK_FAILED);
}
}
// Store current state and begin new disjunction parsing.
state = new (zone()) RegExpParserState(
state, subexpr_type, lookaround_type, captures_started_,
capture_name, ignore_case(), unicode(), zone());
state = ParseOpenParenthesis(state CHECK_FAILED);
builder = state->builder();
continue;
}
case '[': {
RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
builder->AddCharacterClass(cc->AsCharacterClass());
break;
}
@ -368,13 +305,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("\\ at end of pattern"));
case 'b':
Advance(2);
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
builder->AddAssertion(new (zone()) RegExpAssertion(
RegExpAssertion::BOUNDARY, builder->flags()));
continue;
case 'B':
Advance(2);
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
builder->AddAssertion(new (zone()) RegExpAssertion(
RegExpAssertion::NON_BOUNDARY, builder->flags()));
continue;
// AtomEscape ::
// CharacterClassEscape
@ -391,10 +328,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges,
unicode() && ignore_case(), zone());
CharacterRange::AddClassEscape(
c, ranges, unicode() && builder->ignore_case(), zone());
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges);
new (zone()) RegExpCharacterClass(ranges, builder->flags());
builder->AddCharacterClass(cc);
break;
}
@ -410,7 +347,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("Invalid property name"));
}
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges);
new (zone()) RegExpCharacterClass(ranges, builder->flags());
builder->AddCharacterClass(cc);
} else {
// With /u, no identity escapes except for syntax characters
@ -443,7 +380,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddEmpty();
} else {
RegExpCapture* capture = GetCapture(index);
RegExpTree* atom = new (zone()) RegExpBackReference(capture);
RegExpTree* atom =
new (zone()) RegExpBackReference(capture, builder->flags());
builder->AddAtom(atom);
}
break;
@ -638,6 +576,143 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
}
RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
RegExpParserState* state) {
RegExpLookaround::Type lookaround_type = state->lookaround_type();
bool is_named_capture = false;
JSRegExp::Flags switch_on = JSRegExp::kNone;
JSRegExp::Flags switch_off = JSRegExp::kNone;
const ZoneVector<uc16>* capture_name = nullptr;
SubexpressionType subexpr_type = CAPTURE;
Advance();
if (current() == '?') {
switch (Next()) {
case ':':
Advance(2);
subexpr_type = GROUPING;
break;
case '=':
Advance(2);
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = POSITIVE_LOOKAROUND;
break;
case '!':
Advance(2);
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = NEGATIVE_LOOKAROUND;
break;
case '-':
case 'i':
case 's':
case 'm': {
if (!FLAG_regexp_mode_modifiers ||
(Next() == 's' && !FLAG_harmony_regexp_dotall)) {
ReportError(CStrVector("Invalid group"));
return nullptr;
}
Advance();
bool flags_sense = true; // Switching on flags.
while (subexpr_type != GROUPING) {
switch (current()) {
case '-':
if (!flags_sense) {
ReportError(CStrVector("Multiple dashes in flag group"));
return nullptr;
}
flags_sense = false;
Advance();
continue;
case 's':
if (!FLAG_harmony_regexp_dotall) {
ReportError(CStrVector("Invalid group"));
return nullptr;
}
// Fall through.
case 'i':
case 'm': {
JSRegExp::Flags bit = JSRegExp::kUnicode;
if (current() == 'i') bit = JSRegExp::kIgnoreCase;
if (current() == 'm') bit = JSRegExp::kMultiline;
if (current() == 's') bit = JSRegExp::kDotAll;
if (((switch_on | switch_off) & bit) != 0) {
ReportError(CStrVector("Repeated flag in flag group"));
return nullptr;
}
if (flags_sense) {
switch_on |= bit;
} else {
switch_off |= bit;
}
Advance();
continue;
}
case ')': {
Advance();
state->builder()
->FlushText(); // Flush pending text using old flags.
// These (?i)-style flag switches don't put us in a subexpression
// at all, they just modify the flags in the rest of the current
// subexpression.
JSRegExp::Flags flags =
(state->builder()->flags() | switch_on) & ~switch_off;
state->builder()->set_flags(flags);
return state;
}
case ':':
Advance();
subexpr_type = GROUPING; // Will break us out of the outer loop.
continue;
default:
ReportError(CStrVector("Invalid flag group"));
return nullptr;
}
}
break;
}
case '<':
Advance();
if (FLAG_harmony_regexp_lookbehind) {
if (Next() == '=') {
Advance(2);
lookaround_type = RegExpLookaround::LOOKBEHIND;
subexpr_type = POSITIVE_LOOKAROUND;
break;
} else if (Next() == '!') {
Advance(2);
lookaround_type = RegExpLookaround::LOOKBEHIND;
subexpr_type = NEGATIVE_LOOKAROUND;
break;
}
}
if (FLAG_harmony_regexp_named_captures) {
is_named_capture = true;
has_named_captures_ = true;
Advance();
break;
}
// Fall through.
default:
ReportError(CStrVector("Invalid group"));
return nullptr;
}
}
if (subexpr_type == CAPTURE) {
if (captures_started_ >= kMaxCaptures) {
ReportError(CStrVector("Too many captures"));
return nullptr;
}
captures_started_++;
if (is_named_capture) {
capture_name = ParseCaptureGroupName(CHECK_FAILED);
}
}
JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
// Store current state and begin new disjunction parsing.
return new (zone())
RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
capture_name, flags, zone());
}
#ifdef DEBUG
// Currently only used in an DCHECK.
@ -855,7 +930,8 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
if (state->IsInsideCaptureGroup(name)) {
builder->AddEmpty();
} else {
RegExpBackReference* atom = new (zone()) RegExpBackReference();
RegExpBackReference* atom =
new (zone()) RegExpBackReference(builder->flags());
atom->set_name(name);
builder->AddAtom(atom);
@ -1525,7 +1601,7 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
}
}
RegExpTree* RegExpParser::ParseCharacterClass() {
RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
static const char* kUnterminated = "Unterminated character class";
static const char* kRangeInvalid = "Invalid character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
@ -1539,7 +1615,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
}
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
bool add_unicode_case_equivalents = unicode() && ignore_case();
bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
while (has_more() && current() != ']') {
uc32 char_1, char_2;
bool is_class_1, is_class_2;
@ -1586,9 +1662,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
ranges->Add(CharacterRange::Everything(), zone());
is_negated = !is_negated;
}
RegExpCharacterClass::Flags flags;
if (is_negated) flags = RegExpCharacterClass::NEGATED;
return new (zone()) RegExpCharacterClass(ranges, flags);
RegExpCharacterClass::CharacterClassFlags character_class_flags;
if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
return new (zone())
RegExpCharacterClass(ranges, builder->flags(), character_class_flags);
}
@ -1622,11 +1699,10 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
return !parser.failed();
}
RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
: zone_(zone),
pending_empty_(false),
ignore_case_(ignore_case),
unicode_(unicode),
flags_(flags),
characters_(nullptr),
pending_surrogate_(kNoPendingSurrogate),
terms_(),
@ -1662,7 +1738,7 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
surrogate_pair.Add(lead_surrogate, zone());
surrogate_pair.Add(trail_surrogate, zone());
RegExpAtom* atom =
new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
AddAtom(atom);
}
} else {
@ -1686,7 +1762,8 @@ void RegExpBuilder::FlushCharacters() {
FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ != nullptr) {
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
RegExpTree* atom =
new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
characters_ = nullptr;
text_.Add(atom, zone());
LAST(ADD_ATOM);
@ -1762,7 +1839,7 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
AddTerm(new (zone()) RegExpCharacterClass(
CharacterRange::List(zone(), CharacterRange::Singleton(c))));
CharacterRange::List(zone(), CharacterRange::Singleton(c)), flags_));
}
@ -1880,11 +1957,11 @@ bool RegExpBuilder::AddQuantifierToAtom(
int num_chars = char_vector.length();
if (num_chars > 1) {
Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
text_.Add(new (zone()) RegExpAtom(prefix), zone());
text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
char_vector = char_vector.SubVector(num_chars - 1, num_chars);
}
characters_ = nullptr;
atom = new (zone()) RegExpAtom(char_vector);
atom = new (zone()) RegExpAtom(char_vector, flags_);
FlushText();
} else if (text_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM);

View File

@ -99,7 +99,7 @@ class BufferedZoneList {
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
public:
RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
void AddEscapedUnicodeCharacter(uc32 character);
@ -114,7 +114,14 @@ class RegExpBuilder : public ZoneObject {
void NewAlternative(); // '|'
bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
void FlushText();
RegExpTree* ToRegExp();
JSRegExp::Flags flags() const { return flags_; }
void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
private:
static const uc16 kNoPendingSurrogate = 0;
@ -122,18 +129,15 @@ class RegExpBuilder : public ZoneObject {
void AddTrailSurrogate(uc16 trail_surrogate);
void FlushPendingSurrogate();
void FlushCharacters();
void FlushText();
void FlushTerms();
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
bool NeedsDesugaringForIgnoreCase(uc32 c);
Zone* zone() const { return zone_; }
bool ignore_case() const { return ignore_case_; }
bool unicode() const { return unicode_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
Zone* zone_;
bool pending_empty_;
bool ignore_case_;
bool unicode_;
JSRegExp::Flags flags_;
ZoneList<uc16>* characters_;
uc16 pending_surrogate_;
BufferedZoneList<RegExpTree, 2> terms_;
@ -159,7 +163,6 @@ class RegExpParser BASE_EMBEDDED {
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
RegExpTree* ParseCharacterClass();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
@ -175,6 +178,7 @@ class RegExpParser BASE_EMBEDDED {
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
uc32 ParseOctalLiteral();
@ -205,10 +209,9 @@ class RegExpParser BASE_EMBEDDED {
int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
bool dotall() const { return dotall_; }
bool ignore_case() const { return ignore_case_; }
bool multiline() const { return multiline_; }
bool unicode() const { return unicode_; }
// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
// just read the initial flag value here.
bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
static bool IsSyntaxCharacterOrSlash(uc32 c);
@ -226,34 +229,35 @@ class RegExpParser BASE_EMBEDDED {
class RegExpParserState : public ZoneObject {
public:
// Push a state on the stack.
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index,
const ZoneVector<uc16>* capture_name, bool ignore_case,
bool unicode, Zone* zone)
const ZoneVector<uc16>* capture_name,
JSRegExp::Flags flags, Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
builder_(new (zone) RegExpBuilder(zone, flags)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index),
capture_name_(capture_name) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
RegExpParserState* previous_state() const { return previous_state_; }
bool IsSubexpression() { return previous_state_ != nullptr; }
// RegExpBuilder building this regexp's AST.
RegExpBuilder* builder() { return builder_; }
RegExpBuilder* builder() const { return builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() { return group_type_; }
SubexpressionType group_type() const { return group_type_; }
// Lookahead or Lookbehind.
RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() { return disjunction_capture_index_; }
int capture_index() const { return disjunction_capture_index_; }
// The name of the current sub-expression, if group_type is CAPTURE. Only
// used for named captures.
const ZoneVector<uc16>* capture_name() { return capture_name_; }
const ZoneVector<uc16>* capture_name() const { return capture_name_; }
bool IsNamedCapture() const { return capture_name_ != nullptr; }
@ -264,17 +268,17 @@ class RegExpParser BASE_EMBEDDED {
private:
// Linked list implementation of stack of states.
RegExpParserState* previous_state_;
RegExpParserState* const previous_state_;
// Builder for the stored disjunction.
RegExpBuilder* builder_;
RegExpBuilder* const builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
SubexpressionType group_type_;
const SubexpressionType group_type_;
// Stored read direction.
RegExpLookaround::Type lookaround_type_;
const RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
int disjunction_capture_index_;
const int disjunction_capture_index_;
// Stored capture name (if any).
const ZoneVector<uc16>* capture_name_;
const ZoneVector<uc16>* const capture_name_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
@ -291,6 +295,7 @@ class RegExpParser BASE_EMBEDDED {
bool ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state);
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
// After the initial parsing pass, patch corresponding RegExpCapture objects
// into all RegExpBackReferences. This is done after initial parsing in order
@ -323,10 +328,10 @@ class RegExpParser BASE_EMBEDDED {
ZoneList<RegExpBackReference*>* named_back_references_;
FlatStringReader* in_;
uc32 current_;
bool dotall_;
bool ignore_case_;
bool multiline_;
bool unicode_;
// These are the flags specified outside the regexp syntax ie after the
// terminating '/' or in the second argument to the constructor. The current
// flags are stored on the RegExpBuilder.
JSRegExp::Flags top_level_flags_;
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.

View File

@ -0,0 +1,144 @@
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --regexp-mode-modifiers --harmony-regexp-property
// These regexps are just grepped out of the other tests we already have
// and the syntax changed from out-of-line i flag to inline i flag.
assertFalse(/(?i)x(...)\1/.test("x\u03a3\u03c2\u03c3\u03c2\u03c3"));
assertTrue(/(?i)\u03a3((?:))\1\1x/.test("\u03c2x"), "backref-UC16-empty");
assertTrue(/(?i)x(?:...|(...))\1x/.test("x\u03a3\u03c2\u03c3x"));
assertTrue(/(?i)x(?:...|(...))\1x/.test("x\u03c2\u03c3\u039b\u03a3\u03c2\u03bbx"));
assertFalse(/(?i)\xc1/.test('fooA'), "quickcheck-uc16-pattern-ascii-subject");
assertFalse(/(?i)x(...)\1/.test("xaaaaa"), "backref-ASCII-short");
assertTrue(/(?i)x((?:))\1\1x/.test("xx"), "backref-ASCII-empty");
assertTrue(/(?i)x(?:...|(...))\1x/.test("xabcx"), "backref-ASCII-uncaptured");
assertTrue(/(?i)x(?:...|(...))\1x/.test("xabcABCx"), "backref-ASCII-backtrack");
assertFalse(/(?i)f/.test('b'));
assertFalse(/(?i)[abc]f/.test('x'));
assertFalse(/(?i)[abc]f/.test('xa'));
assertFalse(/(?i)[abc]</.test('x'));
assertFalse(/(?i)[abc]</.test('xa'));
assertFalse(/(?i)f[abc]/.test('x'));
assertFalse(/(?i)f[abc]/.test('xa'));
assertFalse(/(?i)<[abc]/.test('x'));
assertFalse(/(?i)<[abc]/.test('xa'));
assertFalse(/(?i)[\u00e5]/.test("\u212b"));
assertFalse(/(?i)[\u212b]/.test("\u00e5\u1234"));
assertFalse(/(?i)[\u212b]/.test("\u00e5"));
assertTrue(/(?i)\u00e5/u.test("\u00c5"));
assertTrue(/(?i)\u00e5/u.test("\u00e5"));
assertTrue(/(?i)\u00c5/u.test("\u00e5"));
assertTrue(/(?i)\u00c5/u.test("\u00c5"));
assertTrue(/(?i)\u212b/u.test("\u212b"));
assertFalse(/(?i)\u{10400}/.test("\u{10428}"));
assertFalse(/(?i)\u00df/u.test("SS"));
assertFalse(/(?i)\u1f8d/u.test("\u1f05\u03b9"));
assertTrue(/(?i)\u1f6b/u.test("\u1f63"));
assertFalse(/(?i)[\u00e5]/.test("\u212b"));
assertFalse(/(?i)[\u212b]/.test("\u00e5\u1234"));
assertFalse(/(?i)[\u212b]/.test("\u00e5"));
assertTrue(/(?i)\u00e5/u.test("\u212b"));
assertTrue(/(?i)\u00e5/u.test("\u00c5"));
assertTrue(/(?i)\u00e5/u.test("\u00e5"));
assertTrue(/(?i)\u00e5/u.test("\u212b"));
assertTrue(/(?i)\u00c5/u.test("\u00e5"));
assertTrue(/(?i)\u00c5/u.test("\u212b"));
assertTrue(/(?i)\u00c5/u.test("\u00c5"));
assertTrue(/(?i)\u212b/u.test("\u00c5"));
assertTrue(/(?i)\u212b/u.test("\u00e5"));
assertTrue(/(?i)\u212b/u.test("\u212b"));
assertFalse(/(?i)\u{10400}/.test("\u{10428}"));
assertTrue(/(?i)\u{10400}/u.test("\u{10428}"));
assertTrue(/(?i)\ud801\udc00/u.test("\u{10428}"));
assertTrue(/(?i)[\u{10428}]/u.test("\u{10400}"));
assertTrue(/(?i)[\ud801\udc28]/u.test("\u{10400}"));
assertFalse(/(?i)\u00df/u.test("SS"));
assertFalse(/(?i)\u1f8d/u.test("\u1f05\u03b9"));
assertTrue(/(?i)\u1f8d/u.test("\u1f85"));
assertTrue(/(?i)\u1f6b/u.test("\u1f63"));
assertTrue(/(?i)\u00e5\u00e5\u00e5/u.test("\u212b\u00e5\u00c5"));
assertTrue(/(?i)AB\u{10400}/u.test("ab\u{10428}"));
assertTrue(/(?i)\w/u.test('\u017F'));
assertTrue(/(?i)\w/u.test('\u212A'));
assertFalse(/(?i)\W/u.test('\u017F'));
assertFalse(/(?i)\W/u.test('\u212A'));
assertFalse(/(?i)\W/u.test('s'));
assertFalse(/(?i)\W/u.test('S'));
assertFalse(/(?i)\W/u.test('K'));
assertFalse(/(?i)\W/u.test('k'));
assertTrue(/(?i)[\w]/u.test('\u017F'));
assertTrue(/(?i)[\w]/u.test('\u212A'));
assertFalse(/(?i)[\W]/u.test('\u017F'));
assertFalse(/(?i)[\W]/u.test('\u212A'));
assertFalse(/(?i)[\W]/u.test('s'));
assertFalse(/(?i)[\W]/u.test('S'));
assertFalse(/(?i)[\W]/u.test('K'));
assertFalse(/(?i)[\W]/u.test('k'));
assertTrue(/(?i)\b/u.test('\u017F'));
assertTrue(/(?i)\b/u.test('\u212A'));
assertTrue(/(?i)\b/u.test('s'));
assertTrue(/(?i)\b/u.test('S'));
assertFalse(/(?i)\B/u.test('\u017F'));
assertFalse(/(?i)\B/u.test('\u212A'));
assertFalse(/(?i)\B/u.test('s'));
assertFalse(/(?i)\B/u.test('S'));
assertFalse(/(?i)\B/u.test('K'));
assertFalse(/(?i)\B/u.test('k'));
assertTrue(/(?i)\p{Ll}/u.test("a"));
assertTrue(/(?i)\p{Ll}/u.test("\u{118D4}"));
assertTrue(/(?i)\p{Ll}/u.test("A"));
assertTrue(/(?i)\p{Ll}/u.test("\u{118B4}"));
assertTrue(/(?i)\P{Ll}/u.test("a"));
assertTrue(/(?i)\P{Ll}/u.test("\u{118D4}"));
assertTrue(/(?i)\P{Ll}/u.test("A"));
assertTrue(/(?i)\P{Ll}/u.test("\u{118B4}"));
assertTrue(/(?i)\p{Lu}/u.test("a"));
assertTrue(/(?i)\p{Lu}/u.test("\u{118D4}"));
assertTrue(/(?i)\p{Lu}/u.test("A"));
assertTrue(/(?i)\p{Lu}/u.test("\u{118B4}"));
assertTrue(/(?i)\P{Lu}/u.test("a"));
assertTrue(/(?i)\P{Lu}/u.test("\u{118D4}"));
assertTrue(/(?i)\P{Lu}/u.test("A"));
assertTrue(/(?i)\P{Lu}/u.test("\u{118B4}"));
assertTrue(/(?i)[@-A]/.test("a"));
assertTrue(/(?i)[@-A]/.test("A"));
assertTrue(/(?i)[@-A]/.test("@"));
assertFalse(/(?i)[¿-À]/.test('¾'));
assertTrue(/(?i)[¿-À]/.test('¿'));
assertTrue(/(?i)[¿-À]/.test('À'));
assertTrue(/(?i)[¿-À]/.test('à'));
assertFalse(/(?i)[¿-À]/.test('á'));
assertFalse(/(?i)[¿-À]/.test('Á'));
assertFalse(/(?i)[¿-À]/.test('Á'));
assertFalse(/(?i)[Ö-×]/.test('Õ'));
assertTrue(/(?i)[Ö-×]/.test('Ö'));
assertTrue(/(?i)[Ö-×]/.test('ö'));
assertTrue(/(?i)[Ö-×]/.test('×'));
assertFalse(/(?i)[Ö-×]/.test('Ø'));
assertTrue(/(?i)(a[\u1000A])+/.test('aa'));
assertTrue(/(?i)\u0178/.test('\u00ff'));
assertTrue(/(?i)\u039c/.test('\u00b5'));
assertTrue(/(?i)\u039c/.test('\u03bc'));
assertTrue(/(?i)\u00b5/.test('\u03bc'));
assertTrue(/(?i)[\u039b-\u039d]/.test('\u00b5'));
assertFalse(/(?i)[^\u039b-\u039d]/.test('\u00b5'));
assertTrue(/(?m)^bar/.test("bar"));
assertTrue(/(?m)^bar/.test("bar\nfoo"));
assertTrue(/(?m)^bar/.test("foo\nbar"));
assertTrue(/(?m)bar$/.test("bar"));
assertTrue(/(?m)bar$/.test("bar\nfoo"));
assertTrue(/(?m)bar$/.test("foo\nbar"));
assertFalse(/(?m)^bxr/.test("bar"));
assertFalse(/(?m)^bxr/.test("bar\nfoo"));
assertFalse(/(?m)^bxr/.test("foo\nbar"));
assertFalse(/(?m)bxr$/.test("bar"));
assertFalse(/(?m)bxr$/.test("bar\nfoo"));
assertFalse(/(?m)bxr$/.test("foo\nbar"));
assertTrue(/(?m)^.*$/.test("\n"));
assertTrue(/(?m)^([()]|.)*$/.test("()\n()"));
assertTrue(/(?m)^([()]|.)*$/.test("()\n"));
assertTrue(/(?m)^[()]*$/.test("()\n."));

View File

@ -0,0 +1,27 @@
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --regexp-mode-modifiers --harmony-regexp-dotall
// S flag switches dotall mode on and off. Combine with i flag changes to test
// the parser.
test(/.(?s).(?i-s).a(?-i)a/);
test(/.(?s:.)(?i:.a)a/);
test(/.(?s).(?i-s).a(?-i)a/u);
test(/.(?s:.)(?i:.a)a/u);
// m flag makes no difference
test(/.(?sm).(?i-s).a(?-i)a/);
test(/.(?s:.)(?i:.a)a/);
test(/.(?sm).(?im-s).a(?m-i)a/u);
test(/.(?s:.)(?i:.a)a/u);
function test(re) {
assertTrue(re.test("...aa"));
assertTrue(re.test(".\n.aa"));
assertTrue(re.test(".\n.Aa"));
assertFalse(re.test("\n\n.Aa"));
assertFalse(re.test(".\n\nAa"));
assertFalse(re.test(".\n.AA"));
}

View File

@ -0,0 +1,196 @@
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --regexp-mode-modifiers
aa(/(a)(?i)\1/);
aa(/([az])(?i)\1/);
aa(/(a)(?i)\1/u);
aa(/([az])(?i)\1/u);
function aa(re) {
assertTrue(re.test("aa"));
assertTrue(re.test("aA"));
assertFalse(re.test("Aa"));
assertFalse(re.test("AA"));
}
aai(/(a)(?-i)\1/i);
aai(/([az])(?-i)\1/i);
aai(/(a)(?-i)\1/iu);
aai(/([az])(?-i)\1/iu);
function aai(re) {
assertTrue(re.test("aa"));
assertFalse(re.test("aA"));
assertFalse(re.test("Aa"));
assertTrue(re.test("AA"));
}
abcd(/a(b(?i)c)d/);
abcd(/[aw]([bx](?i)[cy])[dz]/);
abcd(/a(b(?i)c)d/u);
abcd(/[aw]([bx](?i)[cy])[dz]/u);
function abcd(re) {
assertTrue(re.test("abcd"));
assertFalse(re.test("abcD"));
assertTrue(re.test("abCd"));
assertFalse(re.test("abCD"));
assertFalse(re.test("aBcd"));
assertFalse(re.test("aBcD"));
assertFalse(re.test("aBCd"));
assertFalse(re.test("aBCD"));
assertFalse(re.test("Abcd"));
assertFalse(re.test("AbcD"));
assertFalse(re.test("AbCd"));
assertFalse(re.test("AbCD"));
assertFalse(re.test("ABcd"));
assertFalse(re.test("ABcD"));
assertFalse(re.test("ABCd"));
assertFalse(re.test("ABCD"));
}
abcdei(/a(b(?-i)c)d/i);
abcdei(/[aw]([bx](?-i)[cy])[dz]/i);
abcdei(/a(b(?-i)c)d/iu);
abcdei(/[aw]([bx](?-i)[cy])[dz]/iu);
function abcdei(re) {
assertTrue(re.test("abcd"));
assertTrue(re.test("abcD"));
assertFalse(re.test("abCd"));
assertFalse(re.test("abCD"));
assertTrue(re.test("aBcd"));
assertTrue(re.test("aBcD"));
assertFalse(re.test("aBCd"));
assertFalse(re.test("aBCD"));
assertTrue(re.test("Abcd"));
assertTrue(re.test("AbcD"));
assertFalse(re.test("AbCd"));
assertFalse(re.test("AbCD"));
assertTrue(re.test("ABcd"));
assertTrue(re.test("ABcD"));
assertFalse(re.test("ABCd"));
assertFalse(re.test("ABCD"));
}
abc(/a(?i:b)c/);
abc(/[ax](?i:[by])[cz]/);
abc(/a(?i:b)c/u);
abc(/[ax](?i:[by])[cz]/u);
function abc(re) {
assertTrue(re.test("abc"));
assertFalse(re.test("abC"));
assertTrue(re.test("aBc"));
assertFalse(re.test("aBC"));
assertFalse(re.test("Abc"));
assertFalse(re.test("AbC"));
assertFalse(re.test("ABc"));
assertFalse(re.test("ABC"));
}
abci(/a(?-i:b)c/i);
abci(/[ax](?-i:[by])[cz]/i);
abci(/a(?-i:b)c/iu);
abci(/[ax](?-i:[by])[cz]/iu);
function abci(re) {
assertTrue(re.test("abc"));
assertTrue(re.test("abC"));
assertFalse(re.test("aBc"));
assertFalse(re.test("aBC"));
assertTrue(re.test("Abc"));
assertTrue(re.test("AbC"));
assertFalse(re.test("ABc"));
assertFalse(re.test("ABC"));
}
assertThrows(() => new RegExp("foo(?i:"));
assertThrows(() => new RegExp("foo(?--i)"));
assertThrows(() => new RegExp("foo(?i-i)"));
assertThrows(() => new RegExp("foo(?m:"));
assertThrows(() => new RegExp("foo(?--m)"));
assertThrows(() => new RegExp("foo(?m-m)"));
// The following tests are taken from test/mjsunit/es7/regexp-ui-word.js but
// using inline syntax instead of the global /i flag.
assertTrue(/(?i)\w/u.test('\u017F'));
assertTrue(/(?i)\w/u.test('\u212A'));
assertFalse(/(?i)\W/u.test('\u017F'));
assertFalse(/(?i)\W/u.test('\u212A'));
assertFalse(/(?i)\W/u.test('s'));
assertFalse(/(?i)\W/u.test('S'));
assertFalse(/(?i)\W/u.test('K'));
assertFalse(/(?i)\W/u.test('k'));
assertTrue(/(?i)[\w]/u.test('\u017F'));
assertTrue(/(?i)[\w]/u.test('\u212A'));
assertFalse(/(?i)[\W]/u.test('\u017F'));
assertFalse(/(?i)[\W]/u.test('\u212A'));
assertFalse(/(?i)[\W]/u.test('s'));
assertFalse(/(?i)[\W]/u.test('S'));
assertFalse(/(?i)[\W]/u.test('K'));
assertFalse(/(?i)[\W]/u.test('k'));
assertTrue(/(?i)\b/u.test('\u017F'));
assertFalse(/(?i:)\b/u.test('\u017F'));
assertTrue(/(?i)\b/u.test('\u212A'));
assertFalse(/(?i:)\b/u.test('\u212A'));
assertTrue(/(?i)\b/u.test('s'));
assertTrue(/(?i)\b/u.test('S'));
assertFalse(/(?i)\B/u.test('\u017F'));
assertFalse(/(?i)\B/u.test('\u212A'));
assertFalse(/(?i)\B/u.test('s'));
assertFalse(/(?i)\B/u.test('S'));
assertFalse(/(?i)\B/u.test('K'));
assertFalse(/(?i)\B/u.test('k'));
var re = /^\s(?m)^.$\s(?-m)$/;
assertTrue(re.test("\n.\n"));
assertFalse(re.test(" .\n"));
assertFalse(re.test("\n. "));
assertFalse(re.test(" . "));
assertFalse(re.test("_\n.\n"));
assertFalse(re.test("\n.\n_"));
assertFalse(re.test("_\n.\n_"));
assertEquals(["abcd", "d"], /a.*?(.)(?i)\b/.exec('abcd\u017F cd'));
assertEquals(["abcd", "d"], /a.*?(.)(?i)\b/.exec('abcd\u212A cd'));
assertEquals(["abcd\u017F", "\u017F"], /a.*?(.)(?i)\b/u.exec('abcd\u017F cd'));
assertEquals(["abcd\u212A", "\u212A"], /a.*?(.)(?i)\b/u.exec('abcd\u212A cd'));
assertEquals(["a\u017F ", " "], /a.*?(?i)\B(.)/.exec('a\u017F '));
assertEquals(["a\u212A ", " "], /a.*?(?i)\B(.)/.exec('a\u212A '));
assertEquals(["a\u017F", "\u017F"], /a.*?(?i:\B)(.)/u.exec('a\u017F '));
assertEquals(["a\u212A", "\u212A"], /a.*?(?i:\B)(.)/u.exec('a\u212A '));
// Nested flags.
var res = [
/^a(?i:b(?-i:c(?i:d)e)f)g$/,
/^a(?i:b(?-i)c(?i)d(?-i)e(?i)f)g$/,
/^(?-i:a(?i:b(?-i:c(?i:d)e)f)g)$/i,
/^(?-i:a(?i:b(?-i)c(?i)d(?-i)e(?i)f)g)$/i,
];
for (var idx = 0; idx < res.length; idx++) {
var re = res[idx];
for (var i = 0; i < 128; i++) {
var s = (i & 1) ? "A" : "a";
s += (i & 2) ? "B" : "b";
s += (i & 4) ? "C" : "c";
s += (i & 8) ? "D" : "d";
s += (i & 16) ? "E" : "e";
s += (i & 32) ? "F" : "f";
s += (i & 64) ? "G" : "g";
if ((i & (1 | 4 | 16 | 64)) != 0) {
assertFalse(re.test(s), s);
} else {
assertTrue(re.test(s), s);
}
}
}