From 49999742b5c6e104f657a0cfed22df756be49123 Mon Sep 17 00:00:00 2001 From: Benedikt Meurer Date: Fri, 3 Nov 2017 10:28:52 +0000 Subject: [PATCH] Revert "RegExp: Add the ability to switch flags on and off within the regexp" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 68212c80c3f54aab83f30ccc6b15f899e36f3131. Reason for revert: https://build.chromium.org/p/client.v8/builders/V8%20Linux%20-%20noi18n%20-%20debug/builds/17200 Original change's description: > RegExp: Add the ability to switch flags on and off within the regexp > > R=​yangguo@chromium.org > > This is a reupload of https://chromium-review.googlesource.com/c/v8/v8/+/571746 > with a different user, since the other one was not allowed to commit to V8 any > more. > > Bug: > Change-Id: I6171afd44e514f6c934390faab6f9bee3953ac77 > Reviewed-on: https://chromium-review.googlesource.com/752522 > Commit-Queue: Jakob Gruber > Reviewed-by: Jakob Gruber > Cr-Commit-Position: refs/heads/master@{#49098} TBR=erik.corry@gmail.com,yangguo@chromium.org,erikcorry@chromium.org,jgruber@chromium.org Change-Id: I651c5618f09f43104af50cb1319ab7b49011573e No-Presubmit: true No-Tree-Checks: true No-Try: true Reviewed-on: https://chromium-review.googlesource.com/752802 Reviewed-by: Benedikt Meurer Commit-Queue: Benedikt Meurer Cr-Commit-Position: refs/heads/master@{#49099} --- src/flag-definitions.h | 1 - src/regexp/jsregexp.cc | 315 +++++++++--------- src/regexp/jsregexp.h | 71 ++-- src/regexp/regexp-ast.h | 49 +-- src/regexp/regexp-parser.cc | 281 ++++++---------- src/regexp/regexp-parser.h | 65 ++-- .../mjsunit/regexp-modifiers-autogenerated.js | 144 -------- test/mjsunit/regexp-modifiers-dotall.js | 27 -- test/mjsunit/regexp-modifiers.js | 196 ----------- 9 files changed, 333 insertions(+), 816 deletions(-) delete mode 100644 test/mjsunit/regexp-modifiers-autogenerated.js delete mode 100644 test/mjsunit/regexp-modifiers-dotall.js delete mode 100644 test/mjsunit/regexp-modifiers.js diff --git a/src/flag-definitions.h b/src/flag-definitions.h index 190c12014b..fe1c41e199 100644 --- a/src/flag-definitions.h +++ b/src/flag-definitions.h @@ -959,7 +959,6 @@ DEFINE_BOOL(serialization_statistics, false, // Regexp DEFINE_BOOL(regexp_optimization, true, "generate optimized regexp code") -DEFINE_BOOL(regexp_mode_modifiers, false, "enable inline flags in regexp.") // Testing flags test/cctest/test-{flags,api,serialization}.cc DEFINE_BOOL(testing_bool_flag, true, "testing_bool_flag") diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc index acaff99047..7befa2e81f 100644 --- a/src/regexp/jsregexp.cc +++ b/src/regexp/jsregexp.cc @@ -132,17 +132,17 @@ MaybeHandle RegExpImpl::Compile(Handle re, bool has_been_compiled = false; - if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) && + if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && + !(flags & JSRegExp::kSticky) && pattern->length() <= kPatternTooShortForBoyerMoore) { // Parse-tree is a single atom that is equal to the pattern. AtomCompile(re, pattern, flags, pattern); has_been_compiled = true; - } else if (parse_result.tree->IsAtom() && !IsSticky(flags) && - parse_result.capture_count == 0) { + } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) && + !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) { RegExpAtom* atom = parse_result.tree->AsAtom(); Vector atom_pattern = atom->data(); - if (!IgnoreCase(atom->flags()) && - atom_pattern.length() <= kPatternTooShortForBoyerMoore) { + if (atom_pattern.length() <= kPatternTooShortForBoyerMoore) { Handle atom_string; ASSIGN_RETURN_ON_EXCEPTION( isolate, atom_string, @@ -622,7 +622,7 @@ RegExpImpl::GlobalCache::GlobalCache(Handle regexp, } } - DCHECK(IsGlobal(regexp->GetFlags())); + DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal); if (!interpreted) { register_array_size_ = Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); @@ -653,7 +653,8 @@ RegExpImpl::GlobalCache::GlobalCache(Handle regexp, } int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { - if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() && + if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 && + last_index + 1 < subject_->length() && unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { // Advance over the surrogate pair. @@ -915,7 +916,7 @@ class FrequencyCollator { class RegExpCompiler { public: RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool is_one_byte); + JSRegExp::Flags flags, bool is_one_byte); int AllocateRegister() { if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { @@ -967,6 +968,13 @@ class RegExpCompiler { void SetRegExpTooBig() { reg_exp_too_big_ = true; } + inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } + inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } + // Both unicode and ignore_case flags are set. We need to use ICU to find + // the closure over case equivalents. + inline bool needs_unicode_case_equivalents() { + return unicode() && ignore_case(); + } inline bool one_byte() { return one_byte_; } inline bool optimize() { return optimize_; } inline void set_optimize(bool value) { optimize_ = value; } @@ -996,6 +1004,7 @@ class RegExpCompiler { std::vector* work_list_; int recursion_depth_; RegExpMacroAssembler* macro_assembler_; + JSRegExp::Flags flags_; bool one_byte_; bool reg_exp_too_big_; bool limiting_recursion_; @@ -1027,12 +1036,13 @@ static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { // Attempts to compile the regexp using an Irregexp code generator. Returns // a fixed array or a null handle depending on whether it succeeded. RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool one_byte) + JSRegExp::Flags flags, bool one_byte) : next_register_(2 * (capture_count + 1)), unicode_lookaround_stack_register_(kNoRegister), unicode_lookaround_position_register_(kNoRegister), work_list_(nullptr), recursion_depth_(0), + flags_(flags), one_byte_(one_byte), reg_exp_too_big_(false), limiting_recursion_(false), @@ -2493,7 +2503,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, QuickCheckDetails::Position* pos = details->positions(characters_filled_in); uc16 c = quarks[i]; - if (elm.atom()->ignore_case()) { + if (compiler->ignore_case()) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; int length = GetCaseIndependentLetters(isolate, c, compiler->one_byte(), chars); @@ -2701,16 +2711,18 @@ class VisitMarker { NodeInfo* info_; }; -RegExpNode* SeqRegExpNode::FilterOneByte(int depth) { + +RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; DCHECK(!info()->visited); VisitMarker marker(info()); - return FilterSuccessor(depth - 1); + return FilterSuccessor(depth - 1, ignore_case); } -RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { - RegExpNode* next = on_success_->FilterOneByte(depth - 1); + +RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { + RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); if (next == nullptr) return set_replacement(nullptr); on_success_ = next; return set_replacement(this); @@ -2733,7 +2745,8 @@ static bool RangesContainLatin1Equivalents(ZoneList* ranges) { return false; } -RegExpNode* TextNode::FilterOneByte(int depth) { + +RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; DCHECK(!info()->visited); @@ -2746,7 +2759,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) { for (int j = 0; j < quarks.length(); j++) { uint16_t c = quarks[j]; if (c <= String::kMaxOneByteCharCode) continue; - if (!IgnoreCase(elm.atom()->flags())) return set_replacement(nullptr); + if (!ignore_case) return set_replacement(nullptr); // Here, we need to check for characters whose upper and lower cases // are outside the Latin-1 range. uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); @@ -2768,41 +2781,42 @@ RegExpNode* TextNode::FilterOneByte(int depth) { ranges->at(0).from() == 0 && ranges->at(0).to() >= String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) - continue; + if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; return set_replacement(nullptr); } } else { if (range_count == 0 || ranges->at(0).from() > String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) - continue; + if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; return set_replacement(nullptr); } } } } - return FilterSuccessor(depth - 1); + return FilterSuccessor(depth - 1, ignore_case); } -RegExpNode* LoopChoiceNode::FilterOneByte(int depth) { + +RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; { VisitMarker marker(info()); - RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); + RegExpNode* continue_replacement = + continue_node_->FilterOneByte(depth - 1, ignore_case); // If we can't continue after the loop then there is no sense in doing the // loop. if (continue_replacement == nullptr) return set_replacement(nullptr); } - return ChoiceNode::FilterOneByte(depth - 1); + return ChoiceNode::FilterOneByte(depth - 1, ignore_case); } -RegExpNode* ChoiceNode::FilterOneByte(int depth) { + +RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2822,7 +2836,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) { RegExpNode* survivor = nullptr; for (int i = 0; i < choice_count; i++) { GuardedAlternative alternative = alternatives_->at(i); - RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); + RegExpNode* replacement = + alternative.node()->FilterOneByte(depth - 1, ignore_case); DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. if (replacement != nullptr) { alternatives_->at(i).set_node(replacement); @@ -2842,7 +2857,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) { new(zone()) ZoneList(surviving, zone()); for (int i = 0; i < choice_count; i++) { RegExpNode* replacement = - alternatives_->at(i).node()->FilterOneByte(depth - 1); + alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); if (replacement != nullptr) { alternatives_->at(i).set_node(replacement); new_alternatives->Add(alternatives_->at(i), zone()); @@ -2852,7 +2867,9 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) { return this; } -RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { + +RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, + bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2860,12 +2877,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { // Alternative 0 is the negative lookahead, alternative 1 is what comes // afterwards. RegExpNode* node = alternatives_->at(1).node(); - RegExpNode* replacement = node->FilterOneByte(depth - 1); + RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); if (replacement == nullptr) return set_replacement(nullptr); alternatives_->at(1).set_node(replacement); RegExpNode* neg_node = alternatives_->at(0).node(); - RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); + RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); // If the negative lookahead is always going to fail then // we don't need to check it. if (neg_replacement == nullptr) return set_replacement(replacement); @@ -3182,7 +3199,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextElement elm = elements()->at(i); int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; if (elm.text_type() == TextElement::ATOM) { - if (SkipPass(pass, elm.atom()->ignore_case())) continue; Vector quarks = elm.atom()->data(); for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { if (first_element_checked && i == 0 && j == 0) continue; @@ -3238,7 +3254,9 @@ int TextNode::Length() { return elm.cp_offset() + elm.length(); } -bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { + +bool TextNode::SkipPass(int int_pass, bool ignore_case) { + TextEmitPassType pass = static_cast(int_pass); if (ignore_case) { return pass == SIMPLE_CHARACTER_MATCH; } else { @@ -3246,33 +3264,32 @@ bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { } } + TextNode* TextNode::CreateForCharacterRanges(Zone* zone, ZoneList* ranges, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags) { + RegExpNode* on_success) { DCHECK_NOT_NULL(ranges); ZoneList* elms = new (zone) ZoneList(1, zone); - elms->Add( - TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, flags)), - zone); + elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)), + zone); return new (zone) TextNode(elms, read_backward, on_success); } + TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, CharacterRange trail, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags) { + RegExpNode* on_success) { ZoneList* lead_ranges = CharacterRange::List(zone, lead); ZoneList* trail_ranges = CharacterRange::List(zone, trail); ZoneList* elms = new (zone) ZoneList(2, zone); - elms->Add(TextElement::CharClass( - new (zone) RegExpCharacterClass(lead_ranges, flags)), - zone); - elms->Add(TextElement::CharClass( - new (zone) RegExpCharacterClass(trail_ranges, flags)), - zone); + elms->Add( + TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)), + zone); + elms->Add( + TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)), + zone); return new (zone) TextNode(elms, read_backward, on_success); } @@ -3306,15 +3323,27 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { // check that now. if (trace->characters_preloaded() == 1) { for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { - TextEmitPass(compiler, static_cast(pass), true, trace, - false, &bound_checked_to); + if (!SkipPass(pass, compiler->ignore_case())) { + TextEmitPass(compiler, + static_cast(pass), + true, + trace, + false, + &bound_checked_to); + } } first_elt_done = true; } for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { - TextEmitPass(compiler, static_cast(pass), false, trace, - first_elt_done, &bound_checked_to); + if (!SkipPass(pass, compiler->ignore_case())) { + TextEmitPass(compiler, + static_cast(pass), + false, + trace, + first_elt_done, + &bound_checked_to); + } } Trace successor_trace(*trace); @@ -3357,15 +3386,11 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { TextElement elm = elements()->at(i); if (elm.text_type() == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.char_class(); - if (IgnoreCase(cc->flags()) && - !NeedsUnicodeCaseEquivalents(cc->flags())) { - // None of the standard character classes is different in the case - // independent case and it slows us down if we don't know that. - if (cc->is_standard(zone())) continue; - ZoneList* ranges = cc->ranges(zone()); - CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, - is_one_byte); - } + // None of the standard character classes is different in the case + // independent case and it slows us down if we don't know that. + if (cc->is_standard(zone())) continue; + ZoneList* ranges = cc->ranges(zone()); + CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); } } } @@ -4328,9 +4353,9 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RecursionCheck rc(compiler); DCHECK_EQ(start_reg_ + 1, end_reg_); - if (IgnoreCase(flags_)) { + if (compiler->ignore_case()) { assembler->CheckNotBackReferenceIgnoreCase( - start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack()); + start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -4339,7 +4364,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { if (read_backward()) trace->set_at_start(Trace::UNKNOWN); // Check that the back reference does not end inside a surrogate pair. - if (IsUnicode(flags_) && !compiler->one_byte()) { + if (compiler->unicode() && !compiler->one_byte()) { assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); } on_success()->Emit(compiler, trace); @@ -4862,24 +4887,24 @@ void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); } + void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { ZoneList* bmp = splitter->bmp(); if (bmp == nullptr) return; - JSRegExp::Flags default_flags = JSRegExp::Flags(); result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( - compiler->zone(), bmp, compiler->read_backward(), on_success, - default_flags))); + compiler->zone(), bmp, compiler->read_backward(), on_success))); } + void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { ZoneList* non_bmp = splitter->non_bmp(); if (non_bmp == nullptr) return; + DCHECK(compiler->unicode()); DCHECK(!compiler->one_byte()); Zone* zone = compiler->zone(); - JSRegExp::Flags default_flags = JSRegExp::Flags(); CharacterRange::Canonicalize(non_bmp); for (int i = 0; i < non_bmp->length(); i++) { // Match surrogate pair. @@ -4899,7 +4924,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Singleton(from_l), CharacterRange::Range(from_t, to_t), compiler->read_backward(), - on_success, default_flags))); + on_success))); } else { if (from_t != kTrailSurrogateStart) { // Add [from_l][from_t-\udfff] @@ -4907,7 +4932,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Singleton(from_l), CharacterRange::Range(from_t, kTrailSurrogateEnd), - compiler->read_backward(), on_success, default_flags))); + compiler->read_backward(), on_success))); from_l++; } if (to_t != kTrailSurrogateEnd) { @@ -4916,7 +4941,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Singleton(to_l), CharacterRange::Range(kTrailSurrogateStart, to_t), - compiler->read_backward(), on_success, default_flags))); + compiler->read_backward(), on_success))); to_l--; } if (from_l <= to_l) { @@ -4925,47 +4950,49 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Range(from_l, to_l), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), - compiler->read_backward(), on_success, default_flags))); + compiler->read_backward(), on_success))); } } } } + RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( RegExpCompiler* compiler, ZoneList* lookbehind, - ZoneList* match, RegExpNode* on_success, bool read_backward, - JSRegExp::Flags flags) { + ZoneList* match, RegExpNode* on_success, + bool read_backward) { Zone* zone = compiler->zone(); RegExpNode* match_node = TextNode::CreateForCharacterRanges( - zone, match, read_backward, on_success, flags); + zone, match, read_backward, on_success); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpLookaround::Builder lookaround(false, match_node, stack_register, position_register); RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookbehind, !read_backward, lookaround.on_match_success(), flags); + zone, lookbehind, !read_backward, lookaround.on_match_success()); return lookaround.ForMatch(negative_match); } + RegExpNode* MatchAndNegativeLookaroundInReadDirection( RegExpCompiler* compiler, ZoneList* match, ZoneList* lookahead, RegExpNode* on_success, - bool read_backward, JSRegExp::Flags flags) { + bool read_backward) { Zone* zone = compiler->zone(); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpLookaround::Builder lookaround(false, on_success, stack_register, position_register); RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookahead, read_backward, lookaround.on_match_success(), flags); + zone, lookahead, read_backward, lookaround.on_match_success()); return TextNode::CreateForCharacterRanges( - zone, match, read_backward, lookaround.ForMatch(negative_match), flags); + zone, match, read_backward, lookaround.ForMatch(negative_match)); } + void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { - JSRegExp::Flags default_flags = JSRegExp::Flags(); ZoneList* lead_surrogates = splitter->lead_surrogates(); if (lead_surrogates == nullptr) return; Zone* zone = compiler->zone(); @@ -4978,22 +5005,20 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, // Reading backward. Assert that reading forward, there is no trail // surrogate, and then backward match the lead surrogate. match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, trail_surrogates, lead_surrogates, on_success, true, - default_flags); + compiler, trail_surrogates, lead_surrogates, on_success, true); } else { // Reading forward. Forward match the lead surrogate and assert that // no trail surrogate follows. match = MatchAndNegativeLookaroundInReadDirection( - compiler, lead_surrogates, trail_surrogates, on_success, false, - default_flags); + compiler, lead_surrogates, trail_surrogates, on_success, false); } result->AddAlternative(GuardedAlternative(match)); } + void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { - JSRegExp::Flags default_flags = JSRegExp::Flags(); ZoneList* trail_surrogates = splitter->trail_surrogates(); if (trail_surrogates == nullptr) return; Zone* zone = compiler->zone(); @@ -5006,14 +5031,12 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, // Reading backward. Backward match the trail surrogate and assert that no // lead surrogate precedes it. match = MatchAndNegativeLookaroundInReadDirection( - compiler, trail_surrogates, lead_surrogates, on_success, true, - default_flags); + compiler, trail_surrogates, lead_surrogates, on_success, true); } else { // Reading forward. Assert that reading backward, there is no lead // surrogate, and then forward match the trail surrogate. match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, lead_surrogates, trail_surrogates, on_success, false, - default_flags); + compiler, lead_surrogates, trail_surrogates, on_success, false); } result->AddAlternative(GuardedAlternative(match)); } @@ -5029,9 +5052,7 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, // the associated trail surrogate. ZoneList* range = CharacterRange::List( zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - return TextNode::CreateForCharacterRanges(zone, range, false, on_success, - default_flags); + return TextNode::CreateForCharacterRanges(zone, range, false, on_success); } void AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone) { @@ -5072,10 +5093,10 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, set_.Canonicalize(); Zone* zone = compiler->zone(); ZoneList* ranges = this->ranges(zone); - if (NeedsUnicodeCaseEquivalents(flags_)) { + if (compiler->needs_unicode_case_equivalents()) { AddUnicodeCaseEquivalents(ranges, zone); } - if (IsUnicode(flags_) && !compiler->one_byte() && + if (compiler->unicode() && !compiler->one_byte() && !contains_split_surrogate()) { if (is_negated()) { ZoneList* negated = @@ -5084,10 +5105,9 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, ranges = negated; } if (ranges->length() == 0) { - JSRegExp::Flags default_flags = JSRegExp::Flags(); ranges->Add(CharacterRange::Everything(), zone); RegExpCharacterClass* fail = - new (zone) RegExpCharacterClass(ranges, default_flags, NEGATED); + new (zone) RegExpCharacterClass(ranges, NEGATED); return new (zone) TextNode(fail, compiler->read_backward(), on_success); } if (standard_type() == '*') { @@ -5162,12 +5182,10 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { // i is length or it is the index of an atom. if (i == length) break; int first_atom = i; - JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags(); i++; while (i < length) { RegExpTree* alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - if (alternative->AsAtom()->flags() != flags) break; i++; } // Sort atoms to get ones with common prefixes together. @@ -5179,7 +5197,7 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { DCHECK_LT(first_atom, alternatives->length()); DCHECK_LE(i, alternatives->length()); DCHECK_LE(first_atom, i); - if (IgnoreCase(flags)) { + if (compiler->ignore_case()) { unibrow::Mapping* canonicalize = compiler->isolate()->regexp_macro_assembler_canonicalize(); auto compare_closure = @@ -5211,8 +5229,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { i++; continue; } - RegExpAtom* const atom = alternative->AsAtom(); - JSRegExp::Flags flags = atom->flags(); + RegExpAtom* atom = alternative->AsAtom(); unibrow::uchar common_prefix = atom->data().at(0); int first_with_prefix = i; int prefix_length = atom->length(); @@ -5220,11 +5237,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { while (i < length) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - RegExpAtom* const atom = alternative->AsAtom(); - if (atom->flags() != flags) break; + atom = alternative->AsAtom(); unibrow::uchar new_prefix = atom->data().at(0); if (new_prefix != common_prefix) { - if (!IgnoreCase(flags)) break; + if (!compiler->ignore_case()) break; unibrow::Mapping* canonicalize = compiler->isolate()->regexp_macro_assembler_canonicalize(); new_prefix = Canonical(canonicalize, new_prefix); @@ -5241,7 +5257,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { // common prefix if the terms were similar or presorted in the input. // Find out how long the common prefix is. int run_length = i - first_with_prefix; - RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom(); + atom = alternatives->at(first_with_prefix)->AsAtom(); for (int j = 1; j < run_length && prefix_length > 1; j++) { RegExpAtom* old_atom = alternatives->at(j + first_with_prefix)->AsAtom(); @@ -5252,8 +5268,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { } } } - RegExpAtom* prefix = new (zone) - RegExpAtom(atom->data().SubVector(0, prefix_length), flags); + RegExpAtom* prefix = + new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); ZoneList* pair = new (zone) ZoneList(2, zone); pair->Add(prefix, zone); ZoneList* suffixes = @@ -5266,8 +5282,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { suffixes->Add(new (zone) RegExpEmpty(), zone); } else { RegExpTree* suffix = new (zone) RegExpAtom( - old_atom->data().SubVector(prefix_length, old_atom->length()), - flags); + old_atom->data().SubVector(prefix_length, old_atom->length())); suffixes->Add(suffix, zone); } } @@ -5290,6 +5305,7 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( Zone* zone = compiler->zone(); ZoneList* alternatives = this->alternatives(); int length = alternatives->length(); + const bool unicode = compiler->unicode(); int write_posn = 0; int i = 0; @@ -5300,28 +5316,24 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( i++; continue; } - RegExpAtom* const atom = alternative->AsAtom(); + RegExpAtom* atom = alternative->AsAtom(); if (atom->length() != 1) { alternatives->at(write_posn++) = alternatives->at(i); i++; continue; } - JSRegExp::Flags flags = atom->flags(); - DCHECK_IMPLIES(IsUnicode(flags), + DCHECK_IMPLIES(unicode, !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); bool contains_trail_surrogate = unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); int first_in_run = i; i++; - // Find a run of single-character atom alternatives that have identical - // flags (case independence and unicode-ness). while (i < length) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - RegExpAtom* const atom = alternative->AsAtom(); + atom = alternative->AsAtom(); if (atom->length() != 1) break; - if (atom->flags() != flags) break; - DCHECK_IMPLIES(IsUnicode(flags), + DCHECK_IMPLIES(unicode, !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); contains_trail_surrogate |= unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); @@ -5337,12 +5349,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( DCHECK_EQ(old_atom->length(), 1); ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); } - RegExpCharacterClass::CharacterClassFlags character_class_flags; - if (IsUnicode(flags) && contains_trail_surrogate) { - character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; + RegExpCharacterClass::Flags flags; + if (unicode && contains_trail_surrogate) { + flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; } alternatives->at(write_posn++) = - new (zone) RegExpCharacterClass(ranges, flags, character_class_flags); + new (zone) RegExpCharacterClass(ranges, flags); } else { // Just copy any trivial alternatives. for (int j = first_in_run; j < i; j++) { @@ -5574,9 +5586,8 @@ namespace { // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, RegExpNode* on_success, - RegExpAssertion::AssertionType type, - JSRegExp::Flags flags) { - DCHECK(NeedsUnicodeCaseEquivalents(flags)); + RegExpAssertion::AssertionType type) { + DCHECK(compiler->needs_unicode_case_equivalents()); Zone* zone = compiler->zone(); ZoneList* word_range = new (zone) ZoneList(2, zone); @@ -5594,13 +5605,13 @@ RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, stack_register, position_register); RegExpNode* backward = TextNode::CreateForCharacterRanges( - zone, word_range, true, lookbehind.on_match_success(), flags); + zone, word_range, true, lookbehind.on_match_success()); // Look to the right. RegExpLookaround::Builder lookahead(lookahead_for_word, lookbehind.ForMatch(backward), stack_register, position_register); RegExpNode* forward = TextNode::CreateForCharacterRanges( - zone, word_range, false, lookahead.on_match_success(), flags); + zone, word_range, false, lookahead.on_match_success()); result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); } return result; @@ -5618,14 +5629,13 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, case START_OF_INPUT: return AssertionNode::AtStart(on_success); case BOUNDARY: - return NeedsUnicodeCaseEquivalents(flags_) - ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, - flags_) + return compiler->needs_unicode_case_equivalents() + ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY) : AssertionNode::AtBoundary(on_success); case NON_BOUNDARY: - return NeedsUnicodeCaseEquivalents(flags_) + return compiler->needs_unicode_case_equivalents() ? BoundaryAssertionAsLookaround(compiler, on_success, - NON_BOUNDARY, flags_) + NON_BOUNDARY) : AssertionNode::AtNonBoundary(on_success); case END_OF_INPUT: return AssertionNode::AtEnd(on_success); @@ -5641,9 +5651,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, ZoneList* newline_ranges = new(zone) ZoneList(3, zone); CharacterRange::AddClassEscape('n', newline_ranges, false, zone); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - RegExpCharacterClass* newline_atom = - new (zone) RegExpCharacterClass('n', default_flags); + RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); TextNode* newline_matcher = new (zone) TextNode( newline_atom, false, ActionNode::PositiveSubmatchSuccess( stack_pointer_register, position_register, @@ -5673,7 +5681,7 @@ RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { return new (compiler->zone()) BackReferenceNode(RegExpCapture::StartRegister(index()), - RegExpCapture::EndRegister(index()), flags_, + RegExpCapture::EndRegister(index()), compiler->read_backward(), on_success); } @@ -6329,7 +6337,9 @@ void TextNode::CalculateOffsets() { void Analysis::VisitText(TextNode* that) { - that->MakeCaseIndependent(isolate(), is_one_byte_); + if (ignore_case()) { + that->MakeCaseIndependent(isolate(), is_one_byte_); + } EnsureAnalyzed(that->on_success()); if (!has_failed()) { that->CalculateOffsets(); @@ -6440,7 +6450,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, return; } uc16 character = atom->data()[j]; - if (IgnoreCase(atom->flags())) { + if (bm->compiler()->ignore_case()) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; int length = GetCaseIndependentLetters( isolate, character, bm->max_char() == String::kMaxOneByteCharCode, @@ -6592,9 +6602,9 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) { target->Accept(this); } + RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, - RegExpNode* on_success, - JSRegExp::Flags flags) { + RegExpNode* on_success) { // If the regexp matching starts within a surrogate pair, step back // to the lead surrogate and start matching from there. DCHECK(!compiler->read_backward()); @@ -6609,11 +6619,11 @@ RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpNode* step_back = TextNode::CreateForCharacterRanges( - zone, lead_surrogates, true, on_success, flags); + zone, lead_surrogates, true, on_success); RegExpLookaround::Builder builder(true, step_back, stack_register, position_register); RegExpNode* match_trail = TextNode::CreateForCharacterRanges( - zone, trail_surrogates, false, builder.on_match_success(), flags); + zone, trail_surrogates, false, builder.on_match_success()); optional_step_back->AddAlternative( GuardedAlternative(builder.ForMatch(match_trail))); @@ -6630,10 +6640,12 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { return IrregexpRegExpTooBig(isolate); } - bool is_sticky = IsSticky(flags); - bool is_global = IsGlobal(flags); - bool is_unicode = IsUnicode(flags); - RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte); + bool ignore_case = flags & JSRegExp::kIgnoreCase; + bool is_sticky = flags & JSRegExp::kSticky; + bool is_global = flags & JSRegExp::kGlobal; + bool is_unicode = flags & JSRegExp::kUnicode; + RegExpCompiler compiler(isolate, zone, data->capture_count, flags, + is_one_byte); if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); @@ -6661,11 +6673,9 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( if (!is_start_anchored && !is_sticky) { // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning or sticky. - JSRegExp::Flags default_flags = JSRegExp::Flags(); RegExpNode* loop_node = RegExpQuantifier::ToNode( - 0, RegExpTree::kInfinity, false, - new (zone) RegExpCharacterClass('*', default_flags), &compiler, - captured_body, data->contains_anchor); + 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), + &compiler, captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start @@ -6673,27 +6683,26 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); first_step_node->AddAlternative(GuardedAlternative(captured_body)); first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( - new (zone) RegExpCharacterClass('*', default_flags), false, - loop_node))); + new (zone) RegExpCharacterClass('*'), false, loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. if (node != nullptr) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); } - } else if (is_unicode && (is_global || is_sticky)) { - node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags); + } else if (compiler.unicode() && (is_global || is_sticky)) { + node = OptionallyStepBackToLeadSurrogate(&compiler, node); } if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - Analysis analysis(isolate, is_one_byte); + Analysis analysis(isolate, flags, is_one_byte); analysis.EnsureAnalyzed(node); if (analysis.has_failed()) { const char* error_message = analysis.error_message(); diff --git a/src/regexp/jsregexp.h b/src/regexp/jsregexp.h index 021c59d3e4..d485045ff0 100644 --- a/src/regexp/jsregexp.h +++ b/src/regexp/jsregexp.h @@ -21,36 +21,6 @@ class RegExpNode; class RegExpTree; class BoyerMooreLookahead; -inline bool IgnoreCase(JSRegExp::Flags flags) { - return (flags & JSRegExp::kIgnoreCase) != 0; -} - -inline bool IsUnicode(JSRegExp::Flags flags) { - return (flags & JSRegExp::kUnicode) != 0; -} - -inline bool IsSticky(JSRegExp::Flags flags) { - return (flags & JSRegExp::kSticky) != 0; -} - -inline bool IsGlobal(JSRegExp::Flags flags) { - return (flags & JSRegExp::kGlobal) != 0; -} - -inline bool DotAll(JSRegExp::Flags flags) { - return (flags & JSRegExp::kDotAll) != 0; -} - -inline bool Multiline(JSRegExp::Flags flags) { - return (flags & JSRegExp::kMultiline) != 0; -} - -inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { - // Both unicode and ignore_case flags are set. We need to use ICU to find - // the closure over case equivalents. - return IsUnicode(flags) && IgnoreCase(flags); -} - class RegExpImpl { public: // Whether V8 is compiled with native regexp support or not. @@ -525,7 +495,9 @@ class RegExpNode: public ZoneObject { // If we know that the input is one-byte then there are some nodes that can // never match. This method returns a node that can be substituted for // itself, or nullptr if the node can never match. - virtual RegExpNode* FilterOneByte(int depth) { return this; } + virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) { + return this; + } // Helper for FilterOneByte. RegExpNode* replacement() { DCHECK(info()->replacement_calculated); @@ -597,7 +569,7 @@ class SeqRegExpNode: public RegExpNode { : RegExpNode(on_success->zone()), on_success_(on_success) { } RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } - virtual RegExpNode* FilterOneByte(int depth); + virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start) { on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); @@ -605,7 +577,7 @@ class SeqRegExpNode: public RegExpNode { } protected: - RegExpNode* FilterSuccessor(int depth); + RegExpNode* FilterSuccessor(int depth, bool ignore_case); private: RegExpNode* on_success_; @@ -710,15 +682,13 @@ class TextNode: public SeqRegExpNode { static TextNode* CreateForCharacterRanges(Zone* zone, ZoneList* ranges, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags); + RegExpNode* on_success); // Create TextNode for a surrogate pair with a range given for the // lead and the trail surrogate each. static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead, CharacterRange trail, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags); + RegExpNode* on_success); virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); @@ -735,7 +705,7 @@ class TextNode: public SeqRegExpNode { virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start); void CalculateOffsets(); - virtual RegExpNode* FilterOneByte(int depth); + virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); private: enum TextEmitPassType { @@ -745,7 +715,7 @@ class TextNode: public SeqRegExpNode { CASE_CHARACTER_MATCH, // Case-independent single character check. CHARACTER_CLASS_MATCH // Character class. }; - static bool SkipPass(TextEmitPassType pass, bool ignore_case); + static bool SkipPass(int pass, bool ignore_case); static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; static const int kLastPass = CHARACTER_CLASS_MATCH; void TextEmitPass(RegExpCompiler* compiler, @@ -809,12 +779,11 @@ class AssertionNode: public SeqRegExpNode { class BackReferenceNode: public SeqRegExpNode { public: - BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags, - bool read_backward, RegExpNode* on_success) + BackReferenceNode(int start_reg, int end_reg, bool read_backward, + RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg), - flags_(flags), read_backward_(read_backward) {} virtual void Accept(NodeVisitor* visitor); int start_register() { return start_reg_; } @@ -836,7 +805,6 @@ class BackReferenceNode: public SeqRegExpNode { private: int start_reg_; int end_reg_; - JSRegExp::Flags flags_; bool read_backward_; }; @@ -961,7 +929,7 @@ class ChoiceNode: public RegExpNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return true; } - virtual RegExpNode* FilterOneByte(int depth); + virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); virtual bool read_backward() { return false; } protected: @@ -1033,7 +1001,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return !is_first; } - virtual RegExpNode* FilterOneByte(int depth); + virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); }; @@ -1060,7 +1028,7 @@ class LoopChoiceNode: public ChoiceNode { bool body_can_be_zero_length() { return body_can_be_zero_length_; } virtual bool read_backward() { return read_backward_; } virtual void Accept(NodeVisitor* visitor); - virtual RegExpNode* FilterOneByte(int depth); + virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); private: // AddAlternative is made private for loop nodes because alternatives @@ -1467,8 +1435,11 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) // +-------+ +------------+ class Analysis: public NodeVisitor { public: - Analysis(Isolate* isolate, bool is_one_byte) - : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {} + Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte) + : isolate_(isolate), + flags_(flags), + is_one_byte_(is_one_byte), + error_message_(nullptr) {} void EnsureAnalyzed(RegExpNode* node); #define DECLARE_VISIT(Type) \ @@ -1488,8 +1459,12 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) Isolate* isolate() const { return isolate_; } + bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } + bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } + private: Isolate* isolate_; + JSRegExp::Flags flags_; bool is_one_byte_; const char* error_message_; diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h index e60621f8b6..14e43b65a6 100644 --- a/src/regexp/regexp-ast.h +++ b/src/regexp/regexp-ast.h @@ -6,7 +6,6 @@ #define V8_REGEXP_REGEXP_AST_H_ #include "src/objects.h" -#include "src/objects/js-regexp.h" #include "src/objects/string.h" #include "src/utils.h" #include "src/zone/zone-containers.h" @@ -145,7 +144,7 @@ class CharacterSet final BASE_EMBEDDED { explicit CharacterSet(ZoneList* ranges) : ranges_(ranges), standard_set_type_(0) {} ZoneList* ranges(Zone* zone); - uc16 standard_set_type() const { return standard_set_type_; } + uc16 standard_set_type() { return standard_set_type_; } void set_standard_set_type(uc16 special_set_type) { standard_set_type_ = special_set_type; } @@ -275,8 +274,7 @@ class RegExpAssertion final : public RegExpTree { BOUNDARY, NON_BOUNDARY }; - RegExpAssertion(AssertionType type, JSRegExp::Flags flags) - : assertion_type_(type), flags_(flags) {} + explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpAssertion* AsAssertion() override; @@ -288,8 +286,7 @@ class RegExpAssertion final : public RegExpTree { AssertionType assertion_type() { return assertion_type_; } private: - const AssertionType assertion_type_; - const JSRegExp::Flags flags_; + AssertionType assertion_type_; }; @@ -303,18 +300,12 @@ class RegExpCharacterClass final : public RegExpTree { NEGATED = 1 << 0, CONTAINS_SPLIT_SURROGATE = 1 << 1, }; - typedef base::Flags CharacterClassFlags; + typedef base::Flags Flags; - RegExpCharacterClass( - ZoneList* ranges, JSRegExp::Flags flags, - CharacterClassFlags character_class_flags = CharacterClassFlags()) - : set_(ranges), - flags_(flags), - character_class_flags_(character_class_flags) {} - RegExpCharacterClass(uc16 type, JSRegExp::Flags flags) - : set_(type), - flags_(flags), - character_class_flags_(CharacterClassFlags()) {} + explicit RegExpCharacterClass(ZoneList* ranges, + Flags flags = Flags()) + : set_(ranges), flags_(flags) {} + explicit RegExpCharacterClass(uc16 type) : set_(type), flags_(0) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpCharacterClass* AsCharacterClass() override; @@ -341,25 +332,22 @@ class RegExpCharacterClass final : public RegExpTree { // D : non-ASCII digit // . : non-newline // * : All characters, for advancing unanchored regexp - uc16 standard_type() const { return set_.standard_set_type(); } + uc16 standard_type() { return set_.standard_set_type(); } ZoneList* ranges(Zone* zone) { return set_.ranges(zone); } - bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } - JSRegExp::Flags flags() const { return flags_; } + bool is_negated() const { return (flags_ & NEGATED) != 0; } bool contains_split_surrogate() const { - return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; + return (flags_ & CONTAINS_SPLIT_SURROGATE) != 0; } private: CharacterSet set_; - const JSRegExp::Flags flags_; - const CharacterClassFlags character_class_flags_; + const Flags flags_; }; class RegExpAtom final : public RegExpTree { public: - explicit RegExpAtom(Vector data, JSRegExp::Flags flags) - : data_(data), flags_(flags) {} + explicit RegExpAtom(Vector data) : data_(data) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpAtom* AsAtom() override; @@ -370,12 +358,9 @@ class RegExpAtom final : public RegExpTree { void AppendToText(RegExpText* text, Zone* zone) override; Vector data() { return data_; } int length() { return data_.length(); } - JSRegExp::Flags flags() const { return flags_; } - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } private: Vector data_; - const JSRegExp::Flags flags_; }; @@ -547,10 +532,9 @@ class RegExpLookaround final : public RegExpTree { class RegExpBackReference final : public RegExpTree { public: - explicit RegExpBackReference(JSRegExp::Flags flags) - : capture_(nullptr), name_(nullptr), flags_(flags) {} - RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags) - : capture_(capture), name_(nullptr), flags_(flags) {} + RegExpBackReference() : capture_(nullptr), name_(nullptr) {} + explicit RegExpBackReference(RegExpCapture* capture) + : capture_(capture), name_(nullptr) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpBackReference* AsBackReference() override; @@ -568,7 +552,6 @@ class RegExpBackReference final : public RegExpTree { private: RegExpCapture* capture_; const ZoneVector* name_; - const JSRegExp::Flags flags_; }; diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc index 832d4de2c2..7a8033a2d8 100644 --- a/src/regexp/regexp-parser.cc +++ b/src/regexp/regexp-parser.cc @@ -31,7 +31,10 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle* error, named_back_references_(nullptr), in_(in), current_(kEndMarker), - top_level_flags_(flags), + dotall_(flags & JSRegExp::kDotAll), + ignore_case_(flags & JSRegExp::kIgnoreCase), + multiline_(flags & JSRegExp::kMultiline), + unicode_(flags & JSRegExp::kUnicode), next_pos_(0), captures_started_(0), capture_count_(0), @@ -41,6 +44,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle* error, is_scanned_for_captures_(false), has_named_captures_(false), failed_(false) { + DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); Advance(); } @@ -179,7 +183,7 @@ RegExpTree* RegExpParser::ParsePattern() { RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, - 0, nullptr, top_level_flags_, zone()); + 0, nullptr, ignore_case(), unicode(), zone()); RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); @@ -249,12 +253,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("Nothing to repeat")); case '^': { Advance(); - if (builder->multiline()) { - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::START_OF_LINE, builder->flags())); + if (multiline()) { + builder->AddAssertion( + new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); } else { - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::START_OF_INPUT, builder->flags())); + builder->AddAssertion( + new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); set_contains_anchor(); } continue; @@ -262,10 +266,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '$': { Advance(); RegExpAssertion::AssertionType assertion_type = - builder->multiline() ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; - builder->AddAssertion( - new (zone()) RegExpAssertion(assertion_type, builder->flags())); + multiline() ? RegExpAssertion::END_OF_LINE + : RegExpAssertion::END_OF_INPUT; + builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); continue; } case '.': { @@ -273,7 +276,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { ZoneList* ranges = new (zone()) ZoneList(2, zone()); - if (builder->dotall()) { + if (dotall()) { // Everything. DCHECK(FLAG_harmony_regexp_dotall); CharacterRange::AddClassEscape('*', ranges, false, zone()); @@ -282,18 +285,78 @@ RegExpTree* RegExpParser::ParseDisjunction() { CharacterRange::AddClassEscape('.', ranges, false, zone()); } - RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(ranges, builder->flags()); + RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges); builder->AddCharacterClass(cc); break; } case '(': { - state = ParseOpenParenthesis(state CHECK_FAILED); + SubexpressionType subexpr_type = CAPTURE; + RegExpLookaround::Type lookaround_type = state->lookaround_type(); + bool is_named_capture = false; + Advance(); + if (current() == '?') { + switch (Next()) { + case ':': + subexpr_type = GROUPING; + Advance(2); + break; + case '=': + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = POSITIVE_LOOKAROUND; + Advance(2); + break; + case '!': + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = NEGATIVE_LOOKAROUND; + Advance(2); + break; + case '<': + Advance(); + if (FLAG_harmony_regexp_lookbehind) { + if (Next() == '=') { + subexpr_type = POSITIVE_LOOKAROUND; + lookaround_type = RegExpLookaround::LOOKBEHIND; + Advance(2); + break; + } else if (Next() == '!') { + subexpr_type = NEGATIVE_LOOKAROUND; + lookaround_type = RegExpLookaround::LOOKBEHIND; + Advance(2); + break; + } + } + if (FLAG_harmony_regexp_named_captures) { + has_named_captures_ = true; + is_named_capture = true; + Advance(); + break; + } + // Fall through. + default: + return ReportError(CStrVector("Invalid group")); + } + } + + const ZoneVector* capture_name = nullptr; + if (subexpr_type == CAPTURE) { + if (captures_started_ >= kMaxCaptures) { + return ReportError(CStrVector("Too many captures")); + } + captures_started_++; + + if (is_named_capture) { + capture_name = ParseCaptureGroupName(CHECK_FAILED); + } + } + // Store current state and begin new disjunction parsing. + state = new (zone()) RegExpParserState( + state, subexpr_type, lookaround_type, captures_started_, + capture_name, ignore_case(), unicode(), zone()); builder = state->builder(); continue; } case '[': { - RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); + RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); builder->AddCharacterClass(cc->AsCharacterClass()); break; } @@ -305,13 +368,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("\\ at end of pattern")); case 'b': Advance(2); - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::BOUNDARY, builder->flags())); + builder->AddAssertion( + new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); continue; case 'B': Advance(2); - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::NON_BOUNDARY, builder->flags())); + builder->AddAssertion( + new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); continue; // AtomEscape :: // CharacterClassEscape @@ -328,10 +391,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(2); ZoneList* ranges = new (zone()) ZoneList(2, zone()); - CharacterRange::AddClassEscape( - c, ranges, unicode() && builder->ignore_case(), zone()); + CharacterRange::AddClassEscape(c, ranges, + unicode() && ignore_case(), zone()); RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(ranges, builder->flags()); + new (zone()) RegExpCharacterClass(ranges); builder->AddCharacterClass(cc); break; } @@ -347,7 +410,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("Invalid property name")); } RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(ranges, builder->flags()); + new (zone()) RegExpCharacterClass(ranges); builder->AddCharacterClass(cc); } else { // With /u, no identity escapes except for syntax characters @@ -380,8 +443,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddEmpty(); } else { RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = - new (zone()) RegExpBackReference(capture, builder->flags()); + RegExpTree* atom = new (zone()) RegExpBackReference(capture); builder->AddAtom(atom); } break; @@ -576,143 +638,6 @@ RegExpTree* RegExpParser::ParseDisjunction() { } } -RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( - RegExpParserState* state) { - RegExpLookaround::Type lookaround_type = state->lookaround_type(); - bool is_named_capture = false; - JSRegExp::Flags switch_on = JSRegExp::kNone; - JSRegExp::Flags switch_off = JSRegExp::kNone; - const ZoneVector* capture_name = nullptr; - SubexpressionType subexpr_type = CAPTURE; - Advance(); - if (current() == '?') { - switch (Next()) { - case ':': - Advance(2); - subexpr_type = GROUPING; - break; - case '=': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = POSITIVE_LOOKAROUND; - break; - case '!': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - case '-': - case 'i': - case 's': - case 'm': { - if (!FLAG_regexp_mode_modifiers || - (Next() == 's' && !FLAG_harmony_regexp_dotall)) { - ReportError(CStrVector("Invalid group")); - return nullptr; - } - Advance(); - bool flags_sense = true; // Switching on flags. - while (subexpr_type != GROUPING) { - switch (current()) { - case '-': - if (!flags_sense) { - ReportError(CStrVector("Multiple dashes in flag group")); - return nullptr; - } - flags_sense = false; - Advance(); - continue; - case 's': - if (!FLAG_harmony_regexp_dotall) { - ReportError(CStrVector("Invalid group")); - return nullptr; - } - // Fall through. - case 'i': - case 'm': { - JSRegExp::Flags bit = JSRegExp::kUnicode; - if (current() == 'i') bit = JSRegExp::kIgnoreCase; - if (current() == 'm') bit = JSRegExp::kMultiline; - if (current() == 's') bit = JSRegExp::kDotAll; - if (((switch_on | switch_off) & bit) != 0) { - ReportError(CStrVector("Repeated flag in flag group")); - return nullptr; - } - if (flags_sense) { - switch_on |= bit; - } else { - switch_off |= bit; - } - Advance(); - continue; - } - case ')': { - Advance(); - state->builder() - ->FlushText(); // Flush pending text using old flags. - // These (?i)-style flag switches don't put us in a subexpression - // at all, they just modify the flags in the rest of the current - // subexpression. - JSRegExp::Flags flags = - (state->builder()->flags() | switch_on) & ~switch_off; - state->builder()->set_flags(flags); - return state; - } - case ':': - Advance(); - subexpr_type = GROUPING; // Will break us out of the outer loop. - continue; - default: - ReportError(CStrVector("Invalid flag group")); - return nullptr; - } - } - break; - } - case '<': - Advance(); - if (FLAG_harmony_regexp_lookbehind) { - if (Next() == '=') { - Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; - subexpr_type = POSITIVE_LOOKAROUND; - break; - } else if (Next() == '!') { - Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - } - } - if (FLAG_harmony_regexp_named_captures) { - is_named_capture = true; - has_named_captures_ = true; - Advance(); - break; - } - // Fall through. - default: - ReportError(CStrVector("Invalid group")); - return nullptr; - } - } - if (subexpr_type == CAPTURE) { - if (captures_started_ >= kMaxCaptures) { - ReportError(CStrVector("Too many captures")); - return nullptr; - } - captures_started_++; - - if (is_named_capture) { - capture_name = ParseCaptureGroupName(CHECK_FAILED); - } - } - JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off; - // Store current state and begin new disjunction parsing. - return new (zone()) - RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, - capture_name, flags, zone()); -} #ifdef DEBUG // Currently only used in an DCHECK. @@ -930,8 +855,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, if (state->IsInsideCaptureGroup(name)) { builder->AddEmpty(); } else { - RegExpBackReference* atom = - new (zone()) RegExpBackReference(builder->flags()); + RegExpBackReference* atom = new (zone()) RegExpBackReference(); atom->set_name(name); builder->AddAtom(atom); @@ -1601,7 +1525,7 @@ void RegExpParser::ParseClassEscape(ZoneList* ranges, } } -RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { +RegExpTree* RegExpParser::ParseCharacterClass() { static const char* kUnterminated = "Unterminated character class"; static const char* kRangeInvalid = "Invalid character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; @@ -1615,7 +1539,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { } ZoneList* ranges = new (zone()) ZoneList(2, zone()); - bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); + bool add_unicode_case_equivalents = unicode() && ignore_case(); while (has_more() && current() != ']') { uc32 char_1, char_2; bool is_class_1, is_class_2; @@ -1662,10 +1586,9 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { ranges->Add(CharacterRange::Everything(), zone()); is_negated = !is_negated; } - RegExpCharacterClass::CharacterClassFlags character_class_flags; - if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; - return new (zone()) - RegExpCharacterClass(ranges, builder->flags(), character_class_flags); + RegExpCharacterClass::Flags flags; + if (is_negated) flags = RegExpCharacterClass::NEGATED; + return new (zone()) RegExpCharacterClass(ranges, flags); } @@ -1699,10 +1622,11 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, return !parser.failed(); } -RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) +RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) : zone_(zone), pending_empty_(false), - flags_(flags), + ignore_case_(ignore_case), + unicode_(unicode), characters_(nullptr), pending_surrogate_(kNoPendingSurrogate), terms_(), @@ -1738,7 +1662,7 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { surrogate_pair.Add(lead_surrogate, zone()); surrogate_pair.Add(trail_surrogate, zone()); RegExpAtom* atom = - new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_); + new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); AddAtom(atom); } } else { @@ -1762,8 +1686,7 @@ void RegExpBuilder::FlushCharacters() { FlushPendingSurrogate(); pending_empty_ = false; if (characters_ != nullptr) { - RegExpTree* atom = - new (zone()) RegExpAtom(characters_->ToConstVector(), flags_); + RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); characters_ = nullptr; text_.Add(atom, zone()); LAST(ADD_ATOM); @@ -1839,7 +1762,7 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { AddTerm(new (zone()) RegExpCharacterClass( - CharacterRange::List(zone(), CharacterRange::Singleton(c)), flags_)); + CharacterRange::List(zone(), CharacterRange::Singleton(c)))); } @@ -1957,11 +1880,11 @@ bool RegExpBuilder::AddQuantifierToAtom( int num_chars = char_vector.length(); if (num_chars > 1) { Vector prefix = char_vector.SubVector(0, num_chars - 1); - text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone()); + text_.Add(new (zone()) RegExpAtom(prefix), zone()); char_vector = char_vector.SubVector(num_chars - 1, num_chars); } characters_ = nullptr; - atom = new (zone()) RegExpAtom(char_vector, flags_); + atom = new (zone()) RegExpAtom(char_vector); FlushText(); } else if (text_.length() > 0) { DCHECK(last_added_ == ADD_ATOM); diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h index 56d4ac8599..228b1d83e4 100644 --- a/src/regexp/regexp-parser.h +++ b/src/regexp/regexp-parser.h @@ -99,7 +99,7 @@ class BufferedZoneList { // Accumulates RegExp atoms and assertions into lists of terms and alternatives. class RegExpBuilder : public ZoneObject { public: - RegExpBuilder(Zone* zone, JSRegExp::Flags flags); + RegExpBuilder(Zone* zone, bool ignore_case, bool unicode); void AddCharacter(uc16 character); void AddUnicodeCharacter(uc32 character); void AddEscapedUnicodeCharacter(uc32 character); @@ -114,14 +114,7 @@ class RegExpBuilder : public ZoneObject { void NewAlternative(); // '|' bool AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type); - void FlushText(); RegExpTree* ToRegExp(); - JSRegExp::Flags flags() const { return flags_; } - void set_flags(JSRegExp::Flags flags) { flags_ = flags; } - - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } - bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } private: static const uc16 kNoPendingSurrogate = 0; @@ -129,15 +122,18 @@ class RegExpBuilder : public ZoneObject { void AddTrailSurrogate(uc16 trail_surrogate); void FlushPendingSurrogate(); void FlushCharacters(); + void FlushText(); void FlushTerms(); bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); bool NeedsDesugaringForIgnoreCase(uc32 c); Zone* zone() const { return zone_; } - bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } + bool ignore_case() const { return ignore_case_; } + bool unicode() const { return unicode_; } Zone* zone_; bool pending_empty_; - JSRegExp::Flags flags_; + bool ignore_case_; + bool unicode_; ZoneList* characters_; uc16 pending_surrogate_; BufferedZoneList terms_; @@ -163,6 +159,7 @@ class RegExpParser BASE_EMBEDDED { RegExpTree* ParsePattern(); RegExpTree* ParseDisjunction(); RegExpTree* ParseGroup(); + RegExpTree* ParseCharacterClass(); // Parses a {...,...} quantifier and stores the range in the given // out parameters. @@ -178,7 +175,6 @@ class RegExpParser BASE_EMBEDDED { bool ParseUnicodeEscape(uc32* value); bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); bool ParsePropertyClass(ZoneList* result, bool negate); - RegExpTree* ParseCharacterClass(const RegExpBuilder* state); uc32 ParseOctalLiteral(); @@ -209,9 +205,10 @@ class RegExpParser BASE_EMBEDDED { int captures_started() { return captures_started_; } int position() { return next_pos_ - 1; } bool failed() { return failed_; } - // The Unicode flag can't be changed using in-regexp syntax, so it's OK to - // just read the initial flag value here. - bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } + bool dotall() const { return dotall_; } + bool ignore_case() const { return ignore_case_; } + bool multiline() const { return multiline_; } + bool unicode() const { return unicode_; } static bool IsSyntaxCharacterOrSlash(uc32 c); @@ -229,35 +226,34 @@ class RegExpParser BASE_EMBEDDED { class RegExpParserState : public ZoneObject { public: - // Push a state on the stack. RegExpParserState(RegExpParserState* previous_state, SubexpressionType group_type, RegExpLookaround::Type lookaround_type, int disjunction_capture_index, - const ZoneVector* capture_name, - JSRegExp::Flags flags, Zone* zone) + const ZoneVector* capture_name, bool ignore_case, + bool unicode, Zone* zone) : previous_state_(previous_state), - builder_(new (zone) RegExpBuilder(zone, flags)), + builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)), group_type_(group_type), lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index), capture_name_(capture_name) {} // Parser state of containing expression, if any. - RegExpParserState* previous_state() const { return previous_state_; } + RegExpParserState* previous_state() { return previous_state_; } bool IsSubexpression() { return previous_state_ != nullptr; } // RegExpBuilder building this regexp's AST. - RegExpBuilder* builder() const { return builder_; } + RegExpBuilder* builder() { return builder_; } // Type of regexp being parsed (parenthesized group or entire regexp). - SubexpressionType group_type() const { return group_type_; } + SubexpressionType group_type() { return group_type_; } // Lookahead or Lookbehind. - RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } + RegExpLookaround::Type lookaround_type() { return lookaround_type_; } // Index in captures array of first capture in this sub-expression, if any. // Also the capture index of this sub-expression itself, if group_type // is CAPTURE. - int capture_index() const { return disjunction_capture_index_; } + int capture_index() { return disjunction_capture_index_; } // The name of the current sub-expression, if group_type is CAPTURE. Only // used for named captures. - const ZoneVector* capture_name() const { return capture_name_; } + const ZoneVector* capture_name() { return capture_name_; } bool IsNamedCapture() const { return capture_name_ != nullptr; } @@ -268,17 +264,17 @@ class RegExpParser BASE_EMBEDDED { private: // Linked list implementation of stack of states. - RegExpParserState* const previous_state_; + RegExpParserState* previous_state_; // Builder for the stored disjunction. - RegExpBuilder* const builder_; + RegExpBuilder* builder_; // Stored disjunction type (capture, look-ahead or grouping), if any. - const SubexpressionType group_type_; + SubexpressionType group_type_; // Stored read direction. - const RegExpLookaround::Type lookaround_type_; + RegExpLookaround::Type lookaround_type_; // Stored disjunction's capture index (if any). - const int disjunction_capture_index_; + int disjunction_capture_index_; // Stored capture name (if any). - const ZoneVector* const capture_name_; + const ZoneVector* capture_name_; }; // Return the 1-indexed RegExpCapture object, allocate if necessary. @@ -295,7 +291,6 @@ class RegExpParser BASE_EMBEDDED { bool ParseNamedBackReference(RegExpBuilder* builder, RegExpParserState* state); - RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); // After the initial parsing pass, patch corresponding RegExpCapture objects // into all RegExpBackReferences. This is done after initial parsing in order @@ -328,10 +323,10 @@ class RegExpParser BASE_EMBEDDED { ZoneList* named_back_references_; FlatStringReader* in_; uc32 current_; - // These are the flags specified outside the regexp syntax ie after the - // terminating '/' or in the second argument to the constructor. The current - // flags are stored on the RegExpBuilder. - JSRegExp::Flags top_level_flags_; + bool dotall_; + bool ignore_case_; + bool multiline_; + bool unicode_; int next_pos_; int captures_started_; int capture_count_; // Only valid after we have scanned for captures. diff --git a/test/mjsunit/regexp-modifiers-autogenerated.js b/test/mjsunit/regexp-modifiers-autogenerated.js deleted file mode 100644 index 309c010505..0000000000 --- a/test/mjsunit/regexp-modifiers-autogenerated.js +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2017 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Flags: --regexp-mode-modifiers --harmony-regexp-property - -// These regexps are just grepped out of the other tests we already have -// and the syntax changed from out-of-line i flag to inline i flag. - -assertFalse(/(?i)x(...)\1/.test("x\u03a3\u03c2\u03c3\u03c2\u03c3")); -assertTrue(/(?i)\u03a3((?:))\1\1x/.test("\u03c2x"), "backref-UC16-empty"); -assertTrue(/(?i)x(?:...|(...))\1x/.test("x\u03a3\u03c2\u03c3x")); -assertTrue(/(?i)x(?:...|(...))\1x/.test("x\u03c2\u03c3\u039b\u03a3\u03c2\u03bbx")); -assertFalse(/(?i)\xc1/.test('fooA'), "quickcheck-uc16-pattern-ascii-subject"); -assertFalse(/(?i)x(...)\1/.test("xaaaaa"), "backref-ASCII-short"); -assertTrue(/(?i)x((?:))\1\1x/.test("xx"), "backref-ASCII-empty"); -assertTrue(/(?i)x(?:...|(...))\1x/.test("xabcx"), "backref-ASCII-uncaptured"); -assertTrue(/(?i)x(?:...|(...))\1x/.test("xabcABCx"), "backref-ASCII-backtrack"); -assertFalse(/(?i)f/.test('b')); -assertFalse(/(?i)[abc]f/.test('x')); -assertFalse(/(?i)[abc]f/.test('xa')); -assertFalse(/(?i)[abc] new RegExp("foo(?i:")); -assertThrows(() => new RegExp("foo(?--i)")); -assertThrows(() => new RegExp("foo(?i-i)")); - -assertThrows(() => new RegExp("foo(?m:")); -assertThrows(() => new RegExp("foo(?--m)")); -assertThrows(() => new RegExp("foo(?m-m)")); - - -// The following tests are taken from test/mjsunit/es7/regexp-ui-word.js but -// using inline syntax instead of the global /i flag. -assertTrue(/(?i)\w/u.test('\u017F')); -assertTrue(/(?i)\w/u.test('\u212A')); -assertFalse(/(?i)\W/u.test('\u017F')); -assertFalse(/(?i)\W/u.test('\u212A')); -assertFalse(/(?i)\W/u.test('s')); -assertFalse(/(?i)\W/u.test('S')); -assertFalse(/(?i)\W/u.test('K')); -assertFalse(/(?i)\W/u.test('k')); - -assertTrue(/(?i)[\w]/u.test('\u017F')); -assertTrue(/(?i)[\w]/u.test('\u212A')); -assertFalse(/(?i)[\W]/u.test('\u017F')); -assertFalse(/(?i)[\W]/u.test('\u212A')); -assertFalse(/(?i)[\W]/u.test('s')); -assertFalse(/(?i)[\W]/u.test('S')); -assertFalse(/(?i)[\W]/u.test('K')); -assertFalse(/(?i)[\W]/u.test('k')); - -assertTrue(/(?i)\b/u.test('\u017F')); -assertFalse(/(?i:)\b/u.test('\u017F')); -assertTrue(/(?i)\b/u.test('\u212A')); -assertFalse(/(?i:)\b/u.test('\u212A')); -assertTrue(/(?i)\b/u.test('s')); -assertTrue(/(?i)\b/u.test('S')); -assertFalse(/(?i)\B/u.test('\u017F')); -assertFalse(/(?i)\B/u.test('\u212A')); -assertFalse(/(?i)\B/u.test('s')); -assertFalse(/(?i)\B/u.test('S')); -assertFalse(/(?i)\B/u.test('K')); -assertFalse(/(?i)\B/u.test('k')); - -var re = /^\s(?m)^.$\s(?-m)$/; -assertTrue(re.test("\n.\n")); -assertFalse(re.test(" .\n")); -assertFalse(re.test("\n. ")); -assertFalse(re.test(" . ")); -assertFalse(re.test("_\n.\n")); -assertFalse(re.test("\n.\n_")); -assertFalse(re.test("_\n.\n_")); - -assertEquals(["abcd", "d"], /a.*?(.)(?i)\b/.exec('abcd\u017F cd')); -assertEquals(["abcd", "d"], /a.*?(.)(?i)\b/.exec('abcd\u212A cd')); -assertEquals(["abcd\u017F", "\u017F"], /a.*?(.)(?i)\b/u.exec('abcd\u017F cd')); -assertEquals(["abcd\u212A", "\u212A"], /a.*?(.)(?i)\b/u.exec('abcd\u212A cd')); - -assertEquals(["a\u017F ", " "], /a.*?(?i)\B(.)/.exec('a\u017F ')); -assertEquals(["a\u212A ", " "], /a.*?(?i)\B(.)/.exec('a\u212A ')); -assertEquals(["a\u017F", "\u017F"], /a.*?(?i:\B)(.)/u.exec('a\u017F ')); -assertEquals(["a\u212A", "\u212A"], /a.*?(?i:\B)(.)/u.exec('a\u212A ')); - -// Nested flags. -var res = [ - /^a(?i:b(?-i:c(?i:d)e)f)g$/, - /^a(?i:b(?-i)c(?i)d(?-i)e(?i)f)g$/, - /^(?-i:a(?i:b(?-i:c(?i:d)e)f)g)$/i, - /^(?-i:a(?i:b(?-i)c(?i)d(?-i)e(?i)f)g)$/i, -]; - -for (var idx = 0; idx < res.length; idx++) { - var re = res[idx]; - for (var i = 0; i < 128; i++) { - var s = (i & 1) ? "A" : "a"; - s += (i & 2) ? "B" : "b"; - s += (i & 4) ? "C" : "c"; - s += (i & 8) ? "D" : "d"; - s += (i & 16) ? "E" : "e"; - s += (i & 32) ? "F" : "f"; - s += (i & 64) ? "G" : "g"; - if ((i & (1 | 4 | 16 | 64)) != 0) { - assertFalse(re.test(s), s); - } else { - assertTrue(re.test(s), s); - } - } -}