diff --git a/src/flag-definitions.h b/src/flag-definitions.h index 72ea7544b4..4d217c8eaa 100644 --- a/src/flag-definitions.h +++ b/src/flag-definitions.h @@ -959,6 +959,7 @@ DEFINE_BOOL(serialization_statistics, false, // Regexp DEFINE_BOOL(regexp_optimization, true, "generate optimized regexp code") +DEFINE_BOOL(regexp_mode_modifiers, false, "enable inline flags in regexp.") // Testing flags test/cctest/test-{flags,api,serialization}.cc DEFINE_BOOL(testing_bool_flag, true, "testing_bool_flag") diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc index 7befa2e81f..acaff99047 100644 --- a/src/regexp/jsregexp.cc +++ b/src/regexp/jsregexp.cc @@ -132,17 +132,17 @@ MaybeHandle RegExpImpl::Compile(Handle re, bool has_been_compiled = false; - if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && - !(flags & JSRegExp::kSticky) && + if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) && pattern->length() <= kPatternTooShortForBoyerMoore) { // Parse-tree is a single atom that is equal to the pattern. AtomCompile(re, pattern, flags, pattern); has_been_compiled = true; - } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) && - !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) { + } else if (parse_result.tree->IsAtom() && !IsSticky(flags) && + parse_result.capture_count == 0) { RegExpAtom* atom = parse_result.tree->AsAtom(); Vector atom_pattern = atom->data(); - if (atom_pattern.length() <= kPatternTooShortForBoyerMoore) { + if (!IgnoreCase(atom->flags()) && + atom_pattern.length() <= kPatternTooShortForBoyerMoore) { Handle atom_string; ASSIGN_RETURN_ON_EXCEPTION( isolate, atom_string, @@ -622,7 +622,7 @@ RegExpImpl::GlobalCache::GlobalCache(Handle regexp, } } - DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal); + DCHECK(IsGlobal(regexp->GetFlags())); if (!interpreted) { register_array_size_ = Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); @@ -653,8 +653,7 @@ RegExpImpl::GlobalCache::GlobalCache(Handle regexp, } int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { - if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 && - last_index + 1 < subject_->length() && + if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() && unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { // Advance over the surrogate pair. @@ -916,7 +915,7 @@ class FrequencyCollator { class RegExpCompiler { public: RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - JSRegExp::Flags flags, bool is_one_byte); + bool is_one_byte); int AllocateRegister() { if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { @@ -968,13 +967,6 @@ class RegExpCompiler { void SetRegExpTooBig() { reg_exp_too_big_ = true; } - inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } - // Both unicode and ignore_case flags are set. We need to use ICU to find - // the closure over case equivalents. - inline bool needs_unicode_case_equivalents() { - return unicode() && ignore_case(); - } inline bool one_byte() { return one_byte_; } inline bool optimize() { return optimize_; } inline void set_optimize(bool value) { optimize_ = value; } @@ -1004,7 +996,6 @@ class RegExpCompiler { std::vector* work_list_; int recursion_depth_; RegExpMacroAssembler* macro_assembler_; - JSRegExp::Flags flags_; bool one_byte_; bool reg_exp_too_big_; bool limiting_recursion_; @@ -1036,13 +1027,12 @@ static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { // Attempts to compile the regexp using an Irregexp code generator. Returns // a fixed array or a null handle depending on whether it succeeded. RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - JSRegExp::Flags flags, bool one_byte) + bool one_byte) : next_register_(2 * (capture_count + 1)), unicode_lookaround_stack_register_(kNoRegister), unicode_lookaround_position_register_(kNoRegister), work_list_(nullptr), recursion_depth_(0), - flags_(flags), one_byte_(one_byte), reg_exp_too_big_(false), limiting_recursion_(false), @@ -2503,7 +2493,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, QuickCheckDetails::Position* pos = details->positions(characters_filled_in); uc16 c = quarks[i]; - if (compiler->ignore_case()) { + if (elm.atom()->ignore_case()) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; int length = GetCaseIndependentLetters(isolate, c, compiler->one_byte(), chars); @@ -2711,18 +2701,16 @@ class VisitMarker { NodeInfo* info_; }; - -RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { +RegExpNode* SeqRegExpNode::FilterOneByte(int depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; DCHECK(!info()->visited); VisitMarker marker(info()); - return FilterSuccessor(depth - 1, ignore_case); + return FilterSuccessor(depth - 1); } - -RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { - RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); +RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { + RegExpNode* next = on_success_->FilterOneByte(depth - 1); if (next == nullptr) return set_replacement(nullptr); on_success_ = next; return set_replacement(this); @@ -2745,8 +2733,7 @@ static bool RangesContainLatin1Equivalents(ZoneList* ranges) { return false; } - -RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { +RegExpNode* TextNode::FilterOneByte(int depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; DCHECK(!info()->visited); @@ -2759,7 +2746,7 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { for (int j = 0; j < quarks.length(); j++) { uint16_t c = quarks[j]; if (c <= String::kMaxOneByteCharCode) continue; - if (!ignore_case) return set_replacement(nullptr); + if (!IgnoreCase(elm.atom()->flags())) return set_replacement(nullptr); // Here, we need to check for characters whose upper and lower cases // are outside the Latin-1 range. uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); @@ -2781,42 +2768,41 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { ranges->at(0).from() == 0 && ranges->at(0).to() >= String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; + if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) + continue; return set_replacement(nullptr); } } else { if (range_count == 0 || ranges->at(0).from() > String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; + if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) + continue; return set_replacement(nullptr); } } } } - return FilterSuccessor(depth - 1, ignore_case); + return FilterSuccessor(depth - 1); } - -RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { +RegExpNode* LoopChoiceNode::FilterOneByte(int depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; { VisitMarker marker(info()); - RegExpNode* continue_replacement = - continue_node_->FilterOneByte(depth - 1, ignore_case); + RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); // If we can't continue after the loop then there is no sense in doing the // loop. if (continue_replacement == nullptr) return set_replacement(nullptr); } - return ChoiceNode::FilterOneByte(depth - 1, ignore_case); + return ChoiceNode::FilterOneByte(depth - 1); } - -RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { +RegExpNode* ChoiceNode::FilterOneByte(int depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2836,8 +2822,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { RegExpNode* survivor = nullptr; for (int i = 0; i < choice_count; i++) { GuardedAlternative alternative = alternatives_->at(i); - RegExpNode* replacement = - alternative.node()->FilterOneByte(depth - 1, ignore_case); + RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. if (replacement != nullptr) { alternatives_->at(i).set_node(replacement); @@ -2857,7 +2842,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { new(zone()) ZoneList(surviving, zone()); for (int i = 0; i < choice_count; i++) { RegExpNode* replacement = - alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); + alternatives_->at(i).node()->FilterOneByte(depth - 1); if (replacement != nullptr) { alternatives_->at(i).set_node(replacement); new_alternatives->Add(alternatives_->at(i), zone()); @@ -2867,9 +2852,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { return this; } - -RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, - bool ignore_case) { +RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2877,12 +2860,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, // Alternative 0 is the negative lookahead, alternative 1 is what comes // afterwards. RegExpNode* node = alternatives_->at(1).node(); - RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); + RegExpNode* replacement = node->FilterOneByte(depth - 1); if (replacement == nullptr) return set_replacement(nullptr); alternatives_->at(1).set_node(replacement); RegExpNode* neg_node = alternatives_->at(0).node(); - RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); + RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); // If the negative lookahead is always going to fail then // we don't need to check it. if (neg_replacement == nullptr) return set_replacement(replacement); @@ -3199,6 +3182,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextElement elm = elements()->at(i); int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; if (elm.text_type() == TextElement::ATOM) { + if (SkipPass(pass, elm.atom()->ignore_case())) continue; Vector quarks = elm.atom()->data(); for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { if (first_element_checked && i == 0 && j == 0) continue; @@ -3254,9 +3238,7 @@ int TextNode::Length() { return elm.cp_offset() + elm.length(); } - -bool TextNode::SkipPass(int int_pass, bool ignore_case) { - TextEmitPassType pass = static_cast(int_pass); +bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { if (ignore_case) { return pass == SIMPLE_CHARACTER_MATCH; } else { @@ -3264,32 +3246,33 @@ bool TextNode::SkipPass(int int_pass, bool ignore_case) { } } - TextNode* TextNode::CreateForCharacterRanges(Zone* zone, ZoneList* ranges, bool read_backward, - RegExpNode* on_success) { + RegExpNode* on_success, + JSRegExp::Flags flags) { DCHECK_NOT_NULL(ranges); ZoneList* elms = new (zone) ZoneList(1, zone); - elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)), - zone); + elms->Add( + TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, flags)), + zone); return new (zone) TextNode(elms, read_backward, on_success); } - TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, CharacterRange trail, bool read_backward, - RegExpNode* on_success) { + RegExpNode* on_success, + JSRegExp::Flags flags) { ZoneList* lead_ranges = CharacterRange::List(zone, lead); ZoneList* trail_ranges = CharacterRange::List(zone, trail); ZoneList* elms = new (zone) ZoneList(2, zone); - elms->Add( - TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)), - zone); - elms->Add( - TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)), - zone); + elms->Add(TextElement::CharClass( + new (zone) RegExpCharacterClass(lead_ranges, flags)), + zone); + elms->Add(TextElement::CharClass( + new (zone) RegExpCharacterClass(trail_ranges, flags)), + zone); return new (zone) TextNode(elms, read_backward, on_success); } @@ -3323,27 +3306,15 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { // check that now. if (trace->characters_preloaded() == 1) { for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { - if (!SkipPass(pass, compiler->ignore_case())) { - TextEmitPass(compiler, - static_cast(pass), - true, - trace, - false, - &bound_checked_to); - } + TextEmitPass(compiler, static_cast(pass), true, trace, + false, &bound_checked_to); } first_elt_done = true; } for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { - if (!SkipPass(pass, compiler->ignore_case())) { - TextEmitPass(compiler, - static_cast(pass), - false, - trace, - first_elt_done, - &bound_checked_to); - } + TextEmitPass(compiler, static_cast(pass), false, trace, + first_elt_done, &bound_checked_to); } Trace successor_trace(*trace); @@ -3386,11 +3357,15 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { TextElement elm = elements()->at(i); if (elm.text_type() == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.char_class(); - // None of the standard character classes is different in the case - // independent case and it slows us down if we don't know that. - if (cc->is_standard(zone())) continue; - ZoneList* ranges = cc->ranges(zone()); - CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); + if (IgnoreCase(cc->flags()) && + !NeedsUnicodeCaseEquivalents(cc->flags())) { + // None of the standard character classes is different in the case + // independent case and it slows us down if we don't know that. + if (cc->is_standard(zone())) continue; + ZoneList* ranges = cc->ranges(zone()); + CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, + is_one_byte); + } } } } @@ -4353,9 +4328,9 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RecursionCheck rc(compiler); DCHECK_EQ(start_reg_ + 1, end_reg_); - if (compiler->ignore_case()) { + if (IgnoreCase(flags_)) { assembler->CheckNotBackReferenceIgnoreCase( - start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); + start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -4364,7 +4339,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { if (read_backward()) trace->set_at_start(Trace::UNKNOWN); // Check that the back reference does not end inside a surrogate pair. - if (compiler->unicode() && !compiler->one_byte()) { + if (IsUnicode(flags_) && !compiler->one_byte()) { assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); } on_success()->Emit(compiler, trace); @@ -4887,24 +4862,24 @@ void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); } - void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { ZoneList* bmp = splitter->bmp(); if (bmp == nullptr) return; + JSRegExp::Flags default_flags = JSRegExp::Flags(); result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( - compiler->zone(), bmp, compiler->read_backward(), on_success))); + compiler->zone(), bmp, compiler->read_backward(), on_success, + default_flags))); } - void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { ZoneList* non_bmp = splitter->non_bmp(); if (non_bmp == nullptr) return; - DCHECK(compiler->unicode()); DCHECK(!compiler->one_byte()); Zone* zone = compiler->zone(); + JSRegExp::Flags default_flags = JSRegExp::Flags(); CharacterRange::Canonicalize(non_bmp); for (int i = 0; i < non_bmp->length(); i++) { // Match surrogate pair. @@ -4924,7 +4899,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Singleton(from_l), CharacterRange::Range(from_t, to_t), compiler->read_backward(), - on_success))); + on_success, default_flags))); } else { if (from_t != kTrailSurrogateStart) { // Add [from_l][from_t-\udfff] @@ -4932,7 +4907,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Singleton(from_l), CharacterRange::Range(from_t, kTrailSurrogateEnd), - compiler->read_backward(), on_success))); + compiler->read_backward(), on_success, default_flags))); from_l++; } if (to_t != kTrailSurrogateEnd) { @@ -4941,7 +4916,7 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Singleton(to_l), CharacterRange::Range(kTrailSurrogateStart, to_t), - compiler->read_backward(), on_success))); + compiler->read_backward(), on_success, default_flags))); to_l--; } if (from_l <= to_l) { @@ -4950,49 +4925,47 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, GuardedAlternative(TextNode::CreateForSurrogatePair( zone, CharacterRange::Range(from_l, to_l), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), - compiler->read_backward(), on_success))); + compiler->read_backward(), on_success, default_flags))); } } } } - RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( RegExpCompiler* compiler, ZoneList* lookbehind, - ZoneList* match, RegExpNode* on_success, - bool read_backward) { + ZoneList* match, RegExpNode* on_success, bool read_backward, + JSRegExp::Flags flags) { Zone* zone = compiler->zone(); RegExpNode* match_node = TextNode::CreateForCharacterRanges( - zone, match, read_backward, on_success); + zone, match, read_backward, on_success, flags); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpLookaround::Builder lookaround(false, match_node, stack_register, position_register); RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookbehind, !read_backward, lookaround.on_match_success()); + zone, lookbehind, !read_backward, lookaround.on_match_success(), flags); return lookaround.ForMatch(negative_match); } - RegExpNode* MatchAndNegativeLookaroundInReadDirection( RegExpCompiler* compiler, ZoneList* match, ZoneList* lookahead, RegExpNode* on_success, - bool read_backward) { + bool read_backward, JSRegExp::Flags flags) { Zone* zone = compiler->zone(); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpLookaround::Builder lookaround(false, on_success, stack_register, position_register); RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookahead, read_backward, lookaround.on_match_success()); + zone, lookahead, read_backward, lookaround.on_match_success(), flags); return TextNode::CreateForCharacterRanges( - zone, match, read_backward, lookaround.ForMatch(negative_match)); + zone, match, read_backward, lookaround.ForMatch(negative_match), flags); } - void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { + JSRegExp::Flags default_flags = JSRegExp::Flags(); ZoneList* lead_surrogates = splitter->lead_surrogates(); if (lead_surrogates == nullptr) return; Zone* zone = compiler->zone(); @@ -5005,20 +4978,22 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, // Reading backward. Assert that reading forward, there is no trail // surrogate, and then backward match the lead surrogate. match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, trail_surrogates, lead_surrogates, on_success, true); + compiler, trail_surrogates, lead_surrogates, on_success, true, + default_flags); } else { // Reading forward. Forward match the lead surrogate and assert that // no trail surrogate follows. match = MatchAndNegativeLookaroundInReadDirection( - compiler, lead_surrogates, trail_surrogates, on_success, false); + compiler, lead_surrogates, trail_surrogates, on_success, false, + default_flags); } result->AddAlternative(GuardedAlternative(match)); } - void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { + JSRegExp::Flags default_flags = JSRegExp::Flags(); ZoneList* trail_surrogates = splitter->trail_surrogates(); if (trail_surrogates == nullptr) return; Zone* zone = compiler->zone(); @@ -5031,12 +5006,14 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, // Reading backward. Backward match the trail surrogate and assert that no // lead surrogate precedes it. match = MatchAndNegativeLookaroundInReadDirection( - compiler, trail_surrogates, lead_surrogates, on_success, true); + compiler, trail_surrogates, lead_surrogates, on_success, true, + default_flags); } else { // Reading forward. Assert that reading backward, there is no lead // surrogate, and then forward match the trail surrogate. match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, lead_surrogates, trail_surrogates, on_success, false); + compiler, lead_surrogates, trail_surrogates, on_success, false, + default_flags); } result->AddAlternative(GuardedAlternative(match)); } @@ -5052,7 +5029,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, // the associated trail surrogate. ZoneList* range = CharacterRange::List( zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); - return TextNode::CreateForCharacterRanges(zone, range, false, on_success); + JSRegExp::Flags default_flags = JSRegExp::Flags(); + return TextNode::CreateForCharacterRanges(zone, range, false, on_success, + default_flags); } void AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone) { @@ -5093,10 +5072,10 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, set_.Canonicalize(); Zone* zone = compiler->zone(); ZoneList* ranges = this->ranges(zone); - if (compiler->needs_unicode_case_equivalents()) { + if (NeedsUnicodeCaseEquivalents(flags_)) { AddUnicodeCaseEquivalents(ranges, zone); } - if (compiler->unicode() && !compiler->one_byte() && + if (IsUnicode(flags_) && !compiler->one_byte() && !contains_split_surrogate()) { if (is_negated()) { ZoneList* negated = @@ -5105,9 +5084,10 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, ranges = negated; } if (ranges->length() == 0) { + JSRegExp::Flags default_flags = JSRegExp::Flags(); ranges->Add(CharacterRange::Everything(), zone); RegExpCharacterClass* fail = - new (zone) RegExpCharacterClass(ranges, NEGATED); + new (zone) RegExpCharacterClass(ranges, default_flags, NEGATED); return new (zone) TextNode(fail, compiler->read_backward(), on_success); } if (standard_type() == '*') { @@ -5182,10 +5162,12 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { // i is length or it is the index of an atom. if (i == length) break; int first_atom = i; + JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags(); i++; while (i < length) { RegExpTree* alternative = alternatives->at(i); if (!alternative->IsAtom()) break; + if (alternative->AsAtom()->flags() != flags) break; i++; } // Sort atoms to get ones with common prefixes together. @@ -5197,7 +5179,7 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { DCHECK_LT(first_atom, alternatives->length()); DCHECK_LE(i, alternatives->length()); DCHECK_LE(first_atom, i); - if (compiler->ignore_case()) { + if (IgnoreCase(flags)) { unibrow::Mapping* canonicalize = compiler->isolate()->regexp_macro_assembler_canonicalize(); auto compare_closure = @@ -5229,7 +5211,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { i++; continue; } - RegExpAtom* atom = alternative->AsAtom(); + RegExpAtom* const atom = alternative->AsAtom(); + JSRegExp::Flags flags = atom->flags(); unibrow::uchar common_prefix = atom->data().at(0); int first_with_prefix = i; int prefix_length = atom->length(); @@ -5237,10 +5220,11 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { while (i < length) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - atom = alternative->AsAtom(); + RegExpAtom* const atom = alternative->AsAtom(); + if (atom->flags() != flags) break; unibrow::uchar new_prefix = atom->data().at(0); if (new_prefix != common_prefix) { - if (!compiler->ignore_case()) break; + if (!IgnoreCase(flags)) break; unibrow::Mapping* canonicalize = compiler->isolate()->regexp_macro_assembler_canonicalize(); new_prefix = Canonical(canonicalize, new_prefix); @@ -5257,7 +5241,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { // common prefix if the terms were similar or presorted in the input. // Find out how long the common prefix is. int run_length = i - first_with_prefix; - atom = alternatives->at(first_with_prefix)->AsAtom(); + RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom(); for (int j = 1; j < run_length && prefix_length > 1; j++) { RegExpAtom* old_atom = alternatives->at(j + first_with_prefix)->AsAtom(); @@ -5268,8 +5252,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { } } } - RegExpAtom* prefix = - new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); + RegExpAtom* prefix = new (zone) + RegExpAtom(atom->data().SubVector(0, prefix_length), flags); ZoneList* pair = new (zone) ZoneList(2, zone); pair->Add(prefix, zone); ZoneList* suffixes = @@ -5282,7 +5266,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { suffixes->Add(new (zone) RegExpEmpty(), zone); } else { RegExpTree* suffix = new (zone) RegExpAtom( - old_atom->data().SubVector(prefix_length, old_atom->length())); + old_atom->data().SubVector(prefix_length, old_atom->length()), + flags); suffixes->Add(suffix, zone); } } @@ -5305,7 +5290,6 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( Zone* zone = compiler->zone(); ZoneList* alternatives = this->alternatives(); int length = alternatives->length(); - const bool unicode = compiler->unicode(); int write_posn = 0; int i = 0; @@ -5316,24 +5300,28 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( i++; continue; } - RegExpAtom* atom = alternative->AsAtom(); + RegExpAtom* const atom = alternative->AsAtom(); if (atom->length() != 1) { alternatives->at(write_posn++) = alternatives->at(i); i++; continue; } - DCHECK_IMPLIES(unicode, + JSRegExp::Flags flags = atom->flags(); + DCHECK_IMPLIES(IsUnicode(flags), !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); bool contains_trail_surrogate = unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); int first_in_run = i; i++; + // Find a run of single-character atom alternatives that have identical + // flags (case independence and unicode-ness). while (i < length) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - atom = alternative->AsAtom(); + RegExpAtom* const atom = alternative->AsAtom(); if (atom->length() != 1) break; - DCHECK_IMPLIES(unicode, + if (atom->flags() != flags) break; + DCHECK_IMPLIES(IsUnicode(flags), !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); contains_trail_surrogate |= unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); @@ -5349,12 +5337,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( DCHECK_EQ(old_atom->length(), 1); ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); } - RegExpCharacterClass::Flags flags; - if (unicode && contains_trail_surrogate) { - flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; + RegExpCharacterClass::CharacterClassFlags character_class_flags; + if (IsUnicode(flags) && contains_trail_surrogate) { + character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; } alternatives->at(write_posn++) = - new (zone) RegExpCharacterClass(ranges, flags); + new (zone) RegExpCharacterClass(ranges, flags, character_class_flags); } else { // Just copy any trivial alternatives. for (int j = first_in_run; j < i; j++) { @@ -5586,8 +5574,9 @@ namespace { // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, RegExpNode* on_success, - RegExpAssertion::AssertionType type) { - DCHECK(compiler->needs_unicode_case_equivalents()); + RegExpAssertion::AssertionType type, + JSRegExp::Flags flags) { + DCHECK(NeedsUnicodeCaseEquivalents(flags)); Zone* zone = compiler->zone(); ZoneList* word_range = new (zone) ZoneList(2, zone); @@ -5605,13 +5594,13 @@ RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, stack_register, position_register); RegExpNode* backward = TextNode::CreateForCharacterRanges( - zone, word_range, true, lookbehind.on_match_success()); + zone, word_range, true, lookbehind.on_match_success(), flags); // Look to the right. RegExpLookaround::Builder lookahead(lookahead_for_word, lookbehind.ForMatch(backward), stack_register, position_register); RegExpNode* forward = TextNode::CreateForCharacterRanges( - zone, word_range, false, lookahead.on_match_success()); + zone, word_range, false, lookahead.on_match_success(), flags); result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); } return result; @@ -5629,13 +5618,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, case START_OF_INPUT: return AssertionNode::AtStart(on_success); case BOUNDARY: - return compiler->needs_unicode_case_equivalents() - ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY) + return NeedsUnicodeCaseEquivalents(flags_) + ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, + flags_) : AssertionNode::AtBoundary(on_success); case NON_BOUNDARY: - return compiler->needs_unicode_case_equivalents() + return NeedsUnicodeCaseEquivalents(flags_) ? BoundaryAssertionAsLookaround(compiler, on_success, - NON_BOUNDARY) + NON_BOUNDARY, flags_) : AssertionNode::AtNonBoundary(on_success); case END_OF_INPUT: return AssertionNode::AtEnd(on_success); @@ -5651,7 +5641,9 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, ZoneList* newline_ranges = new(zone) ZoneList(3, zone); CharacterRange::AddClassEscape('n', newline_ranges, false, zone); - RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); + JSRegExp::Flags default_flags = JSRegExp::Flags(); + RegExpCharacterClass* newline_atom = + new (zone) RegExpCharacterClass('n', default_flags); TextNode* newline_matcher = new (zone) TextNode( newline_atom, false, ActionNode::PositiveSubmatchSuccess( stack_pointer_register, position_register, @@ -5681,7 +5673,7 @@ RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { return new (compiler->zone()) BackReferenceNode(RegExpCapture::StartRegister(index()), - RegExpCapture::EndRegister(index()), + RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(), on_success); } @@ -6337,9 +6329,7 @@ void TextNode::CalculateOffsets() { void Analysis::VisitText(TextNode* that) { - if (ignore_case()) { - that->MakeCaseIndependent(isolate(), is_one_byte_); - } + that->MakeCaseIndependent(isolate(), is_one_byte_); EnsureAnalyzed(that->on_success()); if (!has_failed()) { that->CalculateOffsets(); @@ -6450,7 +6440,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, return; } uc16 character = atom->data()[j]; - if (bm->compiler()->ignore_case()) { + if (IgnoreCase(atom->flags())) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; int length = GetCaseIndependentLetters( isolate, character, bm->max_char() == String::kMaxOneByteCharCode, @@ -6602,9 +6592,9 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) { target->Accept(this); } - RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, - RegExpNode* on_success) { + RegExpNode* on_success, + JSRegExp::Flags flags) { // If the regexp matching starts within a surrogate pair, step back // to the lead surrogate and start matching from there. DCHECK(!compiler->read_backward()); @@ -6619,11 +6609,11 @@ RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpNode* step_back = TextNode::CreateForCharacterRanges( - zone, lead_surrogates, true, on_success); + zone, lead_surrogates, true, on_success, flags); RegExpLookaround::Builder builder(true, step_back, stack_register, position_register); RegExpNode* match_trail = TextNode::CreateForCharacterRanges( - zone, trail_surrogates, false, builder.on_match_success()); + zone, trail_surrogates, false, builder.on_match_success(), flags); optional_step_back->AddAlternative( GuardedAlternative(builder.ForMatch(match_trail))); @@ -6640,12 +6630,10 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { return IrregexpRegExpTooBig(isolate); } - bool ignore_case = flags & JSRegExp::kIgnoreCase; - bool is_sticky = flags & JSRegExp::kSticky; - bool is_global = flags & JSRegExp::kGlobal; - bool is_unicode = flags & JSRegExp::kUnicode; - RegExpCompiler compiler(isolate, zone, data->capture_count, flags, - is_one_byte); + bool is_sticky = IsSticky(flags); + bool is_global = IsGlobal(flags); + bool is_unicode = IsUnicode(flags); + RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte); if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); @@ -6673,9 +6661,11 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( if (!is_start_anchored && !is_sticky) { // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning or sticky. + JSRegExp::Flags default_flags = JSRegExp::Flags(); RegExpNode* loop_node = RegExpQuantifier::ToNode( - 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), - &compiler, captured_body, data->contains_anchor); + 0, RegExpTree::kInfinity, false, + new (zone) RegExpCharacterClass('*', default_flags), &compiler, + captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start @@ -6683,26 +6673,27 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); first_step_node->AddAlternative(GuardedAlternative(captured_body)); first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( - new (zone) RegExpCharacterClass('*'), false, loop_node))); + new (zone) RegExpCharacterClass('*', default_flags), false, + loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. if (node != nullptr) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); } - } else if (compiler.unicode() && (is_global || is_sticky)) { - node = OptionallyStepBackToLeadSurrogate(&compiler, node); + } else if (is_unicode && (is_global || is_sticky)) { + node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags); } if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - Analysis analysis(isolate, flags, is_one_byte); + Analysis analysis(isolate, is_one_byte); analysis.EnsureAnalyzed(node); if (analysis.has_failed()) { const char* error_message = analysis.error_message(); diff --git a/src/regexp/jsregexp.h b/src/regexp/jsregexp.h index d485045ff0..021c59d3e4 100644 --- a/src/regexp/jsregexp.h +++ b/src/regexp/jsregexp.h @@ -21,6 +21,36 @@ class RegExpNode; class RegExpTree; class BoyerMooreLookahead; +inline bool IgnoreCase(JSRegExp::Flags flags) { + return (flags & JSRegExp::kIgnoreCase) != 0; +} + +inline bool IsUnicode(JSRegExp::Flags flags) { + return (flags & JSRegExp::kUnicode) != 0; +} + +inline bool IsSticky(JSRegExp::Flags flags) { + return (flags & JSRegExp::kSticky) != 0; +} + +inline bool IsGlobal(JSRegExp::Flags flags) { + return (flags & JSRegExp::kGlobal) != 0; +} + +inline bool DotAll(JSRegExp::Flags flags) { + return (flags & JSRegExp::kDotAll) != 0; +} + +inline bool Multiline(JSRegExp::Flags flags) { + return (flags & JSRegExp::kMultiline) != 0; +} + +inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { + // Both unicode and ignore_case flags are set. We need to use ICU to find + // the closure over case equivalents. + return IsUnicode(flags) && IgnoreCase(flags); +} + class RegExpImpl { public: // Whether V8 is compiled with native regexp support or not. @@ -495,9 +525,7 @@ class RegExpNode: public ZoneObject { // If we know that the input is one-byte then there are some nodes that can // never match. This method returns a node that can be substituted for // itself, or nullptr if the node can never match. - virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) { - return this; - } + virtual RegExpNode* FilterOneByte(int depth) { return this; } // Helper for FilterOneByte. RegExpNode* replacement() { DCHECK(info()->replacement_calculated); @@ -569,7 +597,7 @@ class SeqRegExpNode: public RegExpNode { : RegExpNode(on_success->zone()), on_success_(on_success) { } RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } - virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(int depth); virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start) { on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); @@ -577,7 +605,7 @@ class SeqRegExpNode: public RegExpNode { } protected: - RegExpNode* FilterSuccessor(int depth, bool ignore_case); + RegExpNode* FilterSuccessor(int depth); private: RegExpNode* on_success_; @@ -682,13 +710,15 @@ class TextNode: public SeqRegExpNode { static TextNode* CreateForCharacterRanges(Zone* zone, ZoneList* ranges, bool read_backward, - RegExpNode* on_success); + RegExpNode* on_success, + JSRegExp::Flags flags); // Create TextNode for a surrogate pair with a range given for the // lead and the trail surrogate each. static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead, CharacterRange trail, bool read_backward, - RegExpNode* on_success); + RegExpNode* on_success, + JSRegExp::Flags flags); virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); @@ -705,7 +735,7 @@ class TextNode: public SeqRegExpNode { virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start); void CalculateOffsets(); - virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(int depth); private: enum TextEmitPassType { @@ -715,7 +745,7 @@ class TextNode: public SeqRegExpNode { CASE_CHARACTER_MATCH, // Case-independent single character check. CHARACTER_CLASS_MATCH // Character class. }; - static bool SkipPass(int pass, bool ignore_case); + static bool SkipPass(TextEmitPassType pass, bool ignore_case); static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; static const int kLastPass = CHARACTER_CLASS_MATCH; void TextEmitPass(RegExpCompiler* compiler, @@ -779,11 +809,12 @@ class AssertionNode: public SeqRegExpNode { class BackReferenceNode: public SeqRegExpNode { public: - BackReferenceNode(int start_reg, int end_reg, bool read_backward, - RegExpNode* on_success) + BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags, + bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg), + flags_(flags), read_backward_(read_backward) {} virtual void Accept(NodeVisitor* visitor); int start_register() { return start_reg_; } @@ -805,6 +836,7 @@ class BackReferenceNode: public SeqRegExpNode { private: int start_reg_; int end_reg_; + JSRegExp::Flags flags_; bool read_backward_; }; @@ -929,7 +961,7 @@ class ChoiceNode: public RegExpNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return true; } - virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(int depth); virtual bool read_backward() { return false; } protected: @@ -1001,7 +1033,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return !is_first; } - virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(int depth); }; @@ -1028,7 +1060,7 @@ class LoopChoiceNode: public ChoiceNode { bool body_can_be_zero_length() { return body_can_be_zero_length_; } virtual bool read_backward() { return read_backward_; } virtual void Accept(NodeVisitor* visitor); - virtual RegExpNode* FilterOneByte(int depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(int depth); private: // AddAlternative is made private for loop nodes because alternatives @@ -1435,11 +1467,8 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) // +-------+ +------------+ class Analysis: public NodeVisitor { public: - Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte) - : isolate_(isolate), - flags_(flags), - is_one_byte_(is_one_byte), - error_message_(nullptr) {} + Analysis(Isolate* isolate, bool is_one_byte) + : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {} void EnsureAnalyzed(RegExpNode* node); #define DECLARE_VISIT(Type) \ @@ -1459,12 +1488,8 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) Isolate* isolate() const { return isolate_; } - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } - private: Isolate* isolate_; - JSRegExp::Flags flags_; bool is_one_byte_; const char* error_message_; diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h index 14e43b65a6..e60621f8b6 100644 --- a/src/regexp/regexp-ast.h +++ b/src/regexp/regexp-ast.h @@ -6,6 +6,7 @@ #define V8_REGEXP_REGEXP_AST_H_ #include "src/objects.h" +#include "src/objects/js-regexp.h" #include "src/objects/string.h" #include "src/utils.h" #include "src/zone/zone-containers.h" @@ -144,7 +145,7 @@ class CharacterSet final BASE_EMBEDDED { explicit CharacterSet(ZoneList* ranges) : ranges_(ranges), standard_set_type_(0) {} ZoneList* ranges(Zone* zone); - uc16 standard_set_type() { return standard_set_type_; } + uc16 standard_set_type() const { return standard_set_type_; } void set_standard_set_type(uc16 special_set_type) { standard_set_type_ = special_set_type; } @@ -274,7 +275,8 @@ class RegExpAssertion final : public RegExpTree { BOUNDARY, NON_BOUNDARY }; - explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} + RegExpAssertion(AssertionType type, JSRegExp::Flags flags) + : assertion_type_(type), flags_(flags) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpAssertion* AsAssertion() override; @@ -286,7 +288,8 @@ class RegExpAssertion final : public RegExpTree { AssertionType assertion_type() { return assertion_type_; } private: - AssertionType assertion_type_; + const AssertionType assertion_type_; + const JSRegExp::Flags flags_; }; @@ -300,12 +303,18 @@ class RegExpCharacterClass final : public RegExpTree { NEGATED = 1 << 0, CONTAINS_SPLIT_SURROGATE = 1 << 1, }; - typedef base::Flags Flags; + typedef base::Flags CharacterClassFlags; - explicit RegExpCharacterClass(ZoneList* ranges, - Flags flags = Flags()) - : set_(ranges), flags_(flags) {} - explicit RegExpCharacterClass(uc16 type) : set_(type), flags_(0) {} + RegExpCharacterClass( + ZoneList* ranges, JSRegExp::Flags flags, + CharacterClassFlags character_class_flags = CharacterClassFlags()) + : set_(ranges), + flags_(flags), + character_class_flags_(character_class_flags) {} + RegExpCharacterClass(uc16 type, JSRegExp::Flags flags) + : set_(type), + flags_(flags), + character_class_flags_(CharacterClassFlags()) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpCharacterClass* AsCharacterClass() override; @@ -332,22 +341,25 @@ class RegExpCharacterClass final : public RegExpTree { // D : non-ASCII digit // . : non-newline // * : All characters, for advancing unanchored regexp - uc16 standard_type() { return set_.standard_set_type(); } + uc16 standard_type() const { return set_.standard_set_type(); } ZoneList* ranges(Zone* zone) { return set_.ranges(zone); } - bool is_negated() const { return (flags_ & NEGATED) != 0; } + bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } + JSRegExp::Flags flags() const { return flags_; } bool contains_split_surrogate() const { - return (flags_ & CONTAINS_SPLIT_SURROGATE) != 0; + return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; } private: CharacterSet set_; - const Flags flags_; + const JSRegExp::Flags flags_; + const CharacterClassFlags character_class_flags_; }; class RegExpAtom final : public RegExpTree { public: - explicit RegExpAtom(Vector data) : data_(data) {} + explicit RegExpAtom(Vector data, JSRegExp::Flags flags) + : data_(data), flags_(flags) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpAtom* AsAtom() override; @@ -358,9 +370,12 @@ class RegExpAtom final : public RegExpTree { void AppendToText(RegExpText* text, Zone* zone) override; Vector data() { return data_; } int length() { return data_.length(); } + JSRegExp::Flags flags() const { return flags_; } + bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } private: Vector data_; + const JSRegExp::Flags flags_; }; @@ -532,9 +547,10 @@ class RegExpLookaround final : public RegExpTree { class RegExpBackReference final : public RegExpTree { public: - RegExpBackReference() : capture_(nullptr), name_(nullptr) {} - explicit RegExpBackReference(RegExpCapture* capture) - : capture_(capture), name_(nullptr) {} + explicit RegExpBackReference(JSRegExp::Flags flags) + : capture_(nullptr), name_(nullptr), flags_(flags) {} + RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags) + : capture_(capture), name_(nullptr), flags_(flags) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; RegExpBackReference* AsBackReference() override; @@ -552,6 +568,7 @@ class RegExpBackReference final : public RegExpTree { private: RegExpCapture* capture_; const ZoneVector* name_; + const JSRegExp::Flags flags_; }; diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc index 7a8033a2d8..832d4de2c2 100644 --- a/src/regexp/regexp-parser.cc +++ b/src/regexp/regexp-parser.cc @@ -31,10 +31,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle* error, named_back_references_(nullptr), in_(in), current_(kEndMarker), - dotall_(flags & JSRegExp::kDotAll), - ignore_case_(flags & JSRegExp::kIgnoreCase), - multiline_(flags & JSRegExp::kMultiline), - unicode_(flags & JSRegExp::kUnicode), + top_level_flags_(flags), next_pos_(0), captures_started_(0), capture_count_(0), @@ -44,7 +41,6 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle* error, is_scanned_for_captures_(false), has_named_captures_(false), failed_(false) { - DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); Advance(); } @@ -183,7 +179,7 @@ RegExpTree* RegExpParser::ParsePattern() { RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, - 0, nullptr, ignore_case(), unicode(), zone()); + 0, nullptr, top_level_flags_, zone()); RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); @@ -253,12 +249,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("Nothing to repeat")); case '^': { Advance(); - if (multiline()) { - builder->AddAssertion( - new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); + if (builder->multiline()) { + builder->AddAssertion(new (zone()) RegExpAssertion( + RegExpAssertion::START_OF_LINE, builder->flags())); } else { - builder->AddAssertion( - new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); + builder->AddAssertion(new (zone()) RegExpAssertion( + RegExpAssertion::START_OF_INPUT, builder->flags())); set_contains_anchor(); } continue; @@ -266,9 +262,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '$': { Advance(); RegExpAssertion::AssertionType assertion_type = - multiline() ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; - builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); + builder->multiline() ? RegExpAssertion::END_OF_LINE + : RegExpAssertion::END_OF_INPUT; + builder->AddAssertion( + new (zone()) RegExpAssertion(assertion_type, builder->flags())); continue; } case '.': { @@ -276,7 +273,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { ZoneList* ranges = new (zone()) ZoneList(2, zone()); - if (dotall()) { + if (builder->dotall()) { // Everything. DCHECK(FLAG_harmony_regexp_dotall); CharacterRange::AddClassEscape('*', ranges, false, zone()); @@ -285,78 +282,18 @@ RegExpTree* RegExpParser::ParseDisjunction() { CharacterRange::AddClassEscape('.', ranges, false, zone()); } - RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges); + RegExpCharacterClass* cc = + new (zone()) RegExpCharacterClass(ranges, builder->flags()); builder->AddCharacterClass(cc); break; } case '(': { - SubexpressionType subexpr_type = CAPTURE; - RegExpLookaround::Type lookaround_type = state->lookaround_type(); - bool is_named_capture = false; - Advance(); - if (current() == '?') { - switch (Next()) { - case ':': - subexpr_type = GROUPING; - Advance(2); - break; - case '=': - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = POSITIVE_LOOKAROUND; - Advance(2); - break; - case '!': - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = NEGATIVE_LOOKAROUND; - Advance(2); - break; - case '<': - Advance(); - if (FLAG_harmony_regexp_lookbehind) { - if (Next() == '=') { - subexpr_type = POSITIVE_LOOKAROUND; - lookaround_type = RegExpLookaround::LOOKBEHIND; - Advance(2); - break; - } else if (Next() == '!') { - subexpr_type = NEGATIVE_LOOKAROUND; - lookaround_type = RegExpLookaround::LOOKBEHIND; - Advance(2); - break; - } - } - if (FLAG_harmony_regexp_named_captures) { - has_named_captures_ = true; - is_named_capture = true; - Advance(); - break; - } - // Fall through. - default: - return ReportError(CStrVector("Invalid group")); - } - } - - const ZoneVector* capture_name = nullptr; - if (subexpr_type == CAPTURE) { - if (captures_started_ >= kMaxCaptures) { - return ReportError(CStrVector("Too many captures")); - } - captures_started_++; - - if (is_named_capture) { - capture_name = ParseCaptureGroupName(CHECK_FAILED); - } - } - // Store current state and begin new disjunction parsing. - state = new (zone()) RegExpParserState( - state, subexpr_type, lookaround_type, captures_started_, - capture_name, ignore_case(), unicode(), zone()); + state = ParseOpenParenthesis(state CHECK_FAILED); builder = state->builder(); continue; } case '[': { - RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); + RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); builder->AddCharacterClass(cc->AsCharacterClass()); break; } @@ -368,13 +305,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("\\ at end of pattern")); case 'b': Advance(2); - builder->AddAssertion( - new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); + builder->AddAssertion(new (zone()) RegExpAssertion( + RegExpAssertion::BOUNDARY, builder->flags())); continue; case 'B': Advance(2); - builder->AddAssertion( - new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); + builder->AddAssertion(new (zone()) RegExpAssertion( + RegExpAssertion::NON_BOUNDARY, builder->flags())); continue; // AtomEscape :: // CharacterClassEscape @@ -391,10 +328,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(2); ZoneList* ranges = new (zone()) ZoneList(2, zone()); - CharacterRange::AddClassEscape(c, ranges, - unicode() && ignore_case(), zone()); + CharacterRange::AddClassEscape( + c, ranges, unicode() && builder->ignore_case(), zone()); RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(ranges); + new (zone()) RegExpCharacterClass(ranges, builder->flags()); builder->AddCharacterClass(cc); break; } @@ -410,7 +347,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("Invalid property name")); } RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(ranges); + new (zone()) RegExpCharacterClass(ranges, builder->flags()); builder->AddCharacterClass(cc); } else { // With /u, no identity escapes except for syntax characters @@ -443,7 +380,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddEmpty(); } else { RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = new (zone()) RegExpBackReference(capture); + RegExpTree* atom = + new (zone()) RegExpBackReference(capture, builder->flags()); builder->AddAtom(atom); } break; @@ -638,6 +576,143 @@ RegExpTree* RegExpParser::ParseDisjunction() { } } +RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( + RegExpParserState* state) { + RegExpLookaround::Type lookaround_type = state->lookaround_type(); + bool is_named_capture = false; + JSRegExp::Flags switch_on = JSRegExp::kNone; + JSRegExp::Flags switch_off = JSRegExp::kNone; + const ZoneVector* capture_name = nullptr; + SubexpressionType subexpr_type = CAPTURE; + Advance(); + if (current() == '?') { + switch (Next()) { + case ':': + Advance(2); + subexpr_type = GROUPING; + break; + case '=': + Advance(2); + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = POSITIVE_LOOKAROUND; + break; + case '!': + Advance(2); + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = NEGATIVE_LOOKAROUND; + break; + case '-': + case 'i': + case 's': + case 'm': { + if (!FLAG_regexp_mode_modifiers || + (Next() == 's' && !FLAG_harmony_regexp_dotall)) { + ReportError(CStrVector("Invalid group")); + return nullptr; + } + Advance(); + bool flags_sense = true; // Switching on flags. + while (subexpr_type != GROUPING) { + switch (current()) { + case '-': + if (!flags_sense) { + ReportError(CStrVector("Multiple dashes in flag group")); + return nullptr; + } + flags_sense = false; + Advance(); + continue; + case 's': + if (!FLAG_harmony_regexp_dotall) { + ReportError(CStrVector("Invalid group")); + return nullptr; + } + // Fall through. + case 'i': + case 'm': { + JSRegExp::Flags bit = JSRegExp::kUnicode; + if (current() == 'i') bit = JSRegExp::kIgnoreCase; + if (current() == 'm') bit = JSRegExp::kMultiline; + if (current() == 's') bit = JSRegExp::kDotAll; + if (((switch_on | switch_off) & bit) != 0) { + ReportError(CStrVector("Repeated flag in flag group")); + return nullptr; + } + if (flags_sense) { + switch_on |= bit; + } else { + switch_off |= bit; + } + Advance(); + continue; + } + case ')': { + Advance(); + state->builder() + ->FlushText(); // Flush pending text using old flags. + // These (?i)-style flag switches don't put us in a subexpression + // at all, they just modify the flags in the rest of the current + // subexpression. + JSRegExp::Flags flags = + (state->builder()->flags() | switch_on) & ~switch_off; + state->builder()->set_flags(flags); + return state; + } + case ':': + Advance(); + subexpr_type = GROUPING; // Will break us out of the outer loop. + continue; + default: + ReportError(CStrVector("Invalid flag group")); + return nullptr; + } + } + break; + } + case '<': + Advance(); + if (FLAG_harmony_regexp_lookbehind) { + if (Next() == '=') { + Advance(2); + lookaround_type = RegExpLookaround::LOOKBEHIND; + subexpr_type = POSITIVE_LOOKAROUND; + break; + } else if (Next() == '!') { + Advance(2); + lookaround_type = RegExpLookaround::LOOKBEHIND; + subexpr_type = NEGATIVE_LOOKAROUND; + break; + } + } + if (FLAG_harmony_regexp_named_captures) { + is_named_capture = true; + has_named_captures_ = true; + Advance(); + break; + } + // Fall through. + default: + ReportError(CStrVector("Invalid group")); + return nullptr; + } + } + if (subexpr_type == CAPTURE) { + if (captures_started_ >= kMaxCaptures) { + ReportError(CStrVector("Too many captures")); + return nullptr; + } + captures_started_++; + + if (is_named_capture) { + capture_name = ParseCaptureGroupName(CHECK_FAILED); + } + } + JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off; + // Store current state and begin new disjunction parsing. + return new (zone()) + RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, + capture_name, flags, zone()); +} #ifdef DEBUG // Currently only used in an DCHECK. @@ -855,7 +930,8 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, if (state->IsInsideCaptureGroup(name)) { builder->AddEmpty(); } else { - RegExpBackReference* atom = new (zone()) RegExpBackReference(); + RegExpBackReference* atom = + new (zone()) RegExpBackReference(builder->flags()); atom->set_name(name); builder->AddAtom(atom); @@ -1525,7 +1601,7 @@ void RegExpParser::ParseClassEscape(ZoneList* ranges, } } -RegExpTree* RegExpParser::ParseCharacterClass() { +RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { static const char* kUnterminated = "Unterminated character class"; static const char* kRangeInvalid = "Invalid character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; @@ -1539,7 +1615,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() { } ZoneList* ranges = new (zone()) ZoneList(2, zone()); - bool add_unicode_case_equivalents = unicode() && ignore_case(); + bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); while (has_more() && current() != ']') { uc32 char_1, char_2; bool is_class_1, is_class_2; @@ -1586,9 +1662,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() { ranges->Add(CharacterRange::Everything(), zone()); is_negated = !is_negated; } - RegExpCharacterClass::Flags flags; - if (is_negated) flags = RegExpCharacterClass::NEGATED; - return new (zone()) RegExpCharacterClass(ranges, flags); + RegExpCharacterClass::CharacterClassFlags character_class_flags; + if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; + return new (zone()) + RegExpCharacterClass(ranges, builder->flags(), character_class_flags); } @@ -1622,11 +1699,10 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, return !parser.failed(); } -RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) +RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) : zone_(zone), pending_empty_(false), - ignore_case_(ignore_case), - unicode_(unicode), + flags_(flags), characters_(nullptr), pending_surrogate_(kNoPendingSurrogate), terms_(), @@ -1662,7 +1738,7 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { surrogate_pair.Add(lead_surrogate, zone()); surrogate_pair.Add(trail_surrogate, zone()); RegExpAtom* atom = - new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); + new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_); AddAtom(atom); } } else { @@ -1686,7 +1762,8 @@ void RegExpBuilder::FlushCharacters() { FlushPendingSurrogate(); pending_empty_ = false; if (characters_ != nullptr) { - RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); + RegExpTree* atom = + new (zone()) RegExpAtom(characters_->ToConstVector(), flags_); characters_ = nullptr; text_.Add(atom, zone()); LAST(ADD_ATOM); @@ -1762,7 +1839,7 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { AddTerm(new (zone()) RegExpCharacterClass( - CharacterRange::List(zone(), CharacterRange::Singleton(c)))); + CharacterRange::List(zone(), CharacterRange::Singleton(c)), flags_)); } @@ -1880,11 +1957,11 @@ bool RegExpBuilder::AddQuantifierToAtom( int num_chars = char_vector.length(); if (num_chars > 1) { Vector prefix = char_vector.SubVector(0, num_chars - 1); - text_.Add(new (zone()) RegExpAtom(prefix), zone()); + text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone()); char_vector = char_vector.SubVector(num_chars - 1, num_chars); } characters_ = nullptr; - atom = new (zone()) RegExpAtom(char_vector); + atom = new (zone()) RegExpAtom(char_vector, flags_); FlushText(); } else if (text_.length() > 0) { DCHECK(last_added_ == ADD_ATOM); diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h index 228b1d83e4..56d4ac8599 100644 --- a/src/regexp/regexp-parser.h +++ b/src/regexp/regexp-parser.h @@ -99,7 +99,7 @@ class BufferedZoneList { // Accumulates RegExp atoms and assertions into lists of terms and alternatives. class RegExpBuilder : public ZoneObject { public: - RegExpBuilder(Zone* zone, bool ignore_case, bool unicode); + RegExpBuilder(Zone* zone, JSRegExp::Flags flags); void AddCharacter(uc16 character); void AddUnicodeCharacter(uc32 character); void AddEscapedUnicodeCharacter(uc32 character); @@ -114,7 +114,14 @@ class RegExpBuilder : public ZoneObject { void NewAlternative(); // '|' bool AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type); + void FlushText(); RegExpTree* ToRegExp(); + JSRegExp::Flags flags() const { return flags_; } + void set_flags(JSRegExp::Flags flags) { flags_ = flags; } + + bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } + bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } + bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } private: static const uc16 kNoPendingSurrogate = 0; @@ -122,18 +129,15 @@ class RegExpBuilder : public ZoneObject { void AddTrailSurrogate(uc16 trail_surrogate); void FlushPendingSurrogate(); void FlushCharacters(); - void FlushText(); void FlushTerms(); bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); bool NeedsDesugaringForIgnoreCase(uc32 c); Zone* zone() const { return zone_; } - bool ignore_case() const { return ignore_case_; } - bool unicode() const { return unicode_; } + bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } Zone* zone_; bool pending_empty_; - bool ignore_case_; - bool unicode_; + JSRegExp::Flags flags_; ZoneList* characters_; uc16 pending_surrogate_; BufferedZoneList terms_; @@ -159,7 +163,6 @@ class RegExpParser BASE_EMBEDDED { RegExpTree* ParsePattern(); RegExpTree* ParseDisjunction(); RegExpTree* ParseGroup(); - RegExpTree* ParseCharacterClass(); // Parses a {...,...} quantifier and stores the range in the given // out parameters. @@ -175,6 +178,7 @@ class RegExpParser BASE_EMBEDDED { bool ParseUnicodeEscape(uc32* value); bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); bool ParsePropertyClass(ZoneList* result, bool negate); + RegExpTree* ParseCharacterClass(const RegExpBuilder* state); uc32 ParseOctalLiteral(); @@ -205,10 +209,9 @@ class RegExpParser BASE_EMBEDDED { int captures_started() { return captures_started_; } int position() { return next_pos_ - 1; } bool failed() { return failed_; } - bool dotall() const { return dotall_; } - bool ignore_case() const { return ignore_case_; } - bool multiline() const { return multiline_; } - bool unicode() const { return unicode_; } + // The Unicode flag can't be changed using in-regexp syntax, so it's OK to + // just read the initial flag value here. + bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } static bool IsSyntaxCharacterOrSlash(uc32 c); @@ -226,34 +229,35 @@ class RegExpParser BASE_EMBEDDED { class RegExpParserState : public ZoneObject { public: + // Push a state on the stack. RegExpParserState(RegExpParserState* previous_state, SubexpressionType group_type, RegExpLookaround::Type lookaround_type, int disjunction_capture_index, - const ZoneVector* capture_name, bool ignore_case, - bool unicode, Zone* zone) + const ZoneVector* capture_name, + JSRegExp::Flags flags, Zone* zone) : previous_state_(previous_state), - builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)), + builder_(new (zone) RegExpBuilder(zone, flags)), group_type_(group_type), lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index), capture_name_(capture_name) {} // Parser state of containing expression, if any. - RegExpParserState* previous_state() { return previous_state_; } + RegExpParserState* previous_state() const { return previous_state_; } bool IsSubexpression() { return previous_state_ != nullptr; } // RegExpBuilder building this regexp's AST. - RegExpBuilder* builder() { return builder_; } + RegExpBuilder* builder() const { return builder_; } // Type of regexp being parsed (parenthesized group or entire regexp). - SubexpressionType group_type() { return group_type_; } + SubexpressionType group_type() const { return group_type_; } // Lookahead or Lookbehind. - RegExpLookaround::Type lookaround_type() { return lookaround_type_; } + RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } // Index in captures array of first capture in this sub-expression, if any. // Also the capture index of this sub-expression itself, if group_type // is CAPTURE. - int capture_index() { return disjunction_capture_index_; } + int capture_index() const { return disjunction_capture_index_; } // The name of the current sub-expression, if group_type is CAPTURE. Only // used for named captures. - const ZoneVector* capture_name() { return capture_name_; } + const ZoneVector* capture_name() const { return capture_name_; } bool IsNamedCapture() const { return capture_name_ != nullptr; } @@ -264,17 +268,17 @@ class RegExpParser BASE_EMBEDDED { private: // Linked list implementation of stack of states. - RegExpParserState* previous_state_; + RegExpParserState* const previous_state_; // Builder for the stored disjunction. - RegExpBuilder* builder_; + RegExpBuilder* const builder_; // Stored disjunction type (capture, look-ahead or grouping), if any. - SubexpressionType group_type_; + const SubexpressionType group_type_; // Stored read direction. - RegExpLookaround::Type lookaround_type_; + const RegExpLookaround::Type lookaround_type_; // Stored disjunction's capture index (if any). - int disjunction_capture_index_; + const int disjunction_capture_index_; // Stored capture name (if any). - const ZoneVector* capture_name_; + const ZoneVector* const capture_name_; }; // Return the 1-indexed RegExpCapture object, allocate if necessary. @@ -291,6 +295,7 @@ class RegExpParser BASE_EMBEDDED { bool ParseNamedBackReference(RegExpBuilder* builder, RegExpParserState* state); + RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); // After the initial parsing pass, patch corresponding RegExpCapture objects // into all RegExpBackReferences. This is done after initial parsing in order @@ -323,10 +328,10 @@ class RegExpParser BASE_EMBEDDED { ZoneList* named_back_references_; FlatStringReader* in_; uc32 current_; - bool dotall_; - bool ignore_case_; - bool multiline_; - bool unicode_; + // These are the flags specified outside the regexp syntax ie after the + // terminating '/' or in the second argument to the constructor. The current + // flags are stored on the RegExpBuilder. + JSRegExp::Flags top_level_flags_; int next_pos_; int captures_started_; int capture_count_; // Only valid after we have scanned for captures. diff --git a/test/mjsunit/mjsunit.status b/test/mjsunit/mjsunit.status index f4467b6dab..3829ab1796 100644 --- a/test/mjsunit/mjsunit.status +++ b/test/mjsunit/mjsunit.status @@ -158,6 +158,8 @@ 'es6/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]], 'regress/regress-5036': [PASS, ['no_i18n == True', FAIL]], 'es7/regexp-ui-word': [PASS, ['no_i18n == True', FAIL]], + 'regexp-modifiers-i18n': [PASS, ['no_i18n == True', FAIL]], + 'regexp-modifiers-autogenerated-i18n': [PASS, ['no_i18n == True', FAIL]], # desugaring regexp property class relies on ICU. 'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]], diff --git a/test/mjsunit/regexp-modifiers-autogenerated-i18n.js b/test/mjsunit/regexp-modifiers-autogenerated-i18n.js new file mode 100644 index 0000000000..18e086c339 --- /dev/null +++ b/test/mjsunit/regexp-modifiers-autogenerated-i18n.js @@ -0,0 +1,81 @@ +// Copyright 2017 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Flags: --regexp-mode-modifiers --harmony-regexp-property + +// These regexps are just grepped out of the other tests we already have +// and the syntax changed from out-of-line i flag to inline i flag. + +// These tests won't all run on the noi18n build of V8. + +assertTrue(/(?i)\u00e5/u.test("\u00c5")); +assertTrue(/(?i)\u00e5/u.test("\u00e5")); +assertTrue(/(?i)\u00c5/u.test("\u00e5")); +assertTrue(/(?i)\u00c5/u.test("\u00c5")); +assertTrue(/(?i)\u212b/u.test("\u212b")); +assertFalse(/(?i)\u00df/u.test("SS")); +assertFalse(/(?i)\u1f8d/u.test("\u1f05\u03b9")); +assertTrue(/(?i)\u1f6b/u.test("\u1f63")); +assertTrue(/(?i)\u00e5/u.test("\u212b")); +assertTrue(/(?i)\u00e5/u.test("\u00c5")); +assertTrue(/(?i)\u00e5/u.test("\u00e5")); +assertTrue(/(?i)\u00e5/u.test("\u212b")); +assertTrue(/(?i)\u00c5/u.test("\u00e5")); +assertTrue(/(?i)\u00c5/u.test("\u212b")); +assertTrue(/(?i)\u00c5/u.test("\u00c5")); +assertTrue(/(?i)\u212b/u.test("\u00c5")); +assertTrue(/(?i)\u212b/u.test("\u00e5")); +assertTrue(/(?i)\u212b/u.test("\u212b")); +assertTrue(/(?i)\u{10400}/u.test("\u{10428}")); +assertTrue(/(?i)\ud801\udc00/u.test("\u{10428}")); +assertTrue(/(?i)[\u{10428}]/u.test("\u{10400}")); +assertTrue(/(?i)[\ud801\udc28]/u.test("\u{10400}")); +assertFalse(/(?i)\u00df/u.test("SS")); +assertFalse(/(?i)\u1f8d/u.test("\u1f05\u03b9")); +assertTrue(/(?i)\u1f8d/u.test("\u1f85")); +assertTrue(/(?i)\u1f6b/u.test("\u1f63")); +assertTrue(/(?i)\u00e5\u00e5\u00e5/u.test("\u212b\u00e5\u00c5")); +assertTrue(/(?i)AB\u{10400}/u.test("ab\u{10428}")); +assertTrue(/(?i)\w/u.test('\u017F')); +assertTrue(/(?i)\w/u.test('\u212A')); +assertFalse(/(?i)\W/u.test('\u017F')); +assertFalse(/(?i)\W/u.test('\u212A')); +assertFalse(/(?i)\W/u.test('s')); +assertFalse(/(?i)\W/u.test('S')); +assertFalse(/(?i)\W/u.test('K')); +assertFalse(/(?i)\W/u.test('k')); +assertTrue(/(?i)[\w]/u.test('\u017F')); +assertTrue(/(?i)[\w]/u.test('\u212A')); +assertFalse(/(?i)[\W]/u.test('\u017F')); +assertFalse(/(?i)[\W]/u.test('\u212A')); +assertFalse(/(?i)[\W]/u.test('s')); +assertFalse(/(?i)[\W]/u.test('S')); +assertFalse(/(?i)[\W]/u.test('K')); +assertFalse(/(?i)[\W]/u.test('k')); +assertTrue(/(?i)\b/u.test('\u017F')); +assertTrue(/(?i)\b/u.test('\u212A')); +assertTrue(/(?i)\b/u.test('s')); +assertTrue(/(?i)\b/u.test('S')); +assertFalse(/(?i)\B/u.test('\u017F')); +assertFalse(/(?i)\B/u.test('\u212A')); +assertFalse(/(?i)\B/u.test('s')); +assertFalse(/(?i)\B/u.test('S')); +assertFalse(/(?i)\B/u.test('K')); +assertFalse(/(?i)\B/u.test('k')); +assertTrue(/(?i)\p{Ll}/u.test("a")); +assertTrue(/(?i)\p{Ll}/u.test("\u{118D4}")); +assertTrue(/(?i)\p{Ll}/u.test("A")); +assertTrue(/(?i)\p{Ll}/u.test("\u{118B4}")); +assertTrue(/(?i)\P{Ll}/u.test("a")); +assertTrue(/(?i)\P{Ll}/u.test("\u{118D4}")); +assertTrue(/(?i)\P{Ll}/u.test("A")); +assertTrue(/(?i)\P{Ll}/u.test("\u{118B4}")); +assertTrue(/(?i)\p{Lu}/u.test("a")); +assertTrue(/(?i)\p{Lu}/u.test("\u{118D4}")); +assertTrue(/(?i)\p{Lu}/u.test("A")); +assertTrue(/(?i)\p{Lu}/u.test("\u{118B4}")); +assertTrue(/(?i)\P{Lu}/u.test("a")); +assertTrue(/(?i)\P{Lu}/u.test("\u{118D4}")); +assertTrue(/(?i)\P{Lu}/u.test("A")); +assertTrue(/(?i)\P{Lu}/u.test("\u{118B4}")); diff --git a/test/mjsunit/regexp-modifiers-autogenerated.js b/test/mjsunit/regexp-modifiers-autogenerated.js new file mode 100644 index 0000000000..e74ea8b384 --- /dev/null +++ b/test/mjsunit/regexp-modifiers-autogenerated.js @@ -0,0 +1,74 @@ +// Copyright 2017 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Flags: --regexp-mode-modifiers --harmony-regexp-property + +// These regexps are just grepped out of the other tests we already have +// and the syntax changed from out-of-line i flag to inline i flag. + +assertFalse(/(?i)x(...)\1/.test("x\u03a3\u03c2\u03c3\u03c2\u03c3")); +assertTrue(/(?i)\u03a3((?:))\1\1x/.test("\u03c2x"), "backref-UC16-empty"); +assertTrue(/(?i)x(?:...|(...))\1x/.test("x\u03a3\u03c2\u03c3x")); +assertTrue(/(?i)x(?:...|(...))\1x/.test("x\u03c2\u03c3\u039b\u03a3\u03c2\u03bbx")); +assertFalse(/(?i)\xc1/.test('fooA'), "quickcheck-uc16-pattern-ascii-subject"); +assertFalse(/(?i)x(...)\1/.test("xaaaaa"), "backref-ASCII-short"); +assertTrue(/(?i)x((?:))\1\1x/.test("xx"), "backref-ASCII-empty"); +assertTrue(/(?i)x(?:...|(...))\1x/.test("xabcx"), "backref-ASCII-uncaptured"); +assertTrue(/(?i)x(?:...|(...))\1x/.test("xabcABCx"), "backref-ASCII-backtrack"); +assertFalse(/(?i)f/.test('b')); +assertFalse(/(?i)[abc]f/.test('x')); +assertFalse(/(?i)[abc]f/.test('xa')); +assertFalse(/(?i)[abc] new RegExp("foo(?i:")); +assertThrows(() => new RegExp("foo(?--i)")); +assertThrows(() => new RegExp("foo(?i-i)")); + +assertThrows(() => new RegExp("foo(?m:")); +assertThrows(() => new RegExp("foo(?--m)")); +assertThrows(() => new RegExp("foo(?m-m)")); + +var re = /^\s(?m)^.$\s(?-m)$/; +assertTrue(re.test("\n.\n")); +assertFalse(re.test(" .\n")); +assertFalse(re.test("\n. ")); +assertFalse(re.test(" . ")); +assertFalse(re.test("_\n.\n")); +assertFalse(re.test("\n.\n_")); +assertFalse(re.test("_\n.\n_")); + +assertEquals(["abcd", "d"], /a.*?(.)(?i)\b/.exec('abcd\u017F cd')); +assertEquals(["abcd", "d"], /a.*?(.)(?i)\b/.exec('abcd\u212A cd')); + +assertEquals(["a\u017F ", " "], /a.*?(?i)\B(.)/.exec('a\u017F ')); +assertEquals(["a\u212A ", " "], /a.*?(?i)\B(.)/.exec('a\u212A ')); + +// Nested flags. +var res = [ + /^a(?i:b(?-i:c(?i:d)e)f)g$/, + /^a(?i:b(?-i)c(?i)d(?-i)e(?i)f)g$/, + /^(?-i:a(?i:b(?-i:c(?i:d)e)f)g)$/i, + /^(?-i:a(?i:b(?-i)c(?i)d(?-i)e(?i)f)g)$/i, +]; + +for (var idx = 0; idx < res.length; idx++) { + var re = res[idx]; + for (var i = 0; i < 128; i++) { + var s = (i & 1) ? "A" : "a"; + s += (i & 2) ? "B" : "b"; + s += (i & 4) ? "C" : "c"; + s += (i & 8) ? "D" : "d"; + s += (i & 16) ? "E" : "e"; + s += (i & 32) ? "F" : "f"; + s += (i & 64) ? "G" : "g"; + if ((i & (1 | 4 | 16 | 64)) != 0) { + assertFalse(re.test(s), s); + } else { + assertTrue(re.test(s), s); + } + } +}