diff --git a/BUILD.gn b/BUILD.gn index 3fd693b2a5..f012442687 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -2952,10 +2952,6 @@ v8_source_set("v8_base_without_compiler") { "src/objects/elements.cc", "src/objects/objects.cc", "src/parsing/parser.cc", - - # Explicit template instantiation clash (these files are also very large). - "src/regexp/regexp-compiler-tonode.cc", - "src/regexp/regexp-compiler.cc", ] if (v8_current_cpu == "x86") { diff --git a/src/base/small-vector.h b/src/base/small-vector.h index 124554b646..b594e113c5 100644 --- a/src/base/small-vector.h +++ b/src/base/small-vector.h @@ -98,11 +98,13 @@ class SmallVector { return begin_[index]; } - const T& operator[](size_t index) const { + const T& at(size_t index) const { DCHECK_GT(size(), index); return begin_[index]; } + const T& operator[](size_t index) const { return at(index); } + template void emplace_back(Args&&... args) { if (V8_UNLIKELY(end_ == end_of_storage_)) Grow(); diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc index fab57e3de4..e2dc13ce49 100644 --- a/src/regexp/regexp-compiler-tonode.cc +++ b/src/regexp/regexp-compiler-tonode.cc @@ -7,7 +7,6 @@ #include "src/execution/isolate.h" #include "src/regexp/regexp.h" #include "src/strings/unicode-inl.h" -#include "src/utils/splay-tree-inl.h" #include "src/zone/zone-list-inl.h" #ifdef V8_INTL_SUPPORT @@ -21,11 +20,6 @@ namespace internal { using namespace regexp_compiler_constants; // NOLINT(build/namespaces) -// Explicit template instantiations. -template class ZoneSplayTree; -template void DispatchTable::ForEach( - UnicodeRangeSplitter*); - // ------------------------------------------------------------------- // Tree to graph conversion @@ -128,14 +122,7 @@ bool RegExpCharacterClass::is_standard(Zone* zone) { return false; } -UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, - ZoneList* base) - : zone_(zone), - table_(zone), - bmp_(nullptr), - lead_surrogates_(nullptr), - trail_surrogates_(nullptr), - non_bmp_(nullptr) { +UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList* base) { // The unicode range splitter categorizes given character ranges into: // - Code points from the BMP representable by one code unit. // - Code points outside the BMP that need to be split into surrogate pairs. @@ -143,50 +130,75 @@ UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, // - Lone trail surrogates. // Lone surrogates are valid code points, even though no actual characters. // They require special matching to make sure we do not split surrogate pairs. - // We use the dispatch table to accomplish this. The base range is split up - // by the table by the overlay ranges, and the Call callback is used to - // filter and collect ranges for each category. - for (int i = 0; i < base->length(); i++) { - table_.AddRange(base->at(i), kBase, zone_); - } - // Add overlay ranges. - table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1), - kBmpCodePoints, zone_); - table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), - kLeadSurrogates, zone_); - table_.AddRange( - CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), - kTrailSurrogates, zone_); - table_.AddRange( - CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1), - kBmpCodePoints, zone_); - table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd), - kNonBmpCodePoints, zone_); - table_.ForEach(this); + + for (int i = 0; i < base->length(); i++) AddRange(base->at(i)); } -void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { - OutSet* outset = entry.out_set(); - if (!outset->Get(kBase)) return; - ZoneList** target = nullptr; - if (outset->Get(kBmpCodePoints)) { - target = &bmp_; - } else if (outset->Get(kLeadSurrogates)) { - target = &lead_surrogates_; - } else if (outset->Get(kTrailSurrogates)) { - target = &trail_surrogates_; - } else { - DCHECK(outset->Get(kNonBmpCodePoints)); - target = &non_bmp_; +void UnicodeRangeSplitter::AddRange(CharacterRange range) { + static constexpr uc32 kBmp1Start = 0; + static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1; + static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1; + static constexpr uc32 kBmp2End = kNonBmpStart - 1; + + // Ends are all inclusive. + STATIC_ASSERT(kBmp1Start == 0); + STATIC_ASSERT(kBmp1Start < kBmp1End); + STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart); + STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd); + STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); + STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd); + STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start); + STATIC_ASSERT(kBmp2Start < kBmp2End); + STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart); + STATIC_ASSERT(kNonBmpStart < kNonBmpEnd); + + static constexpr uc32 kStarts[] = { + kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, + kBmp2Start, kNonBmpStart, + }; + + static constexpr uc32 kEnds[] = { + kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, + }; + + CharacterRangeVector* const kTargets[] = { + &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_, + }; + + static constexpr int kCount = arraysize(kStarts); + STATIC_ASSERT(kCount == arraysize(kEnds)); + STATIC_ASSERT(kCount == arraysize(kTargets)); + + for (int i = 0; i < kCount; i++) { + if (kStarts[i] > range.to()) break; + const uc32 from = std::max(kStarts[i], range.from()); + const uc32 to = std::min(kEnds[i], range.to()); + if (from > to) continue; + kTargets[i]->emplace_back(CharacterRange::Range(from, to)); } - if (*target == nullptr) - *target = new (zone_) ZoneList(2, zone_); - (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); +} + +namespace { + +// Translates between new and old V8-isms (SmallVector, ZoneList). +ZoneList* ToCanonicalZoneList( + const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { + if (v->empty()) return nullptr; + + ZoneList* result = + new (zone) ZoneList(static_cast(v->size()), zone); + for (size_t i = 0; i < v->size(); i++) { + result->Add(v->at(i), zone); + } + + CharacterRange::Canonicalize(result); + return result; } void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { - ZoneList* bmp = splitter->bmp(); + ZoneList* bmp = + ToCanonicalZoneList(splitter->bmp(), compiler->zone()); if (bmp == nullptr) return; JSRegExp::Flags default_flags = JSRegExp::Flags(); result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( @@ -197,7 +209,8 @@ void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { - ZoneList* non_bmp = splitter->non_bmp(); + ZoneList* non_bmp = + ToCanonicalZoneList(splitter->non_bmp(), compiler->zone()); if (non_bmp == nullptr) return; DCHECK(!compiler->one_byte()); Zone* zone = compiler->zone(); @@ -288,7 +301,8 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { JSRegExp::Flags default_flags = JSRegExp::Flags(); - ZoneList* lead_surrogates = splitter->lead_surrogates(); + ZoneList* lead_surrogates = + ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); if (lead_surrogates == nullptr) return; Zone* zone = compiler->zone(); // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). @@ -316,7 +330,8 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, UnicodeRangeSplitter* splitter) { JSRegExp::Flags default_flags = JSRegExp::Flags(); - ZoneList* trail_surrogates = splitter->trail_surrogates(); + ZoneList* trail_surrogates = + ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); if (trail_surrogates == nullptr) return; Zone* zone = compiler->zone(); // E.g. \udc01 becomes (?* ranges, Zone* zone) { #endif // V8_INTL_SUPPORT } +} // namespace + RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { set_.Canonicalize(); @@ -414,7 +431,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, return UnanchoredAdvance(compiler, on_success); } else { ChoiceNode* result = new (zone) ChoiceNode(2, zone); - UnicodeRangeSplitter splitter(zone, ranges); + UnicodeRangeSplitter splitter(ranges); AddBmpCharacters(compiler, result, on_success, &splitter); AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); AddLoneLeadSurrogates(compiler, result, on_success, &splitter); diff --git a/src/regexp/regexp-compiler.cc b/src/regexp/regexp-compiler.cc index cfb3defd74..62ffafed34 100644 --- a/src/regexp/regexp-compiler.cc +++ b/src/regexp/regexp-compiler.cc @@ -25,11 +25,6 @@ namespace internal { using namespace regexp_compiler_constants; // NOLINT(build/namespaces) -// Explicit template instantiations. -template class ZoneSplayTree; -template void DispatchTable::ForEach( - UnicodeRangeSplitter*); - // ------------------------------------------------------------------- // Implementation of the Irregexp regular expression engine. // diff --git a/src/regexp/regexp-compiler.h b/src/regexp/regexp-compiler.h index 4ffed6de6b..fd644ac523 100644 --- a/src/regexp/regexp-compiler.h +++ b/src/regexp/regexp-compiler.h @@ -5,6 +5,7 @@ #ifndef V8_REGEXP_REGEXP_COMPILER_H_ #define V8_REGEXP_REGEXP_COMPILER_H_ +#include "src/base/small-vector.h" #include "src/regexp/regexp-nodes.h" #include "src/zone/zone-splay-tree.h" @@ -721,29 +722,27 @@ class RegExpCompiler { // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. class UnicodeRangeSplitter { public: - V8_EXPORT_PRIVATE UnicodeRangeSplitter(Zone* zone, - ZoneList* base); - void Call(uc32 from, DispatchTable::Entry entry); + V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList* base); - ZoneList* bmp() { return bmp_; } - ZoneList* lead_surrogates() { return lead_surrogates_; } - ZoneList* trail_surrogates() { return trail_surrogates_; } - ZoneList* non_bmp() const { return non_bmp_; } + static constexpr int kInitialSize = 8; + using CharacterRangeVector = base::SmallVector; + + const CharacterRangeVector* bmp() const { return &bmp_; } + const CharacterRangeVector* lead_surrogates() const { + return &lead_surrogates_; + } + const CharacterRangeVector* trail_surrogates() const { + return &trail_surrogates_; + } + const CharacterRangeVector* non_bmp() const { return &non_bmp_; } private: - static const int kBase = 0; - // Separate ranges into - static const int kBmpCodePoints = 1; - static const int kLeadSurrogates = 2; - static const int kTrailSurrogates = 3; - static const int kNonBmpCodePoints = 4; + void AddRange(CharacterRange range); - Zone* zone_; - DispatchTable table_; - ZoneList* bmp_; - ZoneList* lead_surrogates_; - ZoneList* trail_surrogates_; - ZoneList* non_bmp_; + CharacterRangeVector bmp_; + CharacterRangeVector lead_surrogates_; + CharacterRangeVector trail_surrogates_; + CharacterRangeVector non_bmp_; }; // We need to check for the following characters: 0x39C 0x3BC 0x178. diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 489c8f06df..5fb7264b65 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -1591,10 +1591,10 @@ TEST(CharacterRangeCaseIndependence) { #endif // !V8_INTL_SUPPORT } - -static bool InClass(uc32 c, ZoneList* ranges) { +static bool InClass(uc32 c, + const UnicodeRangeSplitter::CharacterRangeVector* ranges) { if (ranges == nullptr) return false; - for (int i = 0; i < ranges->length(); i++) { + for (size_t i = 0; i < ranges->size(); i++) { CharacterRange range = ranges->at(i); if (range.from() <= c && c <= range.to()) return true; @@ -1602,13 +1602,12 @@ static bool InClass(uc32 c, ZoneList* ranges) { return false; } - TEST(UnicodeRangeSplitter) { Zone zone(CcTest::i_isolate()->allocator(), ZONE_NAME); ZoneList* base = new(&zone) ZoneList(1, &zone); base->Add(CharacterRange::Everything(), &zone); - UnicodeRangeSplitter splitter(&zone, base); + UnicodeRangeSplitter splitter(base); // BMP for (uc32 c = 0; c < 0xD800; c++) { CHECK(InClass(c, splitter.bmp()));