From d2135603bcf462e15a1284d8ed969f6692610dda Mon Sep 17 00:00:00 2001 From: erikcorry Date: Wed, 24 Jun 2015 11:17:33 -0700 Subject: [PATCH] Extend big-disjunction optimization to case-independent regexps R=yangguo@chromium.org BUG=chromium:482998 LOG=n Review URL: https://codereview.chromium.org/1182783009 Cr-Commit-Position: refs/heads/master@{#29264} --- src/heap-snapshot-generator.cc | 3 +- src/jsregexp.cc | 65 +++++++++++++++++--- src/list-inl.h | 15 +++-- src/list.h | 13 ++-- src/vector.h | 27 +++++--- test/mjsunit/regexp-sort.js | 48 +++++++++++++++ test/mjsunit/regress/regress-crbug-482998.js | 9 +-- 7 files changed, 145 insertions(+), 35 deletions(-) create mode 100644 test/mjsunit/regexp-sort.js diff --git a/src/heap-snapshot-generator.cc b/src/heap-snapshot-generator.cc index a27f419458..f1bdc71cca 100644 --- a/src/heap-snapshot-generator.cc +++ b/src/heap-snapshot-generator.cc @@ -323,7 +323,8 @@ List* HeapSnapshot::GetSortedEntriesList() { for (int i = 0; i < entries_.length(); ++i) { sorted_entries_[i] = &entries_[i]; } - sorted_entries_.Sort(SortByIds); + sorted_entries_.Sort( + SortByIds); } return &sorted_entries_; } diff --git a/src/jsregexp.cc b/src/jsregexp.cc index 92fdc77aaa..a02141d77a 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -4837,6 +4837,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { } +static unibrow::uchar Canonical( + unibrow::Mapping* canonicalize, + unibrow::uchar c) { + unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; + int length = canonicalize->get(c, '\0', chars); + DCHECK_LE(length, 1); + unibrow::uchar canonical = c; + if (length == 1) canonical = chars[0]; + return canonical; +} + + +int CompareFirstCharCaseIndependent( + unibrow::Mapping* canonicalize, + RegExpTree* const* a, RegExpTree* const* b) { + RegExpAtom* atom1 = (*a)->AsAtom(); + RegExpAtom* atom2 = (*b)->AsAtom(); + unibrow::uchar character1 = atom1->data().at(0); + unibrow::uchar character2 = atom2->data().at(0); + if (character1 == character2) return 0; + if (character1 >= 'a' || character2 >= 'a') { + character1 = Canonical(canonicalize, character1); + character2 = Canonical(canonicalize, character2); + } + return static_cast(character1) - static_cast(character2); +} + + // We can stable sort runs of atoms, since the order does not matter if they // start with different characters. // Returns true if any consecutive atoms were found. @@ -4860,15 +4888,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { i++; } // Sort atoms to get ones with common prefixes together. - // This step is not valid if we are in a case-independent regexp, + // This step is more tricky if we are in a case-independent regexp, // because it would change /is|I/ to /I|is/, and order matters when // the regexp parts don't match only disjoint starting points. To fix - // this would need a version of CompareFirstChar that uses case- + // this we have a version of CompareFirstChar that uses case- // independent character classes for comparison. - if (!compiler->ignore_case()) { - DCHECK_LT(first_atom, alternatives->length()); - DCHECK_LE(i, alternatives->length()); - DCHECK_LE(first_atom, i); + DCHECK_LT(first_atom, alternatives->length()); + DCHECK_LE(i, alternatives->length()); + DCHECK_LE(first_atom, i); + if (compiler->ignore_case()) { + unibrow::Mapping* canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); + auto compare_closure = + [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { + return CompareFirstCharCaseIndependent(canonicalize, a, b); + }; + alternatives->StableSort(compare_closure, first_atom, i - first_atom); + } else { alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); } if (i - first_atom > 1) found_consecutive_atoms = true; @@ -4893,7 +4929,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { continue; } RegExpAtom* atom = alternative->AsAtom(); - uc16 common_prefix = atom->data().at(0); + unibrow::uchar common_prefix = atom->data().at(0); int first_with_prefix = i; int prefix_length = atom->length(); i++; @@ -4901,7 +4937,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; atom = alternative->AsAtom(); - if (atom->data().at(0) != common_prefix) break; + unibrow::uchar new_prefix = atom->data().at(0); + if (new_prefix != common_prefix) { + if (!compiler->ignore_case()) break; + unibrow::Mapping* canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); + new_prefix = Canonical(canonicalize, new_prefix); + common_prefix = Canonical(canonicalize, common_prefix); + if (new_prefix != common_prefix) break; + } prefix_length = Min(prefix_length, atom->length()); i++; } @@ -4917,7 +4961,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { RegExpAtom* old_atom = alternatives->at(j + first_with_prefix)->AsAtom(); for (int k = 1; k < prefix_length; k++) { - if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; + if (atom->data().at(k) != old_atom->data().at(k)) { + prefix_length = k; + break; + } } } RegExpAtom* prefix = diff --git a/src/list-inl.h b/src/list-inl.h index c09788e9ae..98f0343fa5 100644 --- a/src/list-inl.h +++ b/src/list-inl.h @@ -193,14 +193,16 @@ int List::CountOccurrences(const T& elm, int start, int end) const { } -template -void List::Sort(int (*cmp)(const T* x, const T* y)) { +template +template +void List::Sort(CompareFunction cmp) { Sort(cmp, 0, length_); } template -void List::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) { +template +void List::Sort(CompareFunction cmp, size_t s, size_t l) { ToVector().Sort(cmp, s, l); #ifdef DEBUG for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); @@ -215,14 +217,15 @@ void List::Sort() { template -void List::StableSort(int (*cmp)(const T* x, const T* y)) { +template +void List::StableSort(CompareFunction cmp) { StableSort(cmp, 0, length_); } template -void List::StableSort(int (*cmp)(const T* x, const T* y), size_t s, - size_t l) { +template +void List::StableSort(CompareFunction cmp, size_t s, size_t l) { ToVector().StableSort(cmp, s, l); #ifdef DEBUG for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); diff --git a/src/list.h b/src/list.h index 00cbd40312..b636449c42 100644 --- a/src/list.h +++ b/src/list.h @@ -149,12 +149,15 @@ class List { void Iterate(Visitor* visitor); // Sort all list entries (using QuickSort) - void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length); - void Sort(int (*cmp)(const T* x, const T* y)); + template + void Sort(CompareFunction cmp, size_t start, size_t length); + template + void Sort(CompareFunction cmp); void Sort(); - void StableSort(int (*cmp)(const T* x, const T* y), size_t start, - size_t length); - void StableSort(int (*cmp)(const T* x, const T* y)); + template + void StableSort(CompareFunction cmp, size_t start, size_t length); + template + void StableSort(CompareFunction cmp); void StableSort(); INLINE(void Initialize(int capacity, diff --git a/src/vector.h b/src/vector.h index d022fde3a5..4f3128b918 100644 --- a/src/vector.h +++ b/src/vector.h @@ -69,24 +69,30 @@ class Vector { return Vector(result, length_); } - void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) { - std::sort(start() + s, start() + s + l, RawComparer(cmp)); + template + void Sort(CompareFunction cmp, size_t s, size_t l) { + std::sort(start() + s, start() + s + l, RawComparer(cmp)); } - void Sort(int (*cmp)(const T*, const T*)) { - std::sort(start(), start() + length(), RawComparer(cmp)); + template + void Sort(CompareFunction cmp) { + std::sort(start(), start() + length(), RawComparer(cmp)); } void Sort() { std::sort(start(), start() + length()); } - void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) { - std::stable_sort(start() + s, start() + s + l, RawComparer(cmp)); + template + void StableSort(CompareFunction cmp, size_t s, size_t l) { + std::stable_sort(start() + s, start() + s + l, + RawComparer(cmp)); } - void StableSort(int (*cmp)(const T*, const T*)) { - std::stable_sort(start(), start() + length(), RawComparer(cmp)); + template + void StableSort(CompareFunction cmp) { + std::stable_sort(start(), start() + length(), + RawComparer(cmp)); } void StableSort() { std::stable_sort(start(), start() + length()); } @@ -136,15 +142,16 @@ class Vector { T* start_; int length_; + template class RawComparer { public: - explicit RawComparer(int (*cmp)(const T*, const T*)) : cmp_(cmp) {} + explicit RawComparer(CookedComparer cmp) : cmp_(cmp) {} bool operator()(const T& a, const T& b) { return cmp_(&a, &b) < 0; } private: - int (*cmp_)(const T*, const T*); + CookedComparer cmp_; }; }; diff --git a/test/mjsunit/regexp-sort.js b/test/mjsunit/regexp-sort.js new file mode 100644 index 0000000000..57d50701cd --- /dev/null +++ b/test/mjsunit/regexp-sort.js @@ -0,0 +1,48 @@ +// Copyright 2015 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +function Test(lower, upper) { + var lx = lower + "x"; + var ux = upper + "x"; + var lp = lower + "|"; + var uxp = upper + "x|"; + assertEquals(lx, new RegExp(uxp + lp + lower + "cat", "i").exec(lx) + ""); + assertEquals(ux, new RegExp(uxp + lp + lower + "cat", "i").exec(ux) + ""); + assertEquals(lower, new RegExp(lp + uxp + lower + "cat", "i").exec(lx) + ""); + assertEquals(upper, new RegExp(lp + uxp + lower + "cat", "i").exec(ux) + ""); +} + +function TestFail(lower, upper) { + var lx = lower + "x"; + var ux = upper + "x"; + var lp = lower + "|"; + var uxp = upper + "x|"; + assertEquals(lower, new RegExp(uxp + lp + lower + "cat", "i").exec(lx) + ""); + assertEquals(ux, new RegExp(uxp + lp + lower + "cat", "i").exec(ux) + ""); + assertEquals(lower, new RegExp(lp + uxp + lower + "cat", "i").exec(lx) + ""); + assertEquals(ux, new RegExp(lp + uxp + lower + "cat", "i").exec(ux) + ""); +} + +Test("a", "A"); +Test("0", "0"); +TestFail("a", "b"); +// Small and capital o-umlaut +Test(String.fromCharCode(0xf6), String.fromCharCode(0xd6)); +// Small and capital kha. +Test(String.fromCharCode(0x445), String.fromCharCode(0x425)); +// Small and capital y-umlaut. +Test(String.fromCharCode(0xff), String.fromCharCode(0x178)); +// Small and large Greek mu. +Test(String.fromCharCode(0x3bc), String.fromCharCode(0x39c)); +// Micron and large Greek mu. +Test(String.fromCharCode(0xb5), String.fromCharCode(0x39c)); +// Micron and small Greek mu. +Test(String.fromCharCode(0xb5), String.fromCharCode(0x3bc)); +// German double s and capital S. These are not equivalent since one is double. +TestFail(String.fromCharCode(0xdf), "S"); +// Small i and Turkish capital dotted I. These are not equivalent due to +// 21.2.2.8.2 section 3g. One is below 128 and the other is above 127. +TestFail("i", String.fromCharCode(0x130)); +// Small dotless i and I. These are not equivalent either. +TestFail(String.fromCharCode(0x131), "I"); diff --git a/test/mjsunit/regress/regress-crbug-482998.js b/test/mjsunit/regress/regress-crbug-482998.js index 94ff5008e8..80933a7a6d 100644 --- a/test/mjsunit/regress/regress-crbug-482998.js +++ b/test/mjsunit/regress/regress-crbug-482998.js @@ -3,13 +3,13 @@ // found in the LICENSE file. // Should not time out. Running time 0.5s vs. 120s before the change. -function collapse() { +function collapse(flags) { var src = "(?:"; for (var i = 128; i < 0x1000; i++) { - src += "a" + String.fromCharCode(i) + "|"; + src += String.fromCharCode(96 + i % 26) + String.fromCharCode(i) + "|"; } src += "aa)"; - var collapsible = new RegExp(src); + var collapsible = new RegExp(src, flags); var subject = "zzzzzzz" + String.fromCharCode(3000); for (var i = 0; i < 1000; i++) { subject += "xxxxxxx"; @@ -19,4 +19,5 @@ function collapse() { } } -collapse(); +collapse("i"); +collapse("");