diff --git a/src/jsregexp.cc b/src/jsregexp.cc index 0ec01ed802..bce9805f11 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -218,7 +218,7 @@ Handle RegExpImpl::AtomExec(Handle re, } LOG(RegExpExecEvent(re, start_index, subject)); - int value = Runtime::StringMatchKmp(subject, needle, start_index); + int value = Runtime::StringMatch(subject, needle, start_index); if (value == -1) return Factory::null_value(); Handle result = Factory::NewJSArray(2); SetElement(result, 0, Handle(Smi::FromInt(value))); @@ -239,7 +239,7 @@ Handle RegExpImpl::AtomExecGlobal(Handle re, LOG(RegExpExecEvent(re, index, subject)); int value = -1; if (index + needle_length <= subject_length) { - value = Runtime::StringMatchKmp(subject, needle, index); + value = Runtime::StringMatch(subject, needle, index); } if (value == -1) break; HandleScope scope; diff --git a/src/runtime.cc b/src/runtime.cc index 92e23301b7..97493e075c 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -1053,6 +1053,64 @@ static int SimpleIndexOf(Vector subject, return -1; } +// Maximal length (+1) of suffix that is indexed. Also the size of the +// maximal bad-character skip. +static const int kBMHSignificantSuffixLength = 0xff; + +// Significant bits taken from characters to use in bad-character +// skips, to reduce size of the table for Unicode letters. +static const int kBMHSignificantBitsMask = 0xff; + +// Number of elements in bad-char table. +static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1; + +// Simplified Boyer-Moore string matching. Only uses bad-char skipping, +// and restricts table to a suffix of long strings (also restricting +// the maximum possible skip-length) in order to reduce space. +template +static int BoyerMooreHorspoolIndexOf(Vector subject, + Vector pattern, + int start_index) { + ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips. + static byte bad_char_map[kBMHBadCharCount]; + + int m = pattern.length(); + int n = subject.length(); + // Cap bad char table to last p chars of pattern. Also max skip value. + int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength; + + memset(bad_char_map, p, kBMHBadCharCount); + + // Run forwards to populate bad_char_table, so that *last* instance + // of character equivalence class is the one registered. + // Notice: Doesn't include last character. + for (int i = p < m ? m - p : 0; i < m - 1; i++) { + pchar c = pattern[i]; + if (sizeof(schar) == 1 && c > 255) return -1; + bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i; + } + + for (int i = start_index + m - 1, j = m - 1; i < n;) { + schar c = subject[i]; + if (c == pattern[j]) { + if (j == 0) { + return i; + } + j--; + i--; + } else { + int skip = bad_char_map[c & kBMHSignificantBitsMask]; + if (skip < (m - j)) { + skip = m - j; + } + i += skip; + j = m - 1; + } + } + return -1; +} + + // Full KMP pattern match. template // Pattern & subject char types static int KMPIndexOf(Vector subject, @@ -1101,35 +1159,40 @@ static int KMPIndexOf(Vector subject, // Dispatch to different algorithms for different length of pattern/subject template -static int StringMatchKMP(Vector sub, - Vector pat, - int start_index) { +static int StringMatchStrategy(Vector sub, + Vector pat, + int start_index) { + int pattern_length = pat.length(); // Searching for one specific character is common. For one // character patterns the KMP algorithm is guaranteed to slow down // the search, so we just run through the subject string. - if (pat.length() == 1) { + if (pattern_length == 1) { return SingleCharIndexOf(sub, pat[0], start_index); } - // For small searches, KMP is not worth the setup overhead. - if (sub.length() - start_index < 100) { + // For small searches, a complex sort is not worth the setup overhead. + if (sub.length() - start_index < 25) { return SimpleIndexOf(sub, pat, start_index); } - // For patterns with a larger length we use the KMP algorithm. - return KMPIndexOf(sub, pat, start_index); + return BoyerMooreHorspoolIndexOf(sub, pat, start_index); } // Perform string match of pattern on subject, starting at start index. // Caller must ensure that 0 <= start_index <= sub->length(), // and should check that pat->length() + start_index <= sub->length() -int Runtime::StringMatchKmp(Handle sub, - Handle pat, - int start_index) { +int Runtime::StringMatch(Handle sub, + Handle pat, + int start_index) { ASSERT(0 <= start_index); ASSERT(start_index <= sub->length()); - if (pat->length() == 0) return start_index; + int pattern_length = pat->length(); + if (pattern_length == 0) return start_index; + + int subject_length = sub->length(); + if (start_index + pattern_length > subject_length) return -1; + FlattenString(sub); FlattenString(pat); @@ -1138,15 +1201,15 @@ int Runtime::StringMatchKmp(Handle sub, if (pat->is_ascii()) { Vector pat_vector = ToAsciiVector(*pat); if (sub->is_ascii()) { - return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index); + return StringMatchStrategy(ToAsciiVector(*sub), pat_vector, start_index); } - return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index); + return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index); } Vector pat_vector = ToUC16Vector(*pat); if (sub->is_ascii()) { - return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index); + return StringMatchStrategy(ToAsciiVector(*sub), pat_vector, start_index); } - return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index); + return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index); } @@ -1161,7 +1224,7 @@ static Object* Runtime_StringIndexOf(Arguments args) { uint32_t start_index; if (!Array::IndexFromObject(index, &start_index)) return Smi::FromInt(-1); - int position = Runtime::StringMatchKmp(sub, pat, start_index); + int position = Runtime::StringMatch(sub, pat, start_index); return Smi::FromInt(position); } diff --git a/src/runtime.h b/src/runtime.h index 47536aecd2..3704a1dc33 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -335,7 +335,7 @@ class Runtime : public AllStatic { // Get the runtime function with the given name. static Function* FunctionForName(const char* name); - static int StringMatchKmp(Handle sub, Handle pat, int index); + static int StringMatch(Handle sub, Handle pat, int index); // TODO(1240886): The following three methods are *not* handle safe, // but accept handle arguments. This seems fragile.