From dc87d51f70a8da13d388b4f92bc81e33aac93cbd Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Fri, 17 Oct 2008 11:15:54 +0000 Subject: [PATCH] Changed Boyer-Moore's bad-char table code: - Reduce it to half size if the pattern is ASCII, saving on initialization - If pattern is ASCII and subject is not, any non-ASCII char can cause a full pattern-length shift, even if we haven't indexed the entire pattern. - Use memset to initialize buffer in the common case where the pattern is shorter than the max significant suffix limit. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@519 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/runtime.cc | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/runtime.cc b/src/runtime.cc index bcdd76deca..bc1152142a 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -965,7 +965,7 @@ static const int kBMMinPatternLength = 5; // Holds the two buffers used by Boyer-Moore string search's Good Suffix // shift. Only allows the last kBMMaxShift characters of the needle // to be indexed. -class BMGoodSuffixBuffers: public AllStatic { +class BMGoodSuffixBuffers { public: BMGoodSuffixBuffers() {} inline void init(int needle_length) { @@ -1005,12 +1005,18 @@ static void BoyerMoorePopulateBadCharTable(Vector pattern, // Run forwards to populate bad_char_table, so that *last* instance // of character equivalence class is the one registered. // Notice: Doesn't include the last character. - for (int i = 0; i < kBMAlphabetSize; i++) { - bad_char_occurence[i] = start - 1; + int table_size = (sizeof(pchar) == 1) ? String::kMaxAsciiCharCode + 1 + : kBMAlphabetSize; + if (start == 0) { // All patterns less than kBMMaxShift in length. + memset(bad_char_occurence, -1, table_size * sizeof(*bad_char_occurence)); + } else { + for (int i = 0; i < table_size; i++) { + bad_char_occurence[i] = start - 1; + } } for (int i = start; i < pattern.length() - 1; i++) { pchar c = pattern[i]; - int bucket = c % kBMAlphabetSize; + int bucket = (sizeof(pchar) ==1) ? c : c % kBMAlphabetSize; bad_char_occurence[bucket] = i; } } @@ -1065,6 +1071,19 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector pattern, } } +template +static inline int CharOccurence(int char_code) { + if (sizeof(schar) == 1) { + return bad_char_occurence[char_code]; + } + if (sizeof(pchar) == 1) { + if (char_code > String::kMaxAsciiCharCode) { + return -1; + } + return bad_char_occurence[char_code]; + } + return bad_char_occurence[char_code % kBMAlphabetSize]; +} // Restricted simplified Boyer-Moore string matching. Restricts tables to a // suffix of long pattern strings and handles only equivalence classes @@ -1090,7 +1109,7 @@ static int BoyerMooreSimplified(Vector subject, int j = m - 1; int c; while (last_char != (c = subject[idx + j])) { - int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; + int bc_occ = CharOccurence(c); int shift = j - bc_occ; idx += shift; badness += 1 - shift; // at most zero, so badness cannot increase. @@ -1105,7 +1124,7 @@ static int BoyerMooreSimplified(Vector subject, complete = true; return idx; } else { - int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; + int bc_occ = CharOccurence(c); int shift = bc_occ < j ? j - bc_occ : 1; idx += shift; // Badness increases by the number of characters we have @@ -1141,7 +1160,7 @@ static int BoyerMooreIndexOf(Vector subject, int j = m - 1; schar c; while (last_char != (c = subject[idx + j])) { - int shift = j - bad_char_occurence[c % kBMAlphabetSize]; + int shift = j - CharOccurence(c); idx += shift; if (idx > n - m) { return -1; @@ -1155,7 +1174,7 @@ static int BoyerMooreIndexOf(Vector subject, idx += 1; } else { int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift. - int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; + int bc_occ = CharOccurence(c); int shift = j - bc_occ; // Bad-char shift. shift = (gs_shift > shift) ? gs_shift : shift; idx += shift;