Boyer-Moore delayes computation of Good-Suffix table until bad-char has shown itself

to be insufficient.x
Changed order of tests in loop in simple text search.
Changed limit on pattern length for when we pick simple search.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@496 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
lrn@chromium.org 2008-10-14 12:45:06 +00:00
parent c46b0e84d4
commit 51c7fa95ce

View File

@ -1025,69 +1025,97 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
}
// End of Bad Char computation.
// Compute Good Suffix shift table.
bmgs_buffers.init(m);
bmgs_buffers.shift(m-1) = 1;
bmgs_buffers.suffix(m) = m + 1;
pchar last_char = pattern[m - 1];
int suffix = m + 1;
for (int i = m; i > start;) {
for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
if (bmgs_buffers.shift(suffix) == len) {
bmgs_buffers.shift(suffix) = suffix - i;
}
suffix = bmgs_buffers.suffix(suffix);
}
i--;
suffix--;
bmgs_buffers.suffix(i) = suffix;
if (suffix == m) {
// no suffix to extend, so we check against last_char only.
while (i > start && pattern[i - 1] != last_char) {
if (bmgs_buffers.shift(m) == len) {
bmgs_buffers.shift(m) = m - i;
}
i--;
bmgs_buffers.suffix(i) = m;
}
if (i > start) {
i--;
suffix--;
bmgs_buffers.suffix(i) = suffix;
}
}
}
if (suffix < m) {
for (int i = start; i <= m; i++) {
if (bmgs_buffers.shift(i) == len) {
bmgs_buffers.shift(i) = suffix - start;
}
if (i == suffix) {
suffix = bmgs_buffers.suffix(suffix);
}
}
}
// End of Good Suffix computation.
int badness = 0; // How bad we are doing without a good-suffix table.
int idx; // No matches found prior to this index.
// Perform search
for (int i = start_index; i <= n - m;) {
for (idx = start_index; idx <= n - m;) {
int j = m - 1;
schar c;
while (j >= 0 && pattern[j] == (c = subject[i + j])) j--;
while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
if (j < 0) {
return i;
} else if (j < start) {
// we have matched more than our tables allow us to be smart about.
i += 1;
return idx;
} else {
int gs_shift = bmgs_buffers.shift(j + 1);
int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
int bc_shift = j - bc_occ;
i += (gs_shift > bc_shift) ? gs_shift : bc_shift;
int shift = bc_occ < j ? j - bc_occ : 1;
idx += shift;
// Badness increases by the number of characters we have
// checked, and decreases by the number of characters we
// can skip by shifting. It's a measure of how we are doing
// compared to reading each character exactly once.
badness += (m - j) - shift;
if (badness > m) break;
}
}
// If we are not done, we got here because we should build the Good Suffix
// table and continue searching.
if (idx <= n - m) {
// Compute Good Suffix tables.
bmgs_buffers.init(m);
bmgs_buffers.shift(m-1) = 1;
bmgs_buffers.suffix(m) = m + 1;
pchar last_char = pattern[m - 1];
int suffix = m + 1;
for (int i = m; i > start;) {
for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
if (bmgs_buffers.shift(suffix) == len) {
bmgs_buffers.shift(suffix) = suffix - i;
}
suffix = bmgs_buffers.suffix(suffix);
}
i--;
suffix--;
bmgs_buffers.suffix(i) = suffix;
if (suffix == m) {
// no suffix to extend, so we check against last_char only.
while (i > start && pattern[i - 1] != last_char) {
if (bmgs_buffers.shift(m) == len) {
bmgs_buffers.shift(m) = m - i;
}
i--;
bmgs_buffers.suffix(i) = m;
}
if (i > start) {
i--;
suffix--;
bmgs_buffers.suffix(i) = suffix;
}
}
}
if (suffix < m) {
for (int i = start; i <= m; i++) {
if (bmgs_buffers.shift(i) == len) {
bmgs_buffers.shift(i) = suffix - start;
}
if (i == suffix) {
suffix = bmgs_buffers.suffix(suffix);
}
}
}
// End of Good Suffix computation.
// Continue search from i.
do {
int j = m - 1;
schar c;
while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
if (j < 0) {
return idx;
} else if (j < start) {
// we have matched more than our tables allow us to be smart about.
idx += 1;
} else {
int gs_shift = bmgs_buffers.shift(j + 1);
int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
int bc_shift = j - bc_occ;
idx += (gs_shift > bc_shift) ? gs_shift : bc_shift;
}
} while (idx <= n - m);
}
return -1;
}
@ -1116,11 +1144,14 @@ static int SimpleIndexOf(Vector<const schar> subject,
for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
if (subject[i] != pattern_first_char) continue;
int j = 1;
while (pattern[j] == subject[j+i]) {
j++;
if (j == pattern_length) {
return i;
do {
if (pattern[j] != subject[j+i]) {
break;
}
j++;
} while (j < pattern_length);
if (j == pattern_length) {
return i;
}
}
return -1;
@ -1136,7 +1167,7 @@ static int StringMatchStrategy(Vector<const schar> sub,
// For small searches, a complex sort is not worth the setup overhead.
int subject_length = sub.length() - start_index;
if (subject_length < 100 || pattern_length < 4) {
if (subject_length < 100 || pattern_length < 8) {
return SimpleIndexOf(sub, pat, start_index);
}