4e670fd05e
R=dcarney@chromium.org, marja@chromium.org Review URL: https://codereview.chromium.org/559913002 git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@23840 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
558 lines
18 KiB
C++
558 lines
18 KiB
C++
// Copyright 2011 the V8 project authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#ifndef V8_STRING_SEARCH_H_
|
|
#define V8_STRING_SEARCH_H_
|
|
|
|
namespace v8 {
|
|
namespace internal {
|
|
|
|
|
|
//---------------------------------------------------------------------
|
|
// String Search object.
|
|
//---------------------------------------------------------------------
|
|
|
|
// Class holding constants and methods that apply to all string search variants,
|
|
// independently of subject and pattern char size.
|
|
class StringSearchBase {
|
|
protected:
|
|
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
|
|
// limit, we can fix the size of tables. For a needle longer than this limit,
|
|
// search will not be optimal, since we only build tables for a suffix
|
|
// of the string, but it is a safe approximation.
|
|
static const int kBMMaxShift = Isolate::kBMMaxShift;
|
|
|
|
// Reduce alphabet to this size.
|
|
// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
|
|
// proportional to the input alphabet. We reduce the alphabet size by
|
|
// equating input characters modulo a smaller alphabet size. This gives
|
|
// a potentially less efficient searching, but is a safe approximation.
|
|
// For needles using only characters in the same Unicode 256-code point page,
|
|
// there is no search speed degradation.
|
|
static const int kLatin1AlphabetSize = 256;
|
|
static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
|
|
|
|
// Bad-char shift table stored in the state. It's length is the alphabet size.
|
|
// For patterns below this length, the skip length of Boyer-Moore is too short
|
|
// to compensate for the algorithmic overhead compared to simple brute force.
|
|
static const int kBMMinPatternLength = 7;
|
|
|
|
static inline bool IsOneByteString(Vector<const uint8_t> string) {
|
|
return true;
|
|
}
|
|
|
|
static inline bool IsOneByteString(Vector<const uc16> string) {
|
|
return String::IsOneByte(string.start(), string.length());
|
|
}
|
|
|
|
friend class Isolate;
|
|
};
|
|
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
class StringSearch : private StringSearchBase {
|
|
public:
|
|
StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
|
|
: isolate_(isolate),
|
|
pattern_(pattern),
|
|
start_(Max(0, pattern.length() - kBMMaxShift)) {
|
|
if (sizeof(PatternChar) > sizeof(SubjectChar)) {
|
|
if (!IsOneByteString(pattern_)) {
|
|
strategy_ = &FailSearch;
|
|
return;
|
|
}
|
|
}
|
|
int pattern_length = pattern_.length();
|
|
if (pattern_length < kBMMinPatternLength) {
|
|
if (pattern_length == 1) {
|
|
strategy_ = &SingleCharSearch;
|
|
return;
|
|
}
|
|
strategy_ = &LinearSearch;
|
|
return;
|
|
}
|
|
strategy_ = &InitialSearch;
|
|
}
|
|
|
|
int Search(Vector<const SubjectChar> subject, int index) {
|
|
return strategy_(this, subject, index);
|
|
}
|
|
|
|
static inline int AlphabetSize() {
|
|
if (sizeof(PatternChar) == 1) {
|
|
// Latin1 needle.
|
|
return kLatin1AlphabetSize;
|
|
} else {
|
|
DCHECK(sizeof(PatternChar) == 2);
|
|
// UC16 needle.
|
|
return kUC16AlphabetSize;
|
|
}
|
|
}
|
|
|
|
private:
|
|
typedef int (*SearchFunction)( // NOLINT - it's not a cast!
|
|
StringSearch<PatternChar, SubjectChar>*,
|
|
Vector<const SubjectChar>,
|
|
int);
|
|
|
|
static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
|
|
Vector<const SubjectChar>,
|
|
int) {
|
|
return -1;
|
|
}
|
|
|
|
static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index);
|
|
|
|
static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index);
|
|
|
|
static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index);
|
|
|
|
static int BoyerMooreHorspoolSearch(
|
|
StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index);
|
|
|
|
static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index);
|
|
|
|
void PopulateBoyerMooreHorspoolTable();
|
|
|
|
void PopulateBoyerMooreTable();
|
|
|
|
static inline bool exceedsOneByte(uint8_t c) {
|
|
return false;
|
|
}
|
|
|
|
static inline bool exceedsOneByte(uint16_t c) {
|
|
return c > String::kMaxOneByteCharCodeU;
|
|
}
|
|
|
|
static inline int CharOccurrence(int* bad_char_occurrence,
|
|
SubjectChar char_code) {
|
|
if (sizeof(SubjectChar) == 1) {
|
|
return bad_char_occurrence[static_cast<int>(char_code)];
|
|
}
|
|
if (sizeof(PatternChar) == 1) {
|
|
if (exceedsOneByte(char_code)) {
|
|
return -1;
|
|
}
|
|
return bad_char_occurrence[static_cast<unsigned int>(char_code)];
|
|
}
|
|
// Both pattern and subject are UC16. Reduce character to equivalence class.
|
|
int equiv_class = char_code % kUC16AlphabetSize;
|
|
return bad_char_occurrence[equiv_class];
|
|
}
|
|
|
|
// The following tables are shared by all searches.
|
|
// TODO(lrn): Introduce a way for a pattern to keep its tables
|
|
// between searches (e.g., for an Atom RegExp).
|
|
|
|
// Store for the BoyerMoore(Horspool) bad char shift table.
|
|
// Return a table covering the last kBMMaxShift+1 positions of
|
|
// pattern.
|
|
int* bad_char_table() {
|
|
return isolate_->bad_char_shift_table();
|
|
}
|
|
|
|
// Store for the BoyerMoore good suffix shift table.
|
|
int* good_suffix_shift_table() {
|
|
// Return biased pointer that maps the range [start_..pattern_.length()
|
|
// to the kGoodSuffixShiftTable array.
|
|
return isolate_->good_suffix_shift_table() - start_;
|
|
}
|
|
|
|
// Table used temporarily while building the BoyerMoore good suffix
|
|
// shift table.
|
|
int* suffix_table() {
|
|
// Return biased pointer that maps the range [start_..pattern_.length()
|
|
// to the kSuffixTable array.
|
|
return isolate_->suffix_table() - start_;
|
|
}
|
|
|
|
Isolate* isolate_;
|
|
// The pattern to search for.
|
|
Vector<const PatternChar> pattern_;
|
|
// Pointer to implementation of the search.
|
|
SearchFunction strategy_;
|
|
// Cache value of Max(0, pattern_length() - kBMMaxShift)
|
|
int start_;
|
|
};
|
|
|
|
|
|
//---------------------------------------------------------------------
|
|
// Single Character Pattern Search Strategy
|
|
//---------------------------------------------------------------------
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
|
|
StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int index) {
|
|
DCHECK_EQ(1, search->pattern_.length());
|
|
PatternChar pattern_first_char = search->pattern_[0];
|
|
int i = index;
|
|
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
|
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
|
memchr(subject.start() + i,
|
|
pattern_first_char,
|
|
subject.length() - i));
|
|
if (pos == NULL) return -1;
|
|
return static_cast<int>(pos - subject.start());
|
|
} else {
|
|
if (sizeof(PatternChar) > sizeof(SubjectChar)) {
|
|
if (exceedsOneByte(pattern_first_char)) {
|
|
return -1;
|
|
}
|
|
}
|
|
SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
|
|
int n = subject.length();
|
|
while (i < n) {
|
|
if (subject[i++] == search_char) return i - 1;
|
|
}
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Linear Search Strategy
|
|
//---------------------------------------------------------------------
|
|
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
inline bool CharCompare(const PatternChar* pattern,
|
|
const SubjectChar* subject,
|
|
int length) {
|
|
DCHECK(length > 0);
|
|
int pos = 0;
|
|
do {
|
|
if (pattern[pos] != subject[pos]) {
|
|
return false;
|
|
}
|
|
pos++;
|
|
} while (pos < length);
|
|
return true;
|
|
}
|
|
|
|
|
|
// Simple linear search for short patterns. Never bails out.
|
|
template <typename PatternChar, typename SubjectChar>
|
|
int StringSearch<PatternChar, SubjectChar>::LinearSearch(
|
|
StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int index) {
|
|
Vector<const PatternChar> pattern = search->pattern_;
|
|
DCHECK(pattern.length() > 1);
|
|
int pattern_length = pattern.length();
|
|
PatternChar pattern_first_char = pattern[0];
|
|
int i = index;
|
|
int n = subject.length() - pattern_length;
|
|
while (i <= n) {
|
|
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
|
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
|
memchr(subject.start() + i,
|
|
pattern_first_char,
|
|
n - i + 1));
|
|
if (pos == NULL) return -1;
|
|
i = static_cast<int>(pos - subject.start()) + 1;
|
|
} else {
|
|
if (subject[i++] != pattern_first_char) continue;
|
|
}
|
|
// Loop extracted to separate function to allow using return to do
|
|
// a deeper break.
|
|
if (CharCompare(pattern.start() + 1,
|
|
subject.start() + i,
|
|
pattern_length - 1)) {
|
|
return i - 1;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Boyer-Moore string search
|
|
//---------------------------------------------------------------------
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
|
|
StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index) {
|
|
Vector<const PatternChar> pattern = search->pattern_;
|
|
int subject_length = subject.length();
|
|
int pattern_length = pattern.length();
|
|
// Only preprocess at most kBMMaxShift last characters of pattern.
|
|
int start = search->start_;
|
|
|
|
int* bad_char_occurence = search->bad_char_table();
|
|
int* good_suffix_shift = search->good_suffix_shift_table();
|
|
|
|
PatternChar last_char = pattern[pattern_length - 1];
|
|
int index = start_index;
|
|
// Continue search from i.
|
|
while (index <= subject_length - pattern_length) {
|
|
int j = pattern_length - 1;
|
|
int c;
|
|
while (last_char != (c = subject[index + j])) {
|
|
int shift =
|
|
j - CharOccurrence(bad_char_occurence, c);
|
|
index += shift;
|
|
if (index > subject_length - pattern_length) {
|
|
return -1;
|
|
}
|
|
}
|
|
while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
|
|
if (j < 0) {
|
|
return index;
|
|
} else if (j < start) {
|
|
// we have matched more than our tables allow us to be smart about.
|
|
// Fall back on BMH shift.
|
|
index += pattern_length - 1
|
|
- CharOccurrence(bad_char_occurence,
|
|
static_cast<SubjectChar>(last_char));
|
|
} else {
|
|
int gs_shift = good_suffix_shift[j + 1];
|
|
int bc_occ =
|
|
CharOccurrence(bad_char_occurence, c);
|
|
int shift = j - bc_occ;
|
|
if (gs_shift > shift) {
|
|
shift = gs_shift;
|
|
}
|
|
index += shift;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
|
|
int pattern_length = pattern_.length();
|
|
const PatternChar* pattern = pattern_.start();
|
|
// Only look at the last kBMMaxShift characters of pattern (from start_
|
|
// to pattern_length).
|
|
int start = start_;
|
|
int length = pattern_length - start;
|
|
|
|
// Biased tables so that we can use pattern indices as table indices,
|
|
// even if we only cover the part of the pattern from offset start.
|
|
int* shift_table = good_suffix_shift_table();
|
|
int* suffix_table = this->suffix_table();
|
|
|
|
// Initialize table.
|
|
for (int i = start; i < pattern_length; i++) {
|
|
shift_table[i] = length;
|
|
}
|
|
shift_table[pattern_length] = 1;
|
|
suffix_table[pattern_length] = pattern_length + 1;
|
|
|
|
if (pattern_length <= start) {
|
|
return;
|
|
}
|
|
|
|
// Find suffixes.
|
|
PatternChar last_char = pattern[pattern_length - 1];
|
|
int suffix = pattern_length + 1;
|
|
{
|
|
int i = pattern_length;
|
|
while (i > start) {
|
|
PatternChar c = pattern[i - 1];
|
|
while (suffix <= pattern_length && c != pattern[suffix - 1]) {
|
|
if (shift_table[suffix] == length) {
|
|
shift_table[suffix] = suffix - i;
|
|
}
|
|
suffix = suffix_table[suffix];
|
|
}
|
|
suffix_table[--i] = --suffix;
|
|
if (suffix == pattern_length) {
|
|
// No suffix to extend, so we check against last_char only.
|
|
while ((i > start) && (pattern[i - 1] != last_char)) {
|
|
if (shift_table[pattern_length] == length) {
|
|
shift_table[pattern_length] = pattern_length - i;
|
|
}
|
|
suffix_table[--i] = pattern_length;
|
|
}
|
|
if (i > start) {
|
|
suffix_table[--i] = --suffix;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Build shift table using suffixes.
|
|
if (suffix < pattern_length) {
|
|
for (int i = start; i <= pattern_length; i++) {
|
|
if (shift_table[i] == length) {
|
|
shift_table[i] = suffix - start;
|
|
}
|
|
if (i == suffix) {
|
|
suffix = suffix_table[suffix];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Boyer-Moore-Horspool string search.
|
|
//---------------------------------------------------------------------
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
|
|
StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int start_index) {
|
|
Vector<const PatternChar> pattern = search->pattern_;
|
|
int subject_length = subject.length();
|
|
int pattern_length = pattern.length();
|
|
int* char_occurrences = search->bad_char_table();
|
|
int badness = -pattern_length;
|
|
|
|
// How bad we are doing without a good-suffix table.
|
|
PatternChar last_char = pattern[pattern_length - 1];
|
|
int last_char_shift = pattern_length - 1 -
|
|
CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
|
|
// Perform search
|
|
int index = start_index; // No matches found prior to this index.
|
|
while (index <= subject_length - pattern_length) {
|
|
int j = pattern_length - 1;
|
|
int subject_char;
|
|
while (last_char != (subject_char = subject[index + j])) {
|
|
int bc_occ = CharOccurrence(char_occurrences, subject_char);
|
|
int shift = j - bc_occ;
|
|
index += shift;
|
|
badness += 1 - shift; // at most zero, so badness cannot increase.
|
|
if (index > subject_length - pattern_length) {
|
|
return -1;
|
|
}
|
|
}
|
|
j--;
|
|
while (j >= 0 && pattern[j] == (subject[index + j])) j--;
|
|
if (j < 0) {
|
|
return index;
|
|
} else {
|
|
index += last_char_shift;
|
|
// Badness increases by the number of characters we have
|
|
// checked, and decreases by the number of characters we
|
|
// can skip by shifting. It's a measure of how we are doing
|
|
// compared to reading each character exactly once.
|
|
badness += (pattern_length - j) - last_char_shift;
|
|
if (badness > 0) {
|
|
search->PopulateBoyerMooreTable();
|
|
search->strategy_ = &BoyerMooreSearch;
|
|
return BoyerMooreSearch(search, subject, index);
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
|
|
template <typename PatternChar, typename SubjectChar>
|
|
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
|
|
int pattern_length = pattern_.length();
|
|
|
|
int* bad_char_occurrence = bad_char_table();
|
|
|
|
// Only preprocess at most kBMMaxShift last characters of pattern.
|
|
int start = start_;
|
|
// Run forwards to populate bad_char_table, so that *last* instance
|
|
// of character equivalence class is the one registered.
|
|
// Notice: Doesn't include the last character.
|
|
int table_size = AlphabetSize();
|
|
if (start == 0) { // All patterns less than kBMMaxShift in length.
|
|
memset(bad_char_occurrence,
|
|
-1,
|
|
table_size * sizeof(*bad_char_occurrence));
|
|
} else {
|
|
for (int i = 0; i < table_size; i++) {
|
|
bad_char_occurrence[i] = start - 1;
|
|
}
|
|
}
|
|
for (int i = start; i < pattern_length - 1; i++) {
|
|
PatternChar c = pattern_[i];
|
|
int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
|
|
bad_char_occurrence[bucket] = i;
|
|
}
|
|
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Linear string search with bailout to BMH.
|
|
//---------------------------------------------------------------------
|
|
|
|
// Simple linear search for short patterns, which bails out if the string
|
|
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
|
|
template <typename PatternChar, typename SubjectChar>
|
|
int StringSearch<PatternChar, SubjectChar>::InitialSearch(
|
|
StringSearch<PatternChar, SubjectChar>* search,
|
|
Vector<const SubjectChar> subject,
|
|
int index) {
|
|
Vector<const PatternChar> pattern = search->pattern_;
|
|
int pattern_length = pattern.length();
|
|
// Badness is a count of how much work we have done. When we have
|
|
// done enough work we decide it's probably worth switching to a better
|
|
// algorithm.
|
|
int badness = -10 - (pattern_length << 2);
|
|
|
|
// We know our pattern is at least 2 characters, we cache the first so
|
|
// the common case of the first character not matching is faster.
|
|
PatternChar pattern_first_char = pattern[0];
|
|
for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
|
|
badness++;
|
|
if (badness <= 0) {
|
|
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
|
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
|
memchr(subject.start() + i,
|
|
pattern_first_char,
|
|
n - i + 1));
|
|
if (pos == NULL) {
|
|
return -1;
|
|
}
|
|
i = static_cast<int>(pos - subject.start());
|
|
} else {
|
|
if (subject[i] != pattern_first_char) continue;
|
|
}
|
|
int j = 1;
|
|
do {
|
|
if (pattern[j] != subject[i + j]) {
|
|
break;
|
|
}
|
|
j++;
|
|
} while (j < pattern_length);
|
|
if (j == pattern_length) {
|
|
return i;
|
|
}
|
|
badness += j;
|
|
} else {
|
|
search->PopulateBoyerMooreHorspoolTable();
|
|
search->strategy_ = &BoyerMooreHorspoolSearch;
|
|
return BoyerMooreHorspoolSearch(search, subject, i);
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
|
|
// Perform a a single stand-alone search.
|
|
// If searching multiple times for the same pattern, a search
|
|
// object should be constructed once and the Search function then called
|
|
// for each search.
|
|
template <typename SubjectChar, typename PatternChar>
|
|
int SearchString(Isolate* isolate,
|
|
Vector<const SubjectChar> subject,
|
|
Vector<const PatternChar> pattern,
|
|
int start_index) {
|
|
StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
|
|
return search.Search(subject, start_index);
|
|
}
|
|
|
|
}} // namespace v8::internal
|
|
|
|
#endif // V8_STRING_SEARCH_H_
|