v8/src/string-search.h

// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_STRING_SEARCH_H_
#define V8_STRING_SEARCH_H_

#include "src/isolate.h"
#include "src/vector.h"

namespace v8 {
namespace internal {


//---------------------------------------------------------------------
// String Search object.
//---------------------------------------------------------------------

// Class holding constants and methods that apply to all string search variants,
// independently of subject and pattern char size.
class StringSearchBase {
 protected:
  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
  // limit, we can fix the size of tables. For a needle longer than this limit,
  // search will not be optimal, since we only build tables for a suffix
  // of the string, but it is a safe approximation.
  static const int kBMMaxShift = Isolate::kBMMaxShift;

  // Reduce alphabet to this size.
  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
  // proportional to the input alphabet. We reduce the alphabet size by
  // equating input characters modulo a smaller alphabet size. This gives
  // a potentially less efficient searching, but is a safe approximation.
  // For needles using only characters in the same Unicode 256-code point page,
  // there is no search speed degradation.
  static const int kLatin1AlphabetSize = 256;
  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;

  // Bad-char shift table stored in the state. It's length is the alphabet size.
  // For patterns below this length, the skip length of Boyer-Moore is too short
  // to compensate for the algorithmic overhead compared to simple brute force.
  static const int kBMMinPatternLength = 7;

  static inline bool IsOneByteString(Vector<const uint8_t> string) {
    return true;
  }

  static inline bool IsOneByteString(Vector<const uc16> string) {
    return String::IsOneByte(string.start(), string.length());
  }

  friend class Isolate;
};


template <typename PatternChar, typename SubjectChar>
class StringSearch : private StringSearchBase {
 public:
  StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
      : isolate_(isolate),
        pattern_(pattern),
        start_(Max(0, pattern.length() - kBMMaxShift)) {
    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
      if (!IsOneByteString(pattern_)) {
        strategy_ = &FailSearch;
        return;
      }
    }
    int pattern_length = pattern_.length();
    if (pattern_length < kBMMinPatternLength) {
      if (pattern_length == 1) {
        strategy_ = &SingleCharSearch;
        return;
      }
      strategy_ = &LinearSearch;
      return;
    }
    strategy_ = &InitialSearch;
  }

  int Search(Vector<const SubjectChar> subject, int index) {
    return strategy_(this, subject, index);
  }

  static inline int AlphabetSize() {
    if (sizeof(PatternChar) == 1) {
      // Latin1 needle.
      return kLatin1AlphabetSize;
    } else {
      DCHECK(sizeof(PatternChar) == 2);
      // UC16 needle.
      return kUC16AlphabetSize;
    }
  }

 private:
  typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
      StringSearch<PatternChar, SubjectChar>*,
      Vector<const SubjectChar>,
      int);

  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
                        Vector<const SubjectChar>,
                        int) {
    return -1;
  }

  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
                              Vector<const SubjectChar> subject,
                              int start_index);

  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
                          Vector<const SubjectChar> subject,
                          int start_index);

  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
                           Vector<const SubjectChar> subject,
                           int start_index);

  static int BoyerMooreHorspoolSearch(
      StringSearch<PatternChar, SubjectChar>* search,
      Vector<const SubjectChar> subject,
      int start_index);

  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
                              Vector<const SubjectChar> subject,
                              int start_index);

  void PopulateBoyerMooreHorspoolTable();

  void PopulateBoyerMooreTable();

  static inline bool exceedsOneByte(uint8_t c) {
    return false;
  }

  static inline bool exceedsOneByte(uint16_t c) {
    return c > String::kMaxOneByteCharCodeU;
  }

  static inline int CharOccurrence(int* bad_char_occurrence,
                                   SubjectChar char_code) {
    if (sizeof(SubjectChar) == 1) {
      return bad_char_occurrence[static_cast<int>(char_code)];
    }
    if (sizeof(PatternChar) == 1) {
      if (exceedsOneByte(char_code)) {
        return -1;
      }
      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
    }
    // Both pattern and subject are UC16. Reduce character to equivalence class.
    int equiv_class = char_code % kUC16AlphabetSize;
    return bad_char_occurrence[equiv_class];
  }

  // The following tables are shared by all searches.
  // TODO(lrn): Introduce a way for a pattern to keep its tables
  // between searches (e.g., for an Atom RegExp).

  // Store for the BoyerMoore(Horspool) bad char shift table.
  // Return a table covering the last kBMMaxShift+1 positions of
  // pattern.
  int* bad_char_table() {
    return isolate_->bad_char_shift_table();
  }

  // Store for the BoyerMoore good suffix shift table.
  int* good_suffix_shift_table() {
    // Return biased pointer that maps the range  [start_..pattern_.length()
    // to the kGoodSuffixShiftTable array.
    return isolate_->good_suffix_shift_table() - start_;
  }

  // Table used temporarily while building the BoyerMoore good suffix
  // shift table.
  int* suffix_table() {
    // Return biased pointer that maps the range  [start_..pattern_.length()
    // to the kSuffixTable array.
    return isolate_->suffix_table() - start_;
  }

  Isolate* isolate_;
  // The pattern to search for.
  Vector<const PatternChar> pattern_;
  // Pointer to implementation of the search.
  SearchFunction strategy_;
  // Cache value of Max(0, pattern_length() - kBMMaxShift)
  int start_;
};


template <typename PatternChar, typename SubjectChar>
int FindFirstCharacter(Vector<const PatternChar> pattern,
                       Vector<const SubjectChar> subject, int index) {
  PatternChar pattern_first_char = pattern[0];
  const int max_n = (subject.length() - pattern.length() + 1);

  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    DCHECK_GE(max_n - index, 0);
    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
        memchr(subject.start() + index, pattern_first_char, max_n - index));
    if (char_pos == NULL) return -1;
    return static_cast<int>(char_pos - subject.start());
  } else {
    const uint8_t search_low_byte =
        static_cast<uint8_t>(pattern_first_char & 0xFF);
    const SubjectChar search_char =
        static_cast<SubjectChar>(pattern_first_char);
    int pos = index;
    do {
      DCHECK_GE(max_n - pos, 0);
      const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
          memchr(subject.start() + pos, search_low_byte,
                 (max_n - pos) * sizeof(SubjectChar)));
      if (char_pos == NULL) return -1;
      pos = static_cast<int>(char_pos - subject.start());
      if (IsAligned(reinterpret_cast<uintptr_t>(char_pos),
                    sizeof(SubjectChar))) {
        if (subject[pos] == search_char) return pos;
      }
    } while (++pos < max_n);
  }
  return -1;
}


//---------------------------------------------------------------------
// Single Character Pattern Search Strategy
//---------------------------------------------------------------------

template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    Vector<const SubjectChar> subject,
    int index) {
  DCHECK_EQ(1, search->pattern_.length());
  PatternChar pattern_first_char = search->pattern_[0];
  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    return FindFirstCharacter(search->pattern_, subject, index);
  } else {
    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
      if (exceedsOneByte(pattern_first_char)) {
        return -1;
      }
    }
    return FindFirstCharacter(search->pattern_, subject, index);
  }
}

//---------------------------------------------------------------------
// Linear Search Strategy
//---------------------------------------------------------------------


template <typename PatternChar, typename SubjectChar>
inline bool CharCompare(const PatternChar* pattern,
                        const SubjectChar* subject,
                        int length) {
  DCHECK(length > 0);
  int pos = 0;
  do {
    if (pattern[pos] != subject[pos]) {
      return false;
    }
    pos++;
  } while (pos < length);
  return true;
}


// Simple linear search for short patterns. Never bails out.
template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    Vector<const SubjectChar> subject,
    int index) {
  Vector<const PatternChar> pattern = search->pattern_;
  DCHECK(pattern.length() > 1);
  int pattern_length = pattern.length();
  int i = index;
  int n = subject.length() - pattern_length;
  while (i <= n) {
    i = FindFirstCharacter(pattern, subject, i);
    if (i == -1) return -1;
    DCHECK_LE(i, n);
    i++;
    // Loop extracted to separate function to allow using return to do
    // a deeper break.
    if (CharCompare(pattern.start() + 1,
                    subject.start() + i,
                    pattern_length - 1)) {
      return i - 1;
    }
  }
  return -1;
}

//---------------------------------------------------------------------
// Boyer-Moore string search
//---------------------------------------------------------------------

template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    Vector<const SubjectChar> subject,
    int start_index) {
  Vector<const PatternChar> pattern = search->pattern_;
  int subject_length = subject.length();
  int pattern_length = pattern.length();
  // Only preprocess at most kBMMaxShift last characters of pattern.
  int start = search->start_;

  int* bad_char_occurence = search->bad_char_table();
  int* good_suffix_shift = search->good_suffix_shift_table();

  PatternChar last_char = pattern[pattern_length - 1];
  int index = start_index;
  // Continue search from i.
  while (index <= subject_length - pattern_length) {
    int j = pattern_length - 1;
    int c;
    while (last_char != (c = subject[index + j])) {
      int shift =
          j - CharOccurrence(bad_char_occurence, c);
      index += shift;
      if (index > subject_length - pattern_length) {
        return -1;
      }
    }
    while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
    if (j < 0) {
      return index;
    } else if (j < start) {
      // we have matched more than our tables allow us to be smart about.
      // Fall back on BMH shift.
      index += pattern_length - 1
          - CharOccurrence(bad_char_occurence,
                           static_cast<SubjectChar>(last_char));
    } else {
      int gs_shift = good_suffix_shift[j + 1];
      int bc_occ =
          CharOccurrence(bad_char_occurence, c);
      int shift = j - bc_occ;
      if (gs_shift > shift) {
        shift = gs_shift;
      }
      index += shift;
    }
  }

  return -1;
}


template <typename PatternChar, typename SubjectChar>
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
  int pattern_length = pattern_.length();
  const PatternChar* pattern = pattern_.start();
  // Only look at the last kBMMaxShift characters of pattern (from start_
  // to pattern_length).
  int start = start_;
  int length = pattern_length - start;

  // Biased tables so that we can use pattern indices as table indices,
  // even if we only cover the part of the pattern from offset start.
  int* shift_table = good_suffix_shift_table();
  int* suffix_table = this->suffix_table();

  // Initialize table.
  for (int i = start; i < pattern_length; i++) {
    shift_table[i] = length;
  }
  shift_table[pattern_length] = 1;
  suffix_table[pattern_length] = pattern_length + 1;

  if (pattern_length <= start) {
    return;
  }

  // Find suffixes.
  PatternChar last_char = pattern[pattern_length - 1];
  int suffix = pattern_length + 1;
  {
    int i = pattern_length;
    while (i > start) {
      PatternChar c = pattern[i - 1];
      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
        if (shift_table[suffix] == length) {
          shift_table[suffix] = suffix - i;
        }
        suffix = suffix_table[suffix];
      }
      suffix_table[--i] = --suffix;
      if (suffix == pattern_length) {
        // No suffix to extend, so we check against last_char only.
        while ((i > start) && (pattern[i - 1] != last_char)) {
          if (shift_table[pattern_length] == length) {
            shift_table[pattern_length] = pattern_length - i;
          }
          suffix_table[--i] = pattern_length;
        }
        if (i > start) {
          suffix_table[--i] = --suffix;
        }
      }
    }
  }
  // Build shift table using suffixes.
  if (suffix < pattern_length) {
    for (int i = start; i <= pattern_length; i++) {
      if (shift_table[i] == length) {
        shift_table[i] = suffix - start;
      }
      if (i == suffix) {
        suffix = suffix_table[suffix];
      }
    }
  }
}

//---------------------------------------------------------------------
// Boyer-Moore-Horspool string search.
//---------------------------------------------------------------------

template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    Vector<const SubjectChar> subject,
    int start_index) {
  Vector<const PatternChar> pattern = search->pattern_;
  int subject_length = subject.length();
  int pattern_length = pattern.length();
  int* char_occurrences = search->bad_char_table();
  int badness = -pattern_length;

  // How bad we are doing without a good-suffix table.
  PatternChar last_char = pattern[pattern_length - 1];
  int last_char_shift = pattern_length - 1 -
      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
  // Perform search
  int index = start_index;  // No matches found prior to this index.
  while (index <= subject_length - pattern_length) {
    int j = pattern_length - 1;
    int subject_char;
    while (last_char != (subject_char = subject[index + j])) {
      int bc_occ = CharOccurrence(char_occurrences, subject_char);
      int shift = j - bc_occ;
      index += shift;
      badness += 1 - shift;  // at most zero, so badness cannot increase.
      if (index > subject_length - pattern_length) {
        return -1;
      }
    }
    j--;
    while (j >= 0 && pattern[j] == (subject[index + j])) j--;
    if (j < 0) {
      return index;
    } else {
      index += last_char_shift;
      // Badness increases by the number of characters we have
      // checked, and decreases by the number of characters we
      // can skip by shifting. It's a measure of how we are doing
      // compared to reading each character exactly once.
      badness += (pattern_length - j) - last_char_shift;
      if (badness > 0) {
        search->PopulateBoyerMooreTable();
        search->strategy_ = &BoyerMooreSearch;
        return BoyerMooreSearch(search, subject, index);
      }
    }
  }
  return -1;
}


template <typename PatternChar, typename SubjectChar>
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
  int pattern_length = pattern_.length();

  int* bad_char_occurrence = bad_char_table();

  // Only preprocess at most kBMMaxShift last characters of pattern.
  int start = start_;
  // Run forwards to populate bad_char_table, so that *last* instance
  // of character equivalence class is the one registered.
  // Notice: Doesn't include the last character.
  int table_size = AlphabetSize();
  if (start == 0) {  // All patterns less than kBMMaxShift in length.
    memset(bad_char_occurrence,
           -1,
           table_size * sizeof(*bad_char_occurrence));
  } else {
    for (int i = 0; i < table_size; i++) {
      bad_char_occurrence[i] = start - 1;
    }
  }
  for (int i = start; i < pattern_length - 1; i++) {
    PatternChar c = pattern_[i];
    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
    bad_char_occurrence[bucket] = i;
  }
}

//---------------------------------------------------------------------
// Linear string search with bailout to BMH.
//---------------------------------------------------------------------

// Simple linear search for short patterns, which bails out if the string
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::InitialSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    Vector<const SubjectChar> subject,
    int index) {
  Vector<const PatternChar> pattern = search->pattern_;
  int pattern_length = pattern.length();
  // Badness is a count of how much work we have done.  When we have
  // done enough work we decide it's probably worth switching to a better
  // algorithm.
  int badness = -10 - (pattern_length << 2);

  // We know our pattern is at least 2 characters, we cache the first so
  // the common case of the first character not matching is faster.
  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
    badness++;
    if (badness <= 0) {
      i = FindFirstCharacter(pattern, subject, i);
      if (i == -1) return -1;
      DCHECK_LE(i, n);
      int j = 1;
      do {
        if (pattern[j] != subject[i + j]) {
          break;
        }
        j++;
      } while (j < pattern_length);
      if (j == pattern_length) {
        return i;
      }
      badness += j;
    } else {
      search->PopulateBoyerMooreHorspoolTable();
      search->strategy_ = &BoyerMooreHorspoolSearch;
      return BoyerMooreHorspoolSearch(search, subject, i);
    }
  }
  return -1;
}


// Perform a a single stand-alone search.
// If searching multiple times for the same pattern, a search
// object should be constructed once and the Search function then called
// for each search.
template <typename SubjectChar, typename PatternChar>
int SearchString(Isolate* isolate,
                 Vector<const SubjectChar> subject,
                 Vector<const PatternChar> pattern,
                 int start_index) {
  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
  return search.Search(subject, start_index);
}

}}  // namespace v8::internal

#endif  // V8_STRING_SEARCH_H_