Most operations are faster than before.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@492 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2008-10-14 08:57:31 +00:00 · 2008-10-14 08:57:31 +00:00 · 9e0609db8e
commit 9e0609db8e
parent 49c5ed0029
2 changed files with 265 additions and 124 deletions
--- a/src/runtime.cc
+++ b/src/runtime.cc
@ -952,6 +952,145 @@ static Object* Runtime_CharFromCode(Arguments args) {
 }


+// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
+// limit, we can fix the size of tables.
+static const int kBMMaxShift = 0xff;
+static const int kBMAlphabetSize = 0x100;  // Reduce alphabet to this size.
+
+// Holds the two buffers used by Boyer-Moore string search's Good Suffix
+// shift. Only allows the last kBMMaxShift characters of the needle
+// to be indexed.
+class BMGoodSuffixBuffers: public AllStatic {
+ public:
+  BMGoodSuffixBuffers() {}
+  inline void init(int needle_length) {
+    ASSERT(needle_length > 1);
+    int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
+    int len = needle_length - start;
+    biased_suffixes_ = suffixes_ - start;
+    biased_good_suffix_shift_ = good_suffix_shift_ - start;
+    for (int i = 0; i <= len; i++) {
+      good_suffix_shift_[i] = len;
+    }
+  }
+  inline int& suffix(int index) {
+    ASSERT(biased_suffixes_ + index >= suffixes_);
+    return biased_suffixes_[index];
+  }
+  inline int& shift(int index) {
+    ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
+    return biased_good_suffix_shift_[index];
+  }
+ private:
+  int suffixes_[kBMMaxShift + 1];
+  int good_suffix_shift_[kBMMaxShift + 1];
+  int *biased_suffixes_;
+  int *biased_good_suffix_shift_;
+  DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
+};
+
+// Static buffers reused across calls to BoyerMooreIndexOf.
+static int bad_char_occurence[kBMAlphabetSize];
+static BMGoodSuffixBuffers bmgs_buffers;
+
+// Restricted Boyer-Moore string matching. Restricts tables to a
+// suffix of long pattern strings and handles only equivalence classes
+// of the full alphabet. This allows us to ensure that tables take only
+// a fixed amount of space.
+template <typename schar, typename pchar>
+static int BoyerMooreIndexOf(Vector<const schar> subject,
+                             Vector<const pchar> pattern,
+                             int start_index) {
+  int m = pattern.length();
+  int n = subject.length();
+
+  // Only preprocess at most kBMMaxShift last characters of pattern.
+  int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
+  int len = m - start;
+
+  // Run forwards to populate bad_char_occurence, so that the *last* index
+  // of each character equivalence class is the one registered.
+  // Notice: As written (i < m), this includes the last pattern character.
+  for (int i = 0; i < kBMAlphabetSize; i++) {
+    bad_char_occurence[i] = start - 1;
+  }
+  for (int i = start; i < m; i++) {
+    uc32 c = pattern[i];
+    bad_char_occurence[c % kBMAlphabetSize] = i;
+    if (sizeof(schar) == 1 &&
+        sizeof(pchar) > 1 &&
+        c > String::kMaxAsciiCharCode) {
+      return -1;
+    }
+  }
+  // End of Bad Char computation.
+
+  // Compute Good Suffix shift table.
+  bmgs_buffers.init(m);
+
+  bmgs_buffers.shift(m-1) = 1;
+  bmgs_buffers.suffix(m) = m + 1;
+  pchar last_char = pattern[m - 1];
+  int suffix = m + 1;
+  for (int i = m; i > start;) {
+    for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
+      if (bmgs_buffers.shift(suffix) == len) {
+        bmgs_buffers.shift(suffix) = suffix - i;
+      }
+      suffix = bmgs_buffers.suffix(suffix);
+    }
+    i--;
+    suffix--;
+    bmgs_buffers.suffix(i) = suffix;
+    if (suffix == m) {
+      // No suffix to extend, so we compare against last_char only.
+      while (i > start && pattern[i - 1] != last_char) {
+        if (bmgs_buffers.shift(m) == len) {
+          bmgs_buffers.shift(m) = m - i;
+        }
+        i--;
+        bmgs_buffers.suffix(i) = m;
+      }
+      if (i > start) {
+        i--;
+        suffix--;
+        bmgs_buffers.suffix(i) = suffix;
+      }
+    }
+  }
+  if (suffix < m) {
+    for (int i = start; i <= m; i++) {
+      if (bmgs_buffers.shift(i) == len) {
+        bmgs_buffers.shift(i) = suffix - start;
+      }
+      if (i == suffix) {
+        suffix = bmgs_buffers.suffix(suffix);
+      }
+    }
+  }
+  // End of Good Suffix computation.
+
+
+  // Perform search
+  for (int i = start_index; i <= n - m;) {
+    int j = m - 1;
+    schar c;
+    while (j >= 0 && pattern[j] == (c = subject[i + j])) j--;
+    if (j < 0) {
+      return i;
+    } else if (j < start) {
+      // Matched past the part of the pattern our tables cover; shift by one.
+      i += 1;
+    } else {
+      int gs_shift = bmgs_buffers.shift(j + 1);
+      int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
+      int bc_shift = j - bc_occ;
+      i += (gs_shift > bc_shift) ? gs_shift : bc_shift;
+    }
+  }
+  return -1;
+}
+
 template <typename schar, typename pchar>
 static int SingleCharIndexOf(Vector<const schar> string,
                             pchar pattern_char,
@ -976,124 +1115,12 @@ static int SimpleIndexOf(Vector<const schar> subject,
  pchar pattern_first_char = pattern[0];
  for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
    if (subject[i] != pattern_first_char) continue;
-
-    bool failure = false;
-    for (int j = 1; j < pattern_length; j++) {
-      if (pattern[j] != subject[j+i]) {
-        failure = true;
-        break;
-      }
-    }
-    if (!failure) {
-      return i;
-    }
-  }
-  return -1;
-}
-
-// Maximal length (+1) of suffix that is indexed. Also the size of the
-// maximal bad-character skip.
-static const int kBMHSignificantSuffixLength = 0xff;
-
-// Significant bits taken from characters to use in bad-character
-// skips, to reduce size of the table for Unicode letters.
-static const int kBMHSignificantBitsMask = 0xff;
-
-// Number of elements in bad-char table.
-static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;
-
-// Simplified Boyer-Moore string matching. Only uses bad-char skipping,
-// and restricts table to a suffix of long strings (also restricting
-// the maximum possible skip-length) in order to reduce space.
-template <typename schar, typename pchar>
-static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,
-                                     Vector<const pchar> pattern,
-                                     int start_index) {
-  ASSERT(kBMHSignificantSuffixLength < 0x100);  // We can use bytes as skips.
-  static byte bad_char_map[kBMHBadCharCount];
-
-  int m = pattern.length();
-  int n = subject.length();
-  // Cap bad char table to last p chars of pattern. Also max skip value.
-  int p = m < kBMHSignificantSuffixLength ?  m : kBMHSignificantSuffixLength;
-
-  memset(bad_char_map, p, kBMHBadCharCount);
-
-  // Run forwards to populate bad_char_table, so that *last* instance
-  // of character equivalence class is the one registered.
-  // Notice: Doesn't include last character.
-  for (int i = p < m ? m - p : 0; i < m - 1; i++) {
-    uc32 c = pattern[i];
-    if (sizeof(schar) == 1 &&
-        sizeof(pchar) > 1 &&
-        c > String::kMaxAsciiCharCode) {
-      return -1;
-    }
-    bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
-  }
-
-  for (int i = start_index + m - 1, j = m - 1; i < n;) {
-    schar c = subject[i];
-    if (c == pattern[j]) {
-      if (j == 0) {
+    int j = 1;
+    while (pattern[j] == subject[j+i]) {
+      j++;
+      if (j == pattern_length) {
        return i;
      }
-      j--;
-      i--;
-    } else {
-      int skip = bad_char_map[c & kBMHSignificantBitsMask];
-      if (skip < (m - j)) {
-        skip = m - j;
-      }
-      i += skip;
-      j = m - 1;
-    }
-  }
-  return -1;
-}
-
-
-// Full KMP pattern match.
-template <typename schar, typename pchar>  // Pattern & subject char types
-static int KMPIndexOf(Vector<const schar> subject,
-                      Vector<const pchar> pattern,
-                      int start_index) {
-  int subject_length = subject.length();
-  int pattern_length = pattern.length();
-  SmartPointer<int> next_table(NewArray<int>(pattern_length));
-
-  // Compute KMP "next" table
-  int i = 0;
-  int j = -1;
-  next_table[0] = -1;
-
-  pchar p = pattern[0];
-  while (i < pattern_length - 1) {
-    while (j > -1 && p != pattern[j]) {
-      j = next_table[j];
-    }
-    i++;
-    j++;
-    p = pattern[i];
-    if (p == pattern[j]) {
-      next_table[i] = next_table[j];
-    } else {
-      next_table[i] = j;
-    }
-  }
-
-  // Search using the 'next' table.
-  int pattern_index = 0;
-  int subject_index = start_index;
-  while (subject_index < subject_length) {
-    schar subject_char = subject[subject_index];
-    while (pattern_index > -1 && pattern[pattern_index] != subject_char) {
-      pattern_index = next_table[pattern_index];
-    }
-    pattern_index++;
-    subject_index++;
-    if (pattern_index >= pattern_length) {
-      return subject_index - pattern_index;
    }
  }
  return -1;
@ -1105,19 +1132,15 @@ static int StringMatchStrategy(Vector<const schar> sub,
                               Vector<const pchar> pat,
                               int start_index) {
  int pattern_length = pat.length();
-  // Searching for one specific character is common.  For one
-  // character patterns the KMP algorithm is guaranteed to slow down
-  // the search, so we just run through the subject string.
-  if (pattern_length == 1) {
-    return SingleCharIndexOf(sub, pat[0], start_index);
-  }
+  ASSERT(pattern_length > 1);

  // For small searches, a complex sort is not worth the setup overhead.
-  if (sub.length() - start_index < 25) {
+  int subject_length = sub.length() - start_index;
+  if (subject_length < 100 || pattern_length < 4) {
    return SimpleIndexOf(sub, pat, start_index);
  }

-  return BoyerMooreHorspoolIndexOf(sub, pat, start_index);
+  return BoyerMooreIndexOf(sub, pat, start_index);
 }

 // Perform string match of pattern on subject, starting at start index.
@ -1136,6 +1159,17 @@ int Runtime::StringMatch(Handle<String> sub,
  if (start_index + pattern_length > subject_length) return -1;

  FlattenString(sub);
+  // Searching for one specific character is common.  For one-character
+  // patterns a linear scan is unavoidable, so any smarter algorithm
+  // would only add overhead.
+  if (pattern_length == 1) {
+    AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
+    if (sub->is_ascii_representation()) {
+      return SingleCharIndexOf(sub->ToAsciiVector(), pat->Get(0), start_index);
+    }
+    return SingleCharIndexOf(sub->ToUC16Vector(), pat->Get(0), start_index);
+  }
+
  FlattenString(pat);

  AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
--- a/test/mjsunit/string-indexof.js
+++ b/test/mjsunit/string-indexof.js
@ -27,6 +27,12 @@

 var s = "test test test";

+assertEquals(0, s.indexOf("t"));
+assertEquals(3, s.indexOf("t", 1));
+assertEquals(5, s.indexOf("t", 4));
+assertEquals(1, s.indexOf("e"));
+assertEquals(2, s.indexOf("s"));
+
 assertEquals(5, s.indexOf("test", 4));
 assertEquals(5, s.indexOf("test", 5));
 assertEquals(10, s.indexOf("test", 6));
@ -47,3 +53,104 @@ assertEquals(4, reString.indexOf("[a-z]+"));
 assertEquals(10, reString.indexOf("(asdf)?"));

 assertEquals(1, String.prototype.indexOf.length);
+
+// Random greek letters
+var twoByteString = "\u039a\u0391\u03a3\u03a3\u0395";
+
+// Test single char pattern
+assertEquals(0, twoByteString.indexOf("\u039a"), "Lamda");
+assertEquals(1, twoByteString.indexOf("\u0391"), "Alpha");
+assertEquals(2, twoByteString.indexOf("\u03a3"), "First Sigma");
+assertEquals(3, twoByteString.indexOf("\u03a3",3), "Second Sigma");
+assertEquals(4, twoByteString.indexOf("\u0395"), "Epsilon");
+assertEquals(-1, twoByteString.indexOf("\u0392"), "Not beta");  
+
+// Test multi-char pattern
+assertEquals(0, twoByteString.indexOf("\u039a\u0391"), "lambda Alpha");
+assertEquals(1, twoByteString.indexOf("\u0391\u03a3"), "Alpha Sigma");
+assertEquals(2, twoByteString.indexOf("\u03a3\u03a3"), "Sigma Sigma");
+assertEquals(3, twoByteString.indexOf("\u03a3\u0395"), "Sigma Epsilon");
+
+assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"), 
+    "Not Alpha Sigma Epsilon");
+
+// Single char pattern
+assertEquals(4, twoByteString.indexOf("\u0395"));
+
+// Test complex string indexOf algorithms. Only trigger for long strings.
+
+// Long string that isn't a simple repeat of a shorter string.
+var long = "A";
+for(var i = 66; i < 76; i++) {  // from 'B' to 'K'
+  long =  long + String.fromCharCode(i) + long;
+}
+
+// pattern of 15 chars, repeated every 16 chars in long
+var pattern = "ABACABADABACABA";
+for(var i = 0; i < long.length - pattern.length; i+= 7) {
+  var index = long.indexOf(pattern, i);
+  assertEquals((i + 15) & ~0xf, index, "Long ABACABA...-string at index " + i);
+}
+assertEquals(510, long.indexOf("AJABACA"), "Long AJABACA, First J");
+assertEquals(1534, long.indexOf("AJABACA", 511), "Long AJABACA, Second J");
+
+pattern = "JABACABADABACABA";
+assertEquals(511, long.indexOf(pattern), "Long JABACABA..., First J");
+assertEquals(1535, long.indexOf(pattern, 512), "Long JABACABA..., Second J");
+
+
+var lipsum = "lorem ipsum per se esse fugiendum. itaque aiunt hanc quasi "
+    + "naturalem atque insitam in animis nostris inesse notionem, ut "
+    + "alterum esse appetendum, alterum aspernandum sentiamus. Alii autem,"
+    + " quibus ego assentior, cum a philosophis compluribus permulta "
+    + "dicantur, cur nec voluptas in bonis sit numeranda nec in malis "
+    + "dolor, non existimant oportere nimium nos causae confidere, sed et"
+    + " argumentandum et accurate disserendum et rationibus conquisitis de"
+    + " voluptate et dolore disputandum putant.\n"
+    + "Sed ut perspiciatis, unde omnis iste natus error sit voluptatem "
+    + "accusantium doloremque laudantium, totam rem aperiam eaque ipsa,"
+    + "quae ab illo inventore veritatis et quasi architecto beatae vitae "
+    + "dicta sunt, explicabo. nemo enim ipsam voluptatem, quia voluptas"
+    + "sit, aspernatur aut odit aut fugit, sed quia consequuntur magni"
+    + " dolores eos, qui ratione voluptatem sequi nesciunt, neque porro"
+    + " quisquam est, qui dolorem ipsum, quia dolor sit, amet, "
+    + "consectetur, adipisci velit, sed quia non numquam eius modi"
+    + " tempora incidunt, ut labore et dolore magnam aliquam quaerat "
+    + "voluptatem. ut enim ad minima veniam, quis nostrum exercitationem "
+    + "ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi "
+    + "consequatur? quis autem vel eum iure reprehenderit, qui in ea "
+    + "voluptate velit esse, quam nihil molestiae consequatur, vel illum, "
+    + "qui dolorem eum fugiat, quo voluptas nulla pariatur?\n"
+    + "At vero eos et accusamus et iusto odio dignissimos ducimus, qui "
+    + "blanditiis praesentium voluptatum deleniti atque corrupti, quos "
+    + "dolores et quas molestias excepturi sint, obcaecati cupiditate "
+    + "non provident, similique sunt in culpa, qui officia deserunt "
+    + "mollitia animi, id est laborum et dolorum fuga. et harum quidem "
+    + "rerum facilis est et expedita distinctio. nam libero tempore, "
+    + "cum soluta nobis est eligendi optio, cumque nihil impedit, quo "
+    + "minus id, quod maxime placeat, facere possimus, omnis voluptas "
+    + "assumenda est, omnis dolor repellendus. temporibus autem "
+    + "quibusdam et aut officiis debitis aut rerum necessitatibus "
+    + "saepe eveniet, ut et voluptates repudiandae sint et molestiae "
+    + "non recusandae. itaque earum rerum hic tenetur a sapiente "
+    + "delectus, ut aut reiciendis voluptatibus maiores alias consequatur "
+    + "aut perferendis doloribus asperiores repellat.";
+
+assertEquals(893, lipsum.indexOf("lorem ipsum, quia dolor sit, amet"),
+        "Lipsum");
+// test a lot of substrings of differing length and start-position.
+for(var i = 255; i < lipsum.length; i += 3) {
+  for(var len = 661; i + len < lipsum.length; len += 4) {
+    var substring = lipsum.substring(i, i + len);
+    var index = -1;
+    do {
+      index = lipsum.indexOf(substring, index + 1);
+      assertTrue(index != -1, 
+                 "Lipsum substring " + i + ".." + (i + len-1) + " not found");
+      assertEquals(lipsum.substring(index, index + len), substring, 
+          "Wrong lipsum substring found: " + i + ".." + (i + len - 1) + "/" + 
+              index + ".." + (index + len - 1));
+    } while (index >= 0 && index < i);
+    assertEquals(i, index, "Lipsum match at " + i + ".." + (i + len - 1));
+  }
+}