Converted String.prototype.split with string to C++.

Review URL: http://codereview.chromium.org/875001 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4134 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2010-03-15 15:27:11 +00:00 · 2010-03-15 15:27:11 +00:00 · d8aed420a9
commit d8aed420a9
parent 14547ef434
3 changed files with 261 additions and 48 deletions
--- a/src/runtime.cc
+++ b/src/runtime.cc
@ -2147,10 +2147,23 @@ class BMGoodSuffixBuffers {
 static int bad_char_occurrence[kBMAlphabetSize];
 static BMGoodSuffixBuffers bmgs_buffers;
 // State of the string match tables.
 // SIMPLE: No usable content in the buffers.
 // BOYER_MOORE_HORSPOOL: The bad_char_occurences table has been populated.
 // BOYER_MOORE: The bmgs_buffers tables have also been populated.
 // Whenever starting with a new needle, one should call InitializeStringSearch
 // to determine which search strategy to use, and in the case of a long-needle
 // strategy, the call also initializes the algorithm to SIMPLE.
 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
 static StringSearchAlgorithm algorithm;
 // Compute the bad-char table for Boyer-Moore in the static buffer.
 template <typename pchar>
-static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
+static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern) {
-                                          int start) {
+  // Only preprocess at most kBMMaxShift last characters of pattern.
  int start = pattern.length() < kBMMaxShift ? 0
                                             : pattern.length() - kBMMaxShift;
  // Run forwards to populate bad_char_table, so that *last* instance
  // of character equivalence class is the one registered.
  // Notice: Doesn't include the last character.
@ -2170,10 +2183,11 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
  }
 }
 template <typename pchar>
-static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
+static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern) {
                                              int start) {
  int m = pattern.length();
  int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
  int len = m - start;
  // Compute Good Suffix tables.
  bmgs_buffers.init(m);
@ -2220,6 +2234,7 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
  }
 }
 template <typename schar, typename pchar>
 static inline int CharOccurrence(int char_code) {
  if (sizeof(schar) == 1) {
@ -2234,6 +2249,7 @@ static inline int CharOccurrence(int char_code) {
  return bad_char_occurrence[char_code % kBMAlphabetSize];
 }
 // Restricted simplified Boyer-Moore string matching.
 // Uses only the bad-shift table of Boyer-Moore and only uses it
 // for the character compared to the last character of the needle.
@ -2242,14 +2258,13 @@ static int BoyerMooreHorspool(Vector<const schar> subject,
                              Vector<const pchar> pattern,
                              int start_index,
                              bool* complete) {
  ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
  int n = subject.length();
  int m = pattern.length();
  // Only preprocess at most kBMMaxShift last characters of pattern.
  int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
-  BoyerMoorePopulateBadCharTable(pattern, start);
+  int badness = -m;
-  int badness = -m;  // How bad we are doing without a good-suffix table.
+  // How bad we are doing without a good-suffix table.
  int idx;  // No matches found prior to this index.
  pchar last_char = pattern[m - 1];
  int last_char_shift = m - 1 - CharOccurrence<schar, pchar>(last_char);
@ -2294,13 +2309,12 @@ template <typename schar, typename pchar>
 static int BoyerMooreIndexOf(Vector<const schar> subject,
                             Vector<const pchar> pattern,
                             int idx) {
  ASSERT(algorithm <= BOYER_MOORE);
  int n = subject.length();
  int m = pattern.length();
  // Only preprocess at most kBMMaxShift last characters of pattern.
  int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
  // Build the Good Suffix table and continue searching.
  BoyerMoorePopulateGoodSuffixTable(pattern, start);
  pchar last_char = pattern[m - 1];
  // Continue search from i.
  while (idx <= n - m) {
@ -2336,9 +2350,9 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
 template <typename schar>
-static int SingleCharIndexOf(Vector<const schar> string,
+static inline int SingleCharIndexOf(Vector<const schar> string,
-                             schar pattern_char,
+                                    schar pattern_char,
-                             int start_index) {
+                                    int start_index) {
  for (int i = start_index, n = string.length(); i < n; i++) {
    if (pattern_char == string[i]) {
      return i;
@ -2376,10 +2390,10 @@ static int SimpleIndexOf(Vector<const schar> subject,
  // done enough work we decide it's probably worth switching to a better
  // algorithm.
  int badness = -10 - (pattern.length() << 2);
  // We know our pattern is at least 2 characters, we cache the first so
  // the common case of the first character not matching is faster.
  pchar pattern_first_char = pattern[0];
  for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {
    badness++;
    if (badness > 0) {
@ -2427,39 +2441,84 @@ static int SimpleIndexOf(Vector<const schar> subject,
 }
-// Dispatch to different algorithms.
+// Strategy for searching for a string in another string.
-template <typename schar, typename pchar>
+enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
 static int StringMatchStrategy(Vector<const schar> sub,
                               Vector<const pchar> pat,
                               int start_index) {
  ASSERT(pat.length() > 1);
 template <typename pchar>
 static inline StringSearchStrategy InitializeStringSearch(
    Vector<const pchar> pat, bool ascii_subject) {
  ASSERT(pat.length() > 1);
  // We have an ASCII haystack and a non-ASCII needle. Check if there
  // really is a non-ASCII character in the needle and bail out if there
  // is.
-  if (sizeof(schar) == 1 && sizeof(pchar) > 1) {
+  if (ascii_subject && sizeof(pchar) > 1) {
    for (int i = 0; i < pat.length(); i++) {
      uc16 c = pat[i];
      if (c > String::kMaxAsciiCharCode) {
-        return -1;
+        return SEARCH_FAIL;
      }
    }
  }
  if (pat.length() < kBMMinPatternLength) {
-    // We don't believe fancy searching can ever be more efficient.
+    return SEARCH_SHORT;
    // The max shift of Boyer-Moore on a pattern of this length does
    // not compensate for the overhead.
    return SimpleIndexOf(sub, pat, start_index);
  }
  algorithm = SIMPLE_SEARCH;
  return SEARCH_LONG;
 }
 // Dispatch long needle searches to different algorithms.
 template <typename schar, typename pchar>
 static int ComplexIndexOf(Vector<const schar> sub,
                          Vector<const pchar> pat,
                          int start_index) {
  ASSERT(pat.length() >= kBMMinPatternLength);
  // Try algorithms in order of increasing setup cost and expected performance.
  bool complete;
-  int idx = SimpleIndexOf(sub, pat, start_index, &complete);
+  int idx = start_index;
-  if (complete) return idx;
+  switch (algorithm) {
-  idx = BoyerMooreHorspool(sub, pat, idx, &complete);
+    case SIMPLE_SEARCH:
-  if (complete) return idx;
+      idx = SimpleIndexOf(sub, pat, idx, &complete);
-  return BoyerMooreIndexOf(sub, pat, idx);
+      if (complete) return idx;
      BoyerMoorePopulateBadCharTable(pat);
      algorithm = BOYER_MOORE_HORSPOOL;
      // FALLTHROUGH.
    case BOYER_MOORE_HORSPOOL:
      idx = BoyerMooreHorspool(sub, pat, idx, &complete);
      if (complete) return idx;
      // Build the Good Suffix table and continue searching.
      BoyerMoorePopulateGoodSuffixTable(pat);
      algorithm = BOYER_MOORE;
      // FALLTHROUGH.
    case BOYER_MOORE:
      return BoyerMooreIndexOf(sub, pat, idx);
  }
  UNREACHABLE();
  return -1;
 }
 // Dispatch to different search strategies for a single search.
 // If searching multiple times on the same needle, the search
 // strategy should only be computed once and then dispatch to different
 // loops.
 template <typename schar, typename pchar>
 static int StringSearch(Vector<const schar> sub,
                        Vector<const pchar> pat,
                        int start_index) {
  bool ascii_subject = (sizeof(schar) == 1);
  StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
  switch (strategy) {
    case SEARCH_FAIL: return -1;
    case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
    case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
  }
  UNREACHABLE();
  return -1;
 }
 // Perform string match of pattern on subject, starting at start index.
 // Caller must ensure that 0 <= start_index <= sub->length(),
 // and should check that pat->length() + start_index <= sub->length()
@ -2478,6 +2537,7 @@ int Runtime::StringMatch(Handle<String> sub,
  if (!sub->IsFlat()) {
    FlattenString(sub);
  }
  // Searching for one specific character is common.  For one
  // character patterns linear search is necessary, so any smart
  // algorithm is unnecessary overhead.
@ -2511,15 +2571,15 @@ int Runtime::StringMatch(Handle<String> sub,
  if (pat->IsAsciiRepresentation()) {
    Vector<const char> pat_vector = pat->ToAsciiVector();
    if (sub->IsAsciiRepresentation()) {
-      return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);
+      return StringSearch(sub->ToAsciiVector(), pat_vector, start_index);
    }
-    return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);
+    return StringSearch(sub->ToUC16Vector(), pat_vector, start_index);
  }
  Vector<const uc16> pat_vector = pat->ToUC16Vector();
  if (sub->IsAsciiRepresentation()) {
-    return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);
+    return StringSearch(sub->ToAsciiVector(), pat_vector, start_index);
  }
-  return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);
+  return StringSearch(sub->ToUC16Vector(), pat_vector, start_index);
 }
@ -4274,6 +4334,169 @@ static Object* Runtime_StringTrim(Arguments args) {
 }
 template <typename schar, typename pchar>
 void FindStringIndices(Vector<const schar> subject,
                       Vector<const pchar> pattern,
                       ZoneList<int>* indices,
                       unsigned int limit) {
  ASSERT(limit > 0);
  // Collect indices of pattern in subject, and the end-of-string index.
  // Stop after finding at most limit values.
  StringSearchStrategy strategy =
      InitializeStringSearch(pattern, sizeof(schar) == 1);
  switch (strategy) {
    case SEARCH_FAIL: return;
    case SEARCH_SHORT: {
      int pattern_length = pattern.length();
      int index = 0;
      while (limit > 0) {
        index = SimpleIndexOf(subject, pattern, index);
        if (index < 0) return;
        indices->Add(index);
        index += pattern_length;
        limit--;
      }
      return;
    }
    case SEARCH_LONG: {
      int pattern_length = pattern.length();
      int index = 0;
      while (limit > 0) {
        index = ComplexIndexOf(subject, pattern, index);
        if (index < 0) return;
        indices->Add(index);
        index += pattern_length;
        limit--;
      }
      return;
    }
    default:
      UNREACHABLE();
      return;
  }
 }
 template <typename schar>
 void inline FindCharIndices(Vector<const schar> subject,
                            const schar pattern_char,
                            ZoneList<int>* indices,
                            unsigned int limit) {
  // Collect indices of pattern_char in subject, and the end-of-string index.
  // Stop after finding at most limit values.
  int index = 0;
  while (limit > 0) {
    index = SingleCharIndexOf(subject, pattern_char, index);
    if (index < 0) return;
    indices->Add(index);
    index++;
    limit--;
  }
 }
 static Object* Runtime_StringSplit(Arguments args) {
  ASSERT(args.length() == 3);
  HandleScope handle_scope;
  CONVERT_ARG_CHECKED(String, subject, 0);
  CONVERT_ARG_CHECKED(String, pattern, 1);
  CONVERT_NUMBER_CHECKED(uint32_t, limit, Uint32, args[2]);
  int subject_length = subject->length();
  int pattern_length = pattern->length();
  RUNTIME_ASSERT(pattern_length > 0);
  // The limit can be very large (0xffffffffu), but since the pattern
  // isn't empty, we can never create more parts than ~half the length
  // of the subject.
  if (!subject->IsFlat()) FlattenString(subject);
  static const int kMaxInitialListCapacity = 16;
  ZoneScope scope(DELETE_ON_EXIT);
  // Find (up to limit) indices of separator and end-of-string in subject
  int initial_capacity = Min<uint32_t>(kMaxInitialListCapacity, limit);
  ZoneList<int> indices(initial_capacity);
  if (pattern_length == 1) {
    // Special case, go directly to fast single-character split.
    AssertNoAllocation nogc;
    uc16 pattern_char = pattern->Get(0);
    if (subject->IsTwoByteRepresentation()) {
      FindCharIndices(subject->ToUC16Vector(), pattern_char,
                      &indices,
                      limit);
    } else if (pattern_char <= String::kMaxAsciiCharCode) {
      FindCharIndices(subject->ToAsciiVector(),
                      static_cast<char>(pattern_char),
                      &indices,
                      limit);
    }
  } else {
    if (!pattern->IsFlat()) FlattenString(pattern);
    AssertNoAllocation nogc;
    if (subject->IsAsciiRepresentation()) {
      Vector<const char> subject_vector = subject->ToAsciiVector();
      if (pattern->IsAsciiRepresentation()) {
        FindStringIndices(subject_vector,
                          pattern->ToAsciiVector(),
                          &indices,
                          limit);
      } else {
        FindStringIndices(subject_vector,
                          pattern->ToUC16Vector(),
                          &indices,
                          limit);
      }
    } else {
      Vector<const uc16> subject_vector = subject->ToUC16Vector();
      if (pattern->IsAsciiRepresentation()) {
        FindStringIndices(subject_vector,
                          pattern->ToAsciiVector(),
                          &indices,
                          limit);
      } else {
        FindStringIndices(subject_vector,
                          pattern->ToUC16Vector(),
                          &indices,
                          limit);
      }
    }
  }
  if (static_cast<uint32_t>(indices.length()) < limit) {
    indices.Add(subject_length);
  }
  // The list indices now contains the end of each part to create.
  // Create JSArray of substrings separated by separator.
  int part_count = indices.length();
  Handle<JSArray> result = Factory::NewJSArray(part_count);
  result->set_length(Smi::FromInt(part_count));
  ASSERT(result->HasFastElements());
  if (part_count == 1 && indices.at(0) == subject_length) {
    FixedArray::cast(result->elements())->set(0, *subject);
    return *result;
  }
  Handle<FixedArray> elements(FixedArray::cast(result->elements()));
  int part_start = 0;
  for (int i = 0; i < part_count; i++) {
    HandleScope local_loop_handle;
    int part_end = indices.at(i);
    Handle<String> substring =
        Factory::NewSubString(subject, part_start, part_end);
    elements->set(i, *substring);
    part_start = part_end + pattern_length;
  }
  return *result;
 }
 // Copies ascii characters to the given fixed array looking up
 // one-char strings in the cache. Gives up on the first char that is
 // not in the cache and fills the remainder with smi zeros. Returns
--- a/src/runtime.h
+++ b/src/runtime.h
@ -93,6 +93,7 @@ namespace internal {
  F(StringParseFloat, 1, 1) \
  F(StringToLowerCase, 1, 1) \
  F(StringToUpperCase, 1, 1) \
  F(StringSplit, 3, 1) \
  F(CharFromCode, 1, 1) \
  F(URIEscape, 1, 1) \
  F(URIUnescape, 1, 1) \
--- a/src/string.js
+++ b/src/string.js
@ -557,7 +557,7 @@ function StringSplit(separator, limit) {
  // ECMA-262 says that if separator is undefined, the result should
  // be an array of size 1 containing the entire string.  SpiderMonkey
-  // and KJS have this behaviour only when no separator is given.  If
+  // and KJS have this behavior only when no separator is given.  If
  // undefined is explicitly given, they convert it to a string and
  // use that.  We do as SpiderMonkey and KJS.
  if (%_ArgumentsLength() === 0) {
@ -572,18 +572,7 @@ function StringSplit(separator, limit) {
    // If the separator string is empty then return the elements in the subject.
    if (separator_length === 0) return %StringToArray(subject);
-    var result = [];
+    var result = %StringSplit(subject, separator, limit);
    var start_index = 0;
    var index;
    while (true) {
      if (start_index + separator_length > length ||
          (index = %StringIndexOf(subject, separator, start_index)) === -1) {
        result.push(SubString(subject, start_index, length));
        break;
      }
      if (result.push(SubString(subject, start_index, index)) === limit) break;
      start_index = index + separator_length;
    }
    return result;
  }