From c7c7b8b0e75333cff585b1cdecca027e13b35c2e Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Tue, 7 Oct 2008 08:11:44 +0000 Subject: [PATCH] Fast direct-access version of KPM string match. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@452 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/jsregexp.cc | 11 ++- src/objects-inl.h | 5 + src/objects.h | 3 + src/runtime.cc | 242 +++++++++++++++++++++++++++++++++------------- src/runtime.h | 2 +- 5 files changed, 191 insertions(+), 72 deletions(-) diff --git a/src/jsregexp.cc b/src/jsregexp.cc index 0e4dee19de..0ec01ed802 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -218,7 +218,7 @@ Handle RegExpImpl::AtomExec(Handle re, } LOG(RegExpExecEvent(re, start_index, subject)); - int value = Runtime::StringMatchKmp(*subject, *needle, start_index); + int value = Runtime::StringMatchKmp(subject, needle, start_index); if (value == -1) return Factory::null_value(); Handle result = Factory::NewJSArray(2); SetElement(result, 0, Handle(Smi::FromInt(value))); @@ -231,13 +231,16 @@ Handle RegExpImpl::AtomExecGlobal(Handle re, Handle subject) { Handle needle(String::cast(re->data())); Handle result = Factory::NewJSArray(1); - bool keep_going = true; int index = 0; int match_count = 0; + int subject_length = subject->length(); int needle_length = needle->length(); - while (keep_going) { + while (true) { LOG(RegExpExecEvent(re, index, subject)); - int value = Runtime::StringMatchKmp(*subject, *needle, index); + int value = -1; + if (index + needle_length <= subject_length) { + value = Runtime::StringMatchKmp(subject, needle, index); + } if (value == -1) break; HandleScope scope; int end = value + needle_length; diff --git a/src/objects-inl.h b/src/objects-inl.h index 818454bc71..decaf392f6 100644 --- a/src/objects-inl.h +++ b/src/objects-inl.h @@ -1354,6 +1354,11 @@ Address AsciiString::GetCharsAddress() { } +Address TwoByteString::GetCharsAddress() { + return FIELD_ADDR(this, kHeaderSize); +} + + uint16_t TwoByteString::TwoByteStringGet(int index) { ASSERT(index >= 0 && index < length()); return READ_SHORT_FIELD(this, kHeaderSize + index * kShortSize); diff --git a/src/objects.h b/src/objects.h index e34b5dce4d..d718470bab 100644 --- a/src/objects.h +++ b/src/objects.h @@ -3138,6 +3138,9 @@ class TwoByteString: public SeqString { inline uint16_t TwoByteStringGet(int index); inline void TwoByteStringSet(int index, uint16_t value); + // Get the address of the characters in this string. + inline Address GetCharsAddress(); + // For regexp code. const uint16_t* TwoByteStringGetData(unsigned start); diff --git a/src/runtime.cc b/src/runtime.cc index 52e1d671f5..be1dbcddd3 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -42,6 +42,7 @@ #include "runtime.h" #include "scopeinfo.h" #include "v8threads.h" +#include "smart-pointer.h" namespace v8 { namespace internal { @@ -927,110 +928,217 @@ static Object* Runtime_CharFromCode(Arguments args) { } -static inline void ComputeKMPNextTable(String* pattern, int next_table[]) { +static Vector ToAsciiVector(String *string) { + ASSERT(string->IsAscii()); + ASSERT(string->IsFlat()); + + int offset = 0; + int length = string->length(); + StringRepresentationTag string_tag = string->representation_tag(); + if (string_tag == kSlicedStringTag) { + SlicedString* sliced = SlicedString::cast(string); + offset += sliced->start(); + string = String::cast(sliced->buffer()); + string_tag = string->representation_tag(); + } else if (string_tag == kConsStringTag) { + ConsString* cons = ConsString::cast(string); + ASSERT(String::cast(cons->second())->length() == 0); + string = String::cast(cons->first()); + string_tag = string->representation_tag(); + } + if (string_tag == kSeqStringTag) { + AsciiString* seq = AsciiString::cast(string); + char* start = reinterpret_cast(seq->GetCharsAddress()); + return Vector(start + offset, length); + } + ASSERT(string_tag == kExternalStringTag); + ExternalAsciiString* ext = ExternalAsciiString::cast(string); + const char* start = ext->resource()->data(); + return Vector(start + offset, length); +} + + +static Vector ToUC16Vector(String *string) { + ASSERT(string->IsTwoByteString()); + ASSERT(string->IsFlat()); + + int offset = 0; + int length = string->length(); + + StringRepresentationTag string_tag = string->representation_tag(); + if (string_tag == kSlicedStringTag) { + SlicedString* sliced = SlicedString::cast(string); + offset += sliced->start(); + string = String::cast(sliced->buffer()); + string_tag = string->representation_tag(); + } else if (string_tag == kConsStringTag) { + ConsString* cons = ConsString::cast(string); + ASSERT(String::cast(cons->second())->length() == 0); + string = String::cast(cons->first()); + string_tag = string->representation_tag(); + } + if (string_tag == kSeqStringTag) { + TwoByteString* seq = TwoByteString::cast(string); + uc16* start = reinterpret_cast(seq->GetCharsAddress()); + return Vector(start + offset, length); + } + ASSERT(string_tag == kExternalStringTag); + ExternalTwoByteString* ext = ExternalTwoByteString::cast(string); + const uc16* start = + reinterpret_cast(ext->resource()->data()); + return Vector(start + offset, length); +} + + +template +static int SingleCharIndexOf(Vector string, + pchar pattern_char, + int start_index) { + for (int i = start_index, n = string.length(); i < n; i++) { + if (pattern_char == string[i]) { + return i; + } + } + return -1; +} + +// Trivial string search for shorter strings. +template +static int SimpleIndexOf(Vector subject, + Vector pattern, + int start_index) { + int pattern_length = pattern.length(); + int subject_length = subject.length(); + // We know our pattern is at least 2 characters, we cache the first so + // the common case of the first character not matching is faster. + pchar pattern_first_char = pattern[0]; + for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) { + if (subject[i] != pattern_first_char) continue; + + bool failure = false; + for (int j = 1; j < pattern_length; j++) { + if (pattern[j] != subject[j+i]) { + failure = true; + break; + } + } + if (!failure) { + return i; + } + } + return -1; +} + +// Full KMP pattern match. +template // Pattern & subject char types +static int KMPIndexOf(Vector subject, + Vector pattern, + int start_index) { + int subject_length = subject.length(); + int pattern_length = pattern.length(); + SmartPointer next_table(NewArray(pattern_length)); + + // Compute KMP "next" table int i = 0; int j = -1; next_table[0] = -1; - Access buffer(&string_input_buffer); - buffer->Reset(pattern); - int length = pattern->length(); - uint16_t p = buffer->GetNext(); - while (i < length - 1) { - while (j > -1 && p != pattern->Get(j)) { + pchar p = pattern[0]; + while (i < pattern_length - 1) { + while (j > -1 && p != pattern[j]) { j = next_table[j]; } i++; j++; - p = buffer->GetNext(); - if (p == pattern->Get(j)) { + p = pattern[i]; + if (p == pattern[j]) { next_table[i] = next_table[j]; } else { next_table[i] = j; } } -} - -int Runtime::StringMatchKmp(String* sub, String* pat, int start_index) { - sub->TryFlatten(); - pat->TryFlatten(); - - int subject_length = sub->length(); - int pattern_length = pat->length(); - - if (start_index > subject_length) return -1; - if (pattern_length == 0) return start_index; - - // Searching for one specific character is common. For one - // character patterns the KMP algorithm is guaranteed to slow down - // the search, so we just run through the subject string. - if (pattern_length == 1) { - uint16_t pattern_char = pat->Get(0); - for (int i = start_index; i < subject_length; i++) { - if (sub->Get(i) == pattern_char) { - return i; - } - } - return -1; - } - - // For small searches, KMP is not worth the setup overhead. - if (subject_length < 100) { - // We know our pattern is at least 2 characters, we cache the first so - // the common case of the first character not matching is faster. - uint16_t pattern_first_char = pat->Get(0); - for (int i = start_index; i + pattern_length <= subject_length; i++) { - if (sub->Get(i) != pattern_first_char) continue; - - for (int j = 1; j < pattern_length; j++) { - if (pat->Get(j) != sub->Get(j + i)) break; - if (j == pattern_length - 1) return i; - } - } - return -1; - } - - // For patterns with a larger length we use the KMP algorithm. - // - // Compute the 'next' table. - int* next_table = NewArray(pattern_length); - ComputeKMPNextTable(pat, next_table); // Search using the 'next' table. int pattern_index = 0; - // We would like to use StringInputBuffer here, but it does not have - // the ability to start anywhere but the first character of a - // string. It would be nice to have efficient forward-seeking - // support on StringInputBuffers. int subject_index = start_index; while (subject_index < subject_length) { - uint16_t subject_char = sub->Get(subject_index); - while (pattern_index > -1 && pat->Get(pattern_index) != subject_char) { + schar subject_char = subject[subject_index]; + while (pattern_index > -1 && pattern[pattern_index] != subject_char) { pattern_index = next_table[pattern_index]; } pattern_index++; subject_index++; if (pattern_index >= pattern_length) { - DeleteArray(next_table); return subject_index - pattern_index; } } - DeleteArray(next_table); return -1; } +// Dispatch to different algorithms for different length of pattern/subject +template +static int StringMatchKMP(Vector sub, + Vector pat, + int start_index) { + // Searching for one specific character is common. For one + // character patterns the KMP algorithm is guaranteed to slow down + // the search, so we just run through the subject string. + if (pat.length() == 1) { + return SingleCharIndexOf(sub, pat[0], start_index); + } + + // For small searches, KMP is not worth the setup overhead. + if (sub.length() - start_index < 100) { + return SimpleIndexOf(sub, pat, start_index); + } + + // For patterns with a larger length we use the KMP algorithm. + return KMPIndexOf(sub, pat, start_index); +} + +// Perform string match of pattern on subject, starting at start index. +// Caller must ensure that 0 <= start_index <= sub->length(), +// and should check that pat->length() + start_index <= sub->length() +int Runtime::StringMatchKmp(Handle sub, + Handle pat, + int start_index) { + ASSERT(0 <= start_index); + ASSERT(start_index <= sub->length()); + + if (pat->length() == 0) return start_index; + FlattenString(sub); + FlattenString(pat); + + AssertNoAllocation no_heap_allocation; // ensure vectors stay valid + // dispatch on type of strings + if (pat->is_ascii()) { + Vector pat_vector = ToAsciiVector(*pat); + if (sub->is_ascii()) { + return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index); + } + return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index); + } + Vector pat_vector = ToUC16Vector(*pat); + if (sub->is_ascii()) { + return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index); + } + return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index); +} + static Object* Runtime_StringIndexOf(Arguments args) { - NoHandleAllocation ha; + HandleScope scope; // create a new handle scope ASSERT(args.length() == 3); - CONVERT_CHECKED(String, sub, args[0]); - CONVERT_CHECKED(String, pat, args[1]); + CONVERT_ARG_CHECKED(String, sub, 0); + CONVERT_ARG_CHECKED(String, pat, 1); + Object* index = args[2]; uint32_t start_index; if (!Array::IndexFromObject(index, &start_index)) return Smi::FromInt(-1); - return Smi::FromInt(Runtime::StringMatchKmp(sub, pat, start_index)); + int position = Runtime::StringMatchKmp(sub, pat, start_index); + return Smi::FromInt(position); } diff --git a/src/runtime.h b/src/runtime.h index d47003197b..23b4b1709f 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -332,7 +332,7 @@ class Runtime : public AllStatic { // Get the runtime function with the given name. static Function* FunctionForName(const char* name); - static int StringMatchKmp(String* sub, String* pat, int index); + static int StringMatchKmp(Handle sub, Handle pat, int index); // TODO(1240886): The following three methods are *not* handle safe, // but accept handle arguments. This seems fragile.