Fast direct-access version of KPM string match.
git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@452 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
f7174fd151
commit
c7c7b8b0e7
@ -218,7 +218,7 @@ Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
|
||||
}
|
||||
|
||||
LOG(RegExpExecEvent(re, start_index, subject));
|
||||
int value = Runtime::StringMatchKmp(*subject, *needle, start_index);
|
||||
int value = Runtime::StringMatchKmp(subject, needle, start_index);
|
||||
if (value == -1) return Factory::null_value();
|
||||
Handle<JSArray> result = Factory::NewJSArray(2);
|
||||
SetElement(result, 0, Handle<Smi>(Smi::FromInt(value)));
|
||||
@ -231,13 +231,16 @@ Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
|
||||
Handle<String> subject) {
|
||||
Handle<String> needle(String::cast(re->data()));
|
||||
Handle<JSArray> result = Factory::NewJSArray(1);
|
||||
bool keep_going = true;
|
||||
int index = 0;
|
||||
int match_count = 0;
|
||||
int subject_length = subject->length();
|
||||
int needle_length = needle->length();
|
||||
while (keep_going) {
|
||||
while (true) {
|
||||
LOG(RegExpExecEvent(re, index, subject));
|
||||
int value = Runtime::StringMatchKmp(*subject, *needle, index);
|
||||
int value = -1;
|
||||
if (index + needle_length <= subject_length) {
|
||||
value = Runtime::StringMatchKmp(subject, needle, index);
|
||||
}
|
||||
if (value == -1) break;
|
||||
HandleScope scope;
|
||||
int end = value + needle_length;
|
||||
|
@ -1354,6 +1354,11 @@ Address AsciiString::GetCharsAddress() {
|
||||
}
|
||||
|
||||
|
||||
Address TwoByteString::GetCharsAddress() {
|
||||
return FIELD_ADDR(this, kHeaderSize);
|
||||
}
|
||||
|
||||
|
||||
uint16_t TwoByteString::TwoByteStringGet(int index) {
|
||||
ASSERT(index >= 0 && index < length());
|
||||
return READ_SHORT_FIELD(this, kHeaderSize + index * kShortSize);
|
||||
|
@ -3138,6 +3138,9 @@ class TwoByteString: public SeqString {
|
||||
inline uint16_t TwoByteStringGet(int index);
|
||||
inline void TwoByteStringSet(int index, uint16_t value);
|
||||
|
||||
// Get the address of the characters in this string.
|
||||
inline Address GetCharsAddress();
|
||||
|
||||
// For regexp code.
|
||||
const uint16_t* TwoByteStringGetData(unsigned start);
|
||||
|
||||
|
242
src/runtime.cc
242
src/runtime.cc
@ -42,6 +42,7 @@
|
||||
#include "runtime.h"
|
||||
#include "scopeinfo.h"
|
||||
#include "v8threads.h"
|
||||
#include "smart-pointer.h"
|
||||
|
||||
namespace v8 { namespace internal {
|
||||
|
||||
@ -927,110 +928,217 @@ static Object* Runtime_CharFromCode(Arguments args) {
|
||||
}
|
||||
|
||||
|
||||
static inline void ComputeKMPNextTable(String* pattern, int next_table[]) {
|
||||
static Vector<const char> ToAsciiVector(String *string) {
|
||||
ASSERT(string->IsAscii());
|
||||
ASSERT(string->IsFlat());
|
||||
|
||||
int offset = 0;
|
||||
int length = string->length();
|
||||
StringRepresentationTag string_tag = string->representation_tag();
|
||||
if (string_tag == kSlicedStringTag) {
|
||||
SlicedString* sliced = SlicedString::cast(string);
|
||||
offset += sliced->start();
|
||||
string = String::cast(sliced->buffer());
|
||||
string_tag = string->representation_tag();
|
||||
} else if (string_tag == kConsStringTag) {
|
||||
ConsString* cons = ConsString::cast(string);
|
||||
ASSERT(String::cast(cons->second())->length() == 0);
|
||||
string = String::cast(cons->first());
|
||||
string_tag = string->representation_tag();
|
||||
}
|
||||
if (string_tag == kSeqStringTag) {
|
||||
AsciiString* seq = AsciiString::cast(string);
|
||||
char* start = reinterpret_cast<char*>(seq->GetCharsAddress());
|
||||
return Vector<const char>(start + offset, length);
|
||||
}
|
||||
ASSERT(string_tag == kExternalStringTag);
|
||||
ExternalAsciiString* ext = ExternalAsciiString::cast(string);
|
||||
const char* start = ext->resource()->data();
|
||||
return Vector<const char>(start + offset, length);
|
||||
}
|
||||
|
||||
|
||||
static Vector<const uc16> ToUC16Vector(String *string) {
|
||||
ASSERT(string->IsTwoByteString());
|
||||
ASSERT(string->IsFlat());
|
||||
|
||||
int offset = 0;
|
||||
int length = string->length();
|
||||
|
||||
StringRepresentationTag string_tag = string->representation_tag();
|
||||
if (string_tag == kSlicedStringTag) {
|
||||
SlicedString* sliced = SlicedString::cast(string);
|
||||
offset += sliced->start();
|
||||
string = String::cast(sliced->buffer());
|
||||
string_tag = string->representation_tag();
|
||||
} else if (string_tag == kConsStringTag) {
|
||||
ConsString* cons = ConsString::cast(string);
|
||||
ASSERT(String::cast(cons->second())->length() == 0);
|
||||
string = String::cast(cons->first());
|
||||
string_tag = string->representation_tag();
|
||||
}
|
||||
if (string_tag == kSeqStringTag) {
|
||||
TwoByteString* seq = TwoByteString::cast(string);
|
||||
uc16* start = reinterpret_cast<uc16*>(seq->GetCharsAddress());
|
||||
return Vector<const uc16>(start + offset, length);
|
||||
}
|
||||
ASSERT(string_tag == kExternalStringTag);
|
||||
ExternalTwoByteString* ext = ExternalTwoByteString::cast(string);
|
||||
const uc16* start =
|
||||
reinterpret_cast<const uc16*>(ext->resource()->data());
|
||||
return Vector<const uc16>(start + offset, length);
|
||||
}
|
||||
|
||||
|
||||
template <typename schar, typename pchar>
|
||||
static int SingleCharIndexOf(Vector<const schar> string,
|
||||
pchar pattern_char,
|
||||
int start_index) {
|
||||
for (int i = start_index, n = string.length(); i < n; i++) {
|
||||
if (pattern_char == string[i]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Trivial string search for shorter strings.
|
||||
template <typename pchar, typename schar>
|
||||
static int SimpleIndexOf(Vector<const schar> subject,
|
||||
Vector<const pchar> pattern,
|
||||
int start_index) {
|
||||
int pattern_length = pattern.length();
|
||||
int subject_length = subject.length();
|
||||
// We know our pattern is at least 2 characters, we cache the first so
|
||||
// the common case of the first character not matching is faster.
|
||||
pchar pattern_first_char = pattern[0];
|
||||
for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
|
||||
if (subject[i] != pattern_first_char) continue;
|
||||
|
||||
bool failure = false;
|
||||
for (int j = 1; j < pattern_length; j++) {
|
||||
if (pattern[j] != subject[j+i]) {
|
||||
failure = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!failure) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Full KMP pattern match.
|
||||
template <typename schar, typename pchar> // Pattern & subject char types
|
||||
static int KMPIndexOf(Vector<const schar> subject,
|
||||
Vector<const pchar> pattern,
|
||||
int start_index) {
|
||||
int subject_length = subject.length();
|
||||
int pattern_length = pattern.length();
|
||||
SmartPointer<int> next_table(NewArray<int>(pattern_length));
|
||||
|
||||
// Compute KMP "next" table
|
||||
int i = 0;
|
||||
int j = -1;
|
||||
next_table[0] = -1;
|
||||
|
||||
Access<StringInputBuffer> buffer(&string_input_buffer);
|
||||
buffer->Reset(pattern);
|
||||
int length = pattern->length();
|
||||
uint16_t p = buffer->GetNext();
|
||||
while (i < length - 1) {
|
||||
while (j > -1 && p != pattern->Get(j)) {
|
||||
pchar p = pattern[0];
|
||||
while (i < pattern_length - 1) {
|
||||
while (j > -1 && p != pattern[j]) {
|
||||
j = next_table[j];
|
||||
}
|
||||
i++;
|
||||
j++;
|
||||
p = buffer->GetNext();
|
||||
if (p == pattern->Get(j)) {
|
||||
p = pattern[i];
|
||||
if (p == pattern[j]) {
|
||||
next_table[i] = next_table[j];
|
||||
} else {
|
||||
next_table[i] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int Runtime::StringMatchKmp(String* sub, String* pat, int start_index) {
|
||||
sub->TryFlatten();
|
||||
pat->TryFlatten();
|
||||
|
||||
int subject_length = sub->length();
|
||||
int pattern_length = pat->length();
|
||||
|
||||
if (start_index > subject_length) return -1;
|
||||
if (pattern_length == 0) return start_index;
|
||||
|
||||
// Searching for one specific character is common. For one
|
||||
// character patterns the KMP algorithm is guaranteed to slow down
|
||||
// the search, so we just run through the subject string.
|
||||
if (pattern_length == 1) {
|
||||
uint16_t pattern_char = pat->Get(0);
|
||||
for (int i = start_index; i < subject_length; i++) {
|
||||
if (sub->Get(i) == pattern_char) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// For small searches, KMP is not worth the setup overhead.
|
||||
if (subject_length < 100) {
|
||||
// We know our pattern is at least 2 characters, we cache the first so
|
||||
// the common case of the first character not matching is faster.
|
||||
uint16_t pattern_first_char = pat->Get(0);
|
||||
for (int i = start_index; i + pattern_length <= subject_length; i++) {
|
||||
if (sub->Get(i) != pattern_first_char) continue;
|
||||
|
||||
for (int j = 1; j < pattern_length; j++) {
|
||||
if (pat->Get(j) != sub->Get(j + i)) break;
|
||||
if (j == pattern_length - 1) return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// For patterns with a larger length we use the KMP algorithm.
|
||||
//
|
||||
// Compute the 'next' table.
|
||||
int* next_table = NewArray<int>(pattern_length);
|
||||
ComputeKMPNextTable(pat, next_table);
|
||||
// Search using the 'next' table.
|
||||
int pattern_index = 0;
|
||||
// We would like to use StringInputBuffer here, but it does not have
|
||||
// the ability to start anywhere but the first character of a
|
||||
// string. It would be nice to have efficient forward-seeking
|
||||
// support on StringInputBuffers.
|
||||
int subject_index = start_index;
|
||||
while (subject_index < subject_length) {
|
||||
uint16_t subject_char = sub->Get(subject_index);
|
||||
while (pattern_index > -1 && pat->Get(pattern_index) != subject_char) {
|
||||
schar subject_char = subject[subject_index];
|
||||
while (pattern_index > -1 && pattern[pattern_index] != subject_char) {
|
||||
pattern_index = next_table[pattern_index];
|
||||
}
|
||||
pattern_index++;
|
||||
subject_index++;
|
||||
if (pattern_index >= pattern_length) {
|
||||
DeleteArray(next_table);
|
||||
return subject_index - pattern_index;
|
||||
}
|
||||
}
|
||||
DeleteArray(next_table);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Dispatch to different algorithms for different length of pattern/subject
|
||||
template <typename schar, typename pchar>
|
||||
static int StringMatchKMP(Vector<const schar> sub,
|
||||
Vector<const pchar> pat,
|
||||
int start_index) {
|
||||
// Searching for one specific character is common. For one
|
||||
// character patterns the KMP algorithm is guaranteed to slow down
|
||||
// the search, so we just run through the subject string.
|
||||
if (pat.length() == 1) {
|
||||
return SingleCharIndexOf(sub, pat[0], start_index);
|
||||
}
|
||||
|
||||
// For small searches, KMP is not worth the setup overhead.
|
||||
if (sub.length() - start_index < 100) {
|
||||
return SimpleIndexOf(sub, pat, start_index);
|
||||
}
|
||||
|
||||
// For patterns with a larger length we use the KMP algorithm.
|
||||
return KMPIndexOf(sub, pat, start_index);
|
||||
}
|
||||
|
||||
// Perform string match of pattern on subject, starting at start index.
|
||||
// Caller must ensure that 0 <= start_index <= sub->length(),
|
||||
// and should check that pat->length() + start_index <= sub->length()
|
||||
int Runtime::StringMatchKmp(Handle<String> sub,
|
||||
Handle<String> pat,
|
||||
int start_index) {
|
||||
ASSERT(0 <= start_index);
|
||||
ASSERT(start_index <= sub->length());
|
||||
|
||||
if (pat->length() == 0) return start_index;
|
||||
FlattenString(sub);
|
||||
FlattenString(pat);
|
||||
|
||||
AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
|
||||
// dispatch on type of strings
|
||||
if (pat->is_ascii()) {
|
||||
Vector<const char> pat_vector = ToAsciiVector(*pat);
|
||||
if (sub->is_ascii()) {
|
||||
return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
|
||||
}
|
||||
return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
|
||||
}
|
||||
Vector<const uc16> pat_vector = ToUC16Vector(*pat);
|
||||
if (sub->is_ascii()) {
|
||||
return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
|
||||
}
|
||||
return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
|
||||
}
|
||||
|
||||
|
||||
static Object* Runtime_StringIndexOf(Arguments args) {
|
||||
NoHandleAllocation ha;
|
||||
HandleScope scope; // create a new handle scope
|
||||
ASSERT(args.length() == 3);
|
||||
|
||||
CONVERT_CHECKED(String, sub, args[0]);
|
||||
CONVERT_CHECKED(String, pat, args[1]);
|
||||
CONVERT_ARG_CHECKED(String, sub, 0);
|
||||
CONVERT_ARG_CHECKED(String, pat, 1);
|
||||
|
||||
Object* index = args[2];
|
||||
uint32_t start_index;
|
||||
if (!Array::IndexFromObject(index, &start_index)) return Smi::FromInt(-1);
|
||||
|
||||
return Smi::FromInt(Runtime::StringMatchKmp(sub, pat, start_index));
|
||||
int position = Runtime::StringMatchKmp(sub, pat, start_index);
|
||||
return Smi::FromInt(position);
|
||||
}
|
||||
|
||||
|
||||
|
@ -332,7 +332,7 @@ class Runtime : public AllStatic {
|
||||
// Get the runtime function with the given name.
|
||||
static Function* FunctionForName(const char* name);
|
||||
|
||||
static int StringMatchKmp(String* sub, String* pat, int index);
|
||||
static int StringMatchKmp(Handle<String> sub, Handle<String> pat, int index);
|
||||
|
||||
// TODO(1240886): The following three methods are *not* handle safe,
|
||||
// but accept handle arguments. This seems fragile.
|
||||
|
Loading…
Reference in New Issue
Block a user