v8/src/jsregexp.cc

577 lines
19 KiB
C++
Raw Normal View History

// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "v8.h"
#include "execution.h"
#include "factory.h"
#include "jsregexp.h"
#include "platform.h"
#include "runtime.h"
#include "top.h"
#include "compilation-cache.h"
// Including pcre.h undefines DEBUG to avoid getting debug output from
// the JSCRE implementation. Make sure to redefine it in debug mode
// after having included the header file.
#ifdef DEBUG
#include "third_party/jscre/pcre.h"
#define DEBUG
#else
#include "third_party/jscre/pcre.h"
#endif
namespace v8 { namespace internal {
#define CAPTURE_INDEX 0
#define INTERNAL_INDEX 1
static Failure* malloc_failure;
static void* JSREMalloc(size_t size) {
Object* obj = Heap::AllocateByteArray(size);
// If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
// will return NULL to the caller, performs GC there.
// Also pass failure information to the caller.
if (obj->IsFailure()) {
malloc_failure = Failure::cast(obj);
return NULL;
}
// Note: object is unrooted, the caller of jsRegExpCompile must
// create a handle for the return value before doing heap allocation.
return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
}
static void JSREFree(void* p) {
USE(p); // Do nothing, memory is garbage collected.
}
String* RegExpImpl::last_ascii_string_ = NULL;
String* RegExpImpl::two_byte_cached_string_ = NULL;
void RegExpImpl::NewSpaceCollectionPrologue() {
// The two byte string is always in the old space. The Ascii string may be
// in either place. If it is in the old space we don't need to do anything.
if (Heap::InNewSpace(last_ascii_string_)) {
// Invalidate the cache.
last_ascii_string_ = NULL;
two_byte_cached_string_ = NULL;
}
}
void RegExpImpl::OldSpaceCollectionPrologue() {
last_ascii_string_ = NULL;
two_byte_cached_string_ = NULL;
}
Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
Handle<String> pattern,
Handle<String> flags,
bool* has_pending_exception) {
// Ensure that the constructor function has been loaded.
if (!constructor->IsLoaded()) {
LoadLazy(constructor, has_pending_exception);
if (*has_pending_exception) return Handle<Object>();
}
// Call the construct code with 2 arguments.
Object** argv[2] = { Handle<Object>::cast(pattern).location(),
Handle<Object>::cast(flags).location() };
return Execution::New(constructor, 2, argv, has_pending_exception);
}
// Converts a source string to a 16 bit flat string or a SlicedString containing
// a 16 bit flat string).
Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
if (*subject == last_ascii_string_) {
ASSERT(two_byte_cached_string_ != NULL);
return Handle<String>(String::cast(two_byte_cached_string_));
}
Handle<String> two_byte_string = StringToTwoByte(subject);
last_ascii_string_ = *subject;
two_byte_cached_string_ = *two_byte_string;
return two_byte_string;
}
// Converts a source string to a 16 bit flat string or a SlicedString containing
// a 16 bit flat string).
Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
StringShape shape(*pattern);
if (!pattern->IsFlat(shape)) {
FlattenString(pattern);
shape = StringShape(*pattern);
}
Handle<String> flat_string(shape.IsCons() ?
String::cast(ConsString::cast(*pattern)->first()) :
*pattern);
ASSERT(flat_string->IsString());
StringShape flat_shape(*flat_string);
ASSERT(!flat_shape.IsCons());
ASSERT(flat_shape.IsSequential() ||
flat_shape.IsSliced() ||
flat_shape.IsExternal());
if (!flat_shape.IsAsciiRepresentation()) {
return flat_string;
}
int len = flat_string->length(flat_shape);
Handle<String> two_byte_string =
Factory::NewRawTwoByteString(len, TENURED);
uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
return two_byte_string;
}
static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
int flags = JSRegExp::NONE;
StringShape shape(*str);
for (int i = 0; i < str->length(shape); i++) {
switch (str->Get(shape, i)) {
case 'i':
flags |= JSRegExp::IGNORE_CASE;
break;
case 'g':
flags |= JSRegExp::GLOBAL;
break;
case 'm':
flags |= JSRegExp::MULTILINE;
break;
}
}
return JSRegExp::Flags(flags);
}
unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char;
Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
Handle<String> pattern,
Handle<String> flag_str) {
JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
bool in_cache = !cached.is_null();
Handle<Object> result;
StringShape shape(*pattern);
if (in_cache) {
re->set_data(*cached);
result = re;
} else {
bool is_atom = !flags.is_ignore_case();
for (int i = 0; is_atom && i < pattern->length(shape); i++) {
if (is_reg_exp_special_char.get(pattern->Get(shape, i)))
is_atom = false;
}
if (is_atom) {
result = AtomCompile(re, pattern, flags);
} else {
result = JsreCompile(re, pattern, flags);
}
Object* data = re->data();
if (data->IsFixedArray()) {
// If compilation succeeded then the data is set on the regexp
// and we can store it in the cache.
Handle<FixedArray> data(FixedArray::cast(re->data()));
CompilationCache::PutRegExp(pattern, flags, data);
}
}
LOG(RegExpCompileEvent(re, in_cache));
return result;
}
Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
Handle<String> subject,
Handle<Object> index) {
switch (regexp->TypeTag()) {
case JSRegExp::JSCRE:
return JsreExec(regexp, subject, index);
case JSRegExp::ATOM:
return AtomExec(regexp, subject, index);
default:
UNREACHABLE();
return Handle<Object>();
}
}
Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
Handle<String> subject) {
switch (regexp->TypeTag()) {
case JSRegExp::JSCRE:
return JsreExecGlobal(regexp, subject);
case JSRegExp::ATOM:
return AtomExecGlobal(regexp, subject);
default:
UNREACHABLE();
return Handle<Object>();
}
}
Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags) {
Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern);
return re;
}
Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
Handle<String> subject,
Handle<Object> index) {
Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
uint32_t start_index;
if (!Array::IndexFromObject(*index, &start_index)) {
return Handle<Smi>(Smi::FromInt(-1));
}
LOG(RegExpExecEvent(re, start_index, subject));
int value = Runtime::StringMatch(subject, needle, start_index);
if (value == -1) return Factory::null_value();
Handle<FixedArray> array = Factory::NewFixedArray(2);
array->set(0,
Smi::FromInt(value),
SKIP_WRITE_BARRIER);
array->set(1,
Smi::FromInt(value + needle->length()),
SKIP_WRITE_BARRIER);
return Factory::NewJSArrayWithElements(array);
}
Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
Handle<String> subject) {
Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
Handle<JSArray> result = Factory::NewJSArray(1);
int index = 0;
int match_count = 0;
int subject_length = subject->length();
int needle_length = needle->length();
while (true) {
LOG(RegExpExecEvent(re, index, subject));
int value = -1;
if (index + needle_length <= subject_length) {
value = Runtime::StringMatch(subject, needle, index);
}
if (value == -1) break;
HandleScope scope;
int end = value + needle_length;
Handle<FixedArray> array = Factory::NewFixedArray(2);
array->set(0,
Smi::FromInt(value),
SKIP_WRITE_BARRIER);
array->set(1,
Smi::FromInt(end),
SKIP_WRITE_BARRIER);
Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
SetElement(result, match_count, pair);
match_count++;
index = end;
if (needle_length == 0) index++;
}
return result;
}
static inline Object* DoCompile(String* pattern,
JSRegExp::Flags flags,
unsigned* number_of_captures,
const char** error_message,
JscreRegExp** code) {
JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
? JSRegExpIgnoreCase
: JSRegExpDoNotIgnoreCase;
JSRegExpMultilineOption multiline_option = flags.is_multiline()
? JSRegExpMultiline
: JSRegExpSingleLine;
*error_message = NULL;
malloc_failure = Failure::Exception();
*code = jsRegExpCompile(pattern->GetTwoByteData(),
pattern->length(),
case_option,
multiline_option,
number_of_captures,
error_message,
&JSREMalloc,
&JSREFree);
if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
malloc_failure->IsOutOfMemoryFailure())) {
return malloc_failure;
} else {
// It doesn't matter which object we return here, we just need to return
// a non-failure to indicate to the GC-retry code that there was no
// allocation failure.
return pattern;
}
}
void CompileWithRetryAfterGC(Handle<String> pattern,
JSRegExp::Flags flags,
unsigned* number_of_captures,
const char** error_message,
JscreRegExp** code) {
CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,
flags,
number_of_captures,
error_message,
code));
}
Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags) {
Handle<String> two_byte_pattern = StringToTwoByte(pattern);
unsigned number_of_captures;
const char* error_message = NULL;
JscreRegExp* code = NULL;
FlattenString(pattern);
CompileWithRetryAfterGC(two_byte_pattern,
flags,
&number_of_captures,
&error_message,
&code);
if (code == NULL) {
// Throw an exception.
Handle<JSArray> array = Factory::NewJSArray(2);
SetElement(array, 0, pattern);
SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(
(error_message == NULL) ? "Unknown regexp error" : error_message)));
Handle<Object> regexp_err =
Factory::NewSyntaxError("malformed_regexp", array);
Top::Throw(*regexp_err);
return Handle<Object>();
}
// Convert the return address to a ByteArray pointer.
Handle<ByteArray> internal(
ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
Handle<FixedArray> value = Factory::NewFixedArray(2);
value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures));
value->set(INTERNAL_INDEX, *internal);
Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
return re;
}
Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp,
int num_captures,
Handle<String> subject,
int previous_index,
const uc16* two_byte_subject,
int* offsets_vector,
int offsets_vector_length) {
int rc;
{
AssertNoAllocation a;
ByteArray* internal = JsreInternal(regexp);
const JscreRegExp* js_regexp =
reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress());
LOG(RegExpExecEvent(regexp, previous_index, subject));
rc = jsRegExpExecute(js_regexp,
two_byte_subject,
subject->length(),
previous_index,
offsets_vector,
offsets_vector_length);
}
// The KJS JavaScript engine returns null (ie, a failed match) when
// JSRE's internal match limit is exceeded. We duplicate that behavior here.
if (rc == JSRegExpErrorNoMatch
|| rc == JSRegExpErrorHitLimit) {
return Factory::null_value();
}
// Other JSRE errors:
if (rc < 0) {
// Throw an exception.
Handle<Object> code(Smi::FromInt(rc));
Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
Handle<Object> regexp_err(
Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
return Handle<Object>(Top::Throw(*regexp_err));
}
Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
// The captures come in (start, end+1) pairs.
for (int i = 0; i < 2 * (num_captures+1); i += 2) {
array->set(i,
Smi::FromInt(offsets_vector[i]),
SKIP_WRITE_BARRIER);
array->set(i+1,
Smi::FromInt(offsets_vector[i+1]),
SKIP_WRITE_BARRIER);
}
return Factory::NewJSArrayWithElements(array);
}
class OffsetsVector {
public:
inline OffsetsVector(int num_captures) {
offsets_vector_length_ = (num_captures + 1) * 3;
if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
vector_ = NewArray<int>(offsets_vector_length_);
} else {
vector_ = static_offsets_vector_;
}
}
inline ~OffsetsVector() {
if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
DeleteArray(vector_);
vector_ = NULL;
}
}
inline int* vector() {
return vector_;
}
inline int length() {
return offsets_vector_length_;
}
private:
int* vector_;
int offsets_vector_length_;
static const int kStaticOffsetsVectorSize = 30;
static int static_offsets_vector_[kStaticOffsetsVectorSize];
};
int OffsetsVector::static_offsets_vector_[
OffsetsVector::kStaticOffsetsVectorSize];
Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp,
Handle<String> subject,
Handle<Object> index) {
// Prepare space for the return values.
int num_captures = JsreCapture(regexp);
OffsetsVector offsets(num_captures);
int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
Handle<String> subject16 = CachedStringToTwoByte(subject);
Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,
previous_index,
subject16->GetTwoByteData(),
offsets.vector(), offsets.length()));
return result;
}
Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp,
Handle<String> subject) {
// Prepare space for the return values.
int num_captures = JsreCapture(regexp);
OffsetsVector offsets(num_captures);
int previous_index = 0;
Handle<JSArray> result = Factory::NewJSArray(0);
int i = 0;
Handle<Object> matches;
Handle<String> subject16 = CachedStringToTwoByte(subject);
do {
if (previous_index > subject->length() || previous_index < 0) {
// Per ECMA-262 15.10.6.2, if the previous index is greater than the
// string length, there is no match.
matches = Factory::null_value();
} else {
matches = JsreExecOnce(regexp, num_captures, subject, previous_index,
subject16->GetTwoByteData(),
offsets.vector(), offsets.length());
if (matches->IsJSArray()) {
SetElement(result, i, matches);
i++;
previous_index = offsets.vector()[1];
if (offsets.vector()[0] == offsets.vector()[1]) {
previous_index++;
}
}
}
} while (matches->IsJSArray());
// If we exited the loop with an exception, throw it.
if (matches->IsNull()) { // Exited loop normally.
return result;
} else { // Exited loop with the exception in matches.
return matches;
}
}
int RegExpImpl::JsreCapture(Handle<JSRegExp> re) {
FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
return Smi::cast(value->get(CAPTURE_INDEX))->value();
}
ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) {
FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
return ByteArray::cast(value->get(INTERNAL_INDEX));
}
}} // namespace v8::internal