Refactoring of RegExp interface to better support calling several times in a row.

Review URL: http://codereview.chromium.org/1114001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4190 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
lrn@chromium.org 2010-03-19 12:01:17 +00:00
parent 9582645a02
commit 4db15f1235
8 changed files with 189 additions and 122 deletions

View File

@ -648,16 +648,17 @@ Handle<Object> RegExpMacroAssemblerARM::GetCode(Handle<String> source) {
__ ldr(r0, MemOperand(frame_pointer(), kInputStart));
// Find negative length (offset of start relative to end).
__ sub(current_input_offset(), r0, end_of_input_address());
// Set r0 to address of char before start of input
// Set r0 to address of char before start of the input string
// (effectively string position -1).
__ ldr(r1, MemOperand(frame_pointer(), kStartIndex));
__ sub(r0, current_input_offset(), Operand(char_size()));
__ sub(r0, r0, Operand(r1, LSL, (mode_ == UC16) ? 1 : 0));
// Store this value in a local variable, for use when clearing
// position registers.
__ str(r0, MemOperand(frame_pointer(), kInputStartMinusOne));
// Determine whether the start index is zero, that is at the start of the
// string, and store that value in a local variable.
__ ldr(r1, MemOperand(frame_pointer(), kStartIndex));
__ tst(r1, Operand(r1));
__ mov(r1, Operand(1), LeaveCC, eq);
__ mov(r1, Operand(0), LeaveCC, ne);
@ -700,12 +701,15 @@ Handle<Object> RegExpMacroAssemblerARM::GetCode(Handle<String> source) {
// copy captures to output
__ ldr(r1, MemOperand(frame_pointer(), kInputStart));
__ ldr(r0, MemOperand(frame_pointer(), kRegisterOutput));
__ ldr(r2, MemOperand(frame_pointer(), kStartIndex));
__ sub(r1, end_of_input_address(), r1);
// r1 is length of input in bytes.
if (mode_ == UC16) {
__ mov(r1, Operand(r1, LSR, 1));
}
// r1 is length of input in characters.
__ add(r1, r1, Operand(r2));
// r1 is length of string in characters.
ASSERT_EQ(0, num_saved_registers_ % 2);
// Always an even number of capture registers. This allows us to

View File

@ -10584,15 +10584,7 @@ void RegExpExecStub::Generate(MacroAssembler* masm) {
__ j(negative, &done);
// Read the value from the static offsets vector buffer.
__ mov(edi, Operand(ecx, edx, times_int_size, 0));
// Perform explicit shift
ASSERT_EQ(0, kSmiTag);
__ shl(edi, kSmiTagSize);
// Add previous index (from its stack slot) if value is not negative.
Label capture_negative;
// Carry flag set by shift above.
__ j(negative, &capture_negative, not_taken);
__ add(edi, Operand(eax)); // Add previous index (adding smi to smi).
__ bind(&capture_negative);
__ SmiTag(edi);
// Store the smi value in the last match info.
__ mov(FieldOperand(ebx,
edx,

View File

@ -182,7 +182,8 @@ class MacroAssembler: public Assembler {
// Smi tagging support.
void SmiTag(Register reg) {
ASSERT(kSmiTag == 0);
shl(reg, kSmiTagSize);
ASSERT(kSmiTagSize == 1);
add(reg, Operand(reg));
}
void SmiUntag(Register reg) {
sar(reg, kSmiTagSize);

View File

@ -653,6 +653,8 @@ Handle<Object> RegExpMacroAssemblerIA32::GetCode(Handle<String> source) {
__ j(not_zero, &exit_label_);
__ bind(&stack_ok);
// Load start index for later use.
__ mov(ebx, Operand(ebp, kStartIndex));
// Allocate space on stack for registers.
__ sub(Operand(esp), Immediate(num_registers_ * kPointerSize));
@ -662,17 +664,23 @@ Handle<Object> RegExpMacroAssemblerIA32::GetCode(Handle<String> source) {
__ mov(edi, Operand(ebp, kInputStart));
// Set up edi to be negative offset from string end.
__ sub(edi, Operand(esi));
// Set eax to address of char before start of input
// Set eax to address of char before start of the string.
// (effectively string position -1).
__ lea(eax, Operand(edi, -char_size()));
__ neg(ebx);
if (mode_ == UC16) {
__ lea(eax, Operand(edi, ebx, times_2, -char_size()));
} else {
__ lea(eax, Operand(edi, ebx, times_1, -char_size()));
}
// Store this value in a local variable, for use when clearing
// position registers.
__ mov(Operand(ebp, kInputStartMinusOne), eax);
// Determine whether the start index is zero, that is at the start of the
// string, and store that value in a local variable.
__ mov(ebx, Operand(ebp, kStartIndex));
__ xor_(Operand(ecx), ecx); // setcc only operates on cl (lower byte of ecx).
// Register ebx still holds -stringIndex.
__ test(ebx, Operand(ebx));
__ setcc(zero, ecx); // 1 if 0 (start of string), 0 if positive.
__ mov(Operand(ebp, kAtStart), ecx);
@ -721,10 +729,17 @@ Handle<Object> RegExpMacroAssemblerIA32::GetCode(Handle<String> source) {
// copy captures to output
__ mov(ebx, Operand(ebp, kRegisterOutput));
__ mov(ecx, Operand(ebp, kInputEnd));
__ mov(edx, Operand(ebp, kStartIndex));
__ sub(ecx, Operand(ebp, kInputStart));
if (mode_ == UC16) {
__ lea(ecx, Operand(ecx, edx, times_2, 0));
} else {
__ add(ecx, Operand(edx));
}
for (int i = 0; i < num_saved_registers_; i++) {
__ mov(eax, register_location(i));
__ add(eax, Operand(ecx)); // Convert to index from start, not end.
// Convert to index from start of string, not end.
__ add(eax, Operand(ecx));
if (mode_ == UC16) {
__ sar(eax, 1); // Convert byte index to character index.
}

View File

@ -149,7 +149,7 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
AtomCompile(re, pattern, flags, atom_string);
} else {
IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
}
ASSERT(re->data()->IsFixedArray());
// Compilation succeeded so the data is set on the regexp
@ -341,10 +341,10 @@ Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
}
void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags,
int capture_count) {
void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags,
int capture_count) {
// Initialize compiled code entries to null.
Factory::SetRegExpIrregexpData(re,
JSRegExp::IRREGEXP,
@ -354,6 +354,94 @@ void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
}
int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
Handle<String> subject) {
if (!subject->IsFlat()) {
FlattenString(subject);
}
bool is_ascii = subject->IsAsciiRepresentation();
if (!EnsureCompiledIrregexp(regexp, is_ascii)) {
return -1;
}
#ifdef V8_NATIVE_REGEXP
// Native regexp only needs room to output captures. Registers are handled
// internally.
return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
#else // !V8_NATIVE_REGEXP
// Byte-code regexp needs space allocated for all its registers.
return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
#endif // V8_NATIVE_REGEXP
}
RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,
Handle<String> subject,
int index,
Vector<int> output) {
Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()));
ASSERT(index >= 0);
ASSERT(index <= subject->length());
ASSERT(subject->IsFlat());
#ifdef V8_NATIVE_REGEXP
ASSERT(output.length() >=
(IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
do {
bool is_ascii = subject->IsAsciiRepresentation();
Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii));
NativeRegExpMacroAssembler::Result res =
NativeRegExpMacroAssembler::Match(code,
subject,
output.start(),
output.length(),
index);
if (res != NativeRegExpMacroAssembler::RETRY) {
ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
Top::has_pending_exception());
STATIC_ASSERT(
static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
STATIC_ASSERT(
static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
== RE_EXCEPTION);
return static_cast<IrregexpResult>(res);
}
// If result is RETRY, the string has changed representation, and we
// must restart from scratch.
// In this case, it means we must make sure we are prepared to handle
// the, potentially, differen subject (the string can switch between
// being internal and external, and even between being ASCII and UC16,
// but the characters are always the same).
IrregexpPrepare(regexp, subject);
} while (true);
UNREACHABLE();
return RE_EXCEPTION;
#else // ndef V8_NATIVE_REGEXP
ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
bool is_ascii = subject->IsAsciiRepresentation();
// We must have done EnsureCompiledIrregexp, so we can get the number of
// registers.
int* register_vector = output.start();
int number_of_capture_registers =
(IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
for (int i = number_of_capture_registers - 1; i >= 0; i--) {
register_vector[i] = -1;
}
Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii));
if (IrregexpInterpreter::Match(byte_codes,
subject,
register_vector,
index)) {
return RE_SUCCESS;
}
return RE_FAILURE;
#endif // ndef V8_NATIVE_REGEXP
}
Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
Handle<String> subject,
int previous_index,
@ -361,9 +449,6 @@ Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
// Prepare space for the return values.
int number_of_capture_registers =
(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
#ifndef V8_NATIVE_REGEXP
#ifdef DEBUG
if (FLAG_trace_regexp_bytecodes) {
@ -373,101 +458,42 @@ Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
}
#endif
#endif
if (!subject->IsFlat()) {
FlattenString(subject);
}
last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
Handle<FixedArray> array;
// Dispatch to the correct RegExp implementation.
Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
#ifdef V8_NATIVE_REGEXP
OffsetsVector captures(number_of_capture_registers);
int* captures_vector = captures.vector();
NativeRegExpMacroAssembler::Result res;
do {
bool is_ascii = subject->IsAsciiRepresentation();
if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
return Handle<Object>::null();
}
Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
res = NativeRegExpMacroAssembler::Match(code,
subject,
captures_vector,
captures.length(),
previous_index);
// If result is RETRY, the string have changed representation, and we
// must restart from scratch.
} while (res == NativeRegExpMacroAssembler::RETRY);
if (res == NativeRegExpMacroAssembler::EXCEPTION) {
int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
if (required_registers < 0) {
// Compiling failed with an exception.
ASSERT(Top::has_pending_exception());
return Handle<Object>::null();
}
ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
|| res == NativeRegExpMacroAssembler::FAILURE);
if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
OffsetsVector registers(required_registers);
array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
// The captures come in (start, end+1) pairs.
for (int i = 0; i < number_of_capture_registers; i += 2) {
// Capture values are relative to start_offset only.
// Convert them to be relative to start of string.
if (captures_vector[i] >= 0) {
captures_vector[i] += previous_index;
IrregexpResult res = IrregexpExecOnce(jsregexp,
subject,
previous_index,
Vector<int>(registers.vector(),
registers.length()));
if (res == RE_SUCCESS) {
int capture_register_count =
(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
AssertNoAllocation no_gc;
int* register_vector = registers.vector();
FixedArray* array = FixedArray::cast(last_match_info->elements());
for (int i = 0; i < capture_register_count; i += 2) {
SetCapture(array, i, register_vector[i]);
SetCapture(array, i + 1, register_vector[i + 1]);
}
if (captures_vector[i + 1] >= 0) {
captures_vector[i + 1] += previous_index;
}
SetCapture(*array, i, captures_vector[i]);
SetCapture(*array, i + 1, captures_vector[i + 1]);
SetLastCaptureCount(array, capture_register_count);
SetLastSubject(array, *subject);
SetLastInput(array, *subject);
return last_match_info;
}
#else // ! V8_NATIVE_REGEXP
bool is_ascii = subject->IsAsciiRepresentation();
if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
if (res == RE_EXCEPTION) {
ASSERT(Top::has_pending_exception());
return Handle<Object>::null();
}
// Now that we have done EnsureCompiledIrregexp we can get the number of
// registers.
int number_of_registers =
IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
OffsetsVector registers(number_of_registers);
int* register_vector = registers.vector();
for (int i = number_of_capture_registers - 1; i >= 0; i--) {
register_vector[i] = -1;
}
Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
if (!IrregexpInterpreter::Match(byte_codes,
subject,
register_vector,
previous_index)) {
return Factory::null_value();
}
array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
// The captures come in (start, end+1) pairs.
for (int i = 0; i < number_of_capture_registers; i += 2) {
SetCapture(*array, i, register_vector[i]);
SetCapture(*array, i + 1, register_vector[i + 1]);
}
#endif // V8_NATIVE_REGEXP
SetLastCaptureCount(*array, number_of_capture_registers);
SetLastSubject(*array, *subject);
SetLastInput(*array, *subject);
return last_match_info;
ASSERT(res == RE_FAILURE);
return Factory::null_value();
}

View File

@ -77,10 +77,10 @@ class RegExpImpl {
Handle<JSArray> lastMatchInfo);
// Prepares a JSRegExp object with Irregexp-specific data.
static void IrregexpPrepare(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags,
int capture_register_count);
static void IrregexpInitialize(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags,
int capture_register_count);
static void AtomCompile(Handle<JSRegExp> re,
@ -93,6 +93,29 @@ class RegExpImpl {
int index,
Handle<JSArray> lastMatchInfo);
enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Handle<JSRegExp> regexp,
Handle<String> subject);
// Execute a regular expression once on the subject, starting from
// character "index".
// If successful, returns RE_SUCCESS and set the capture positions
// in the first registers.
// If matching fails, returns RE_FAILURE.
// If execution fails, sets a pending exception and returns RE_EXCEPTION.
static IrregexpResult IrregexpExecOnce(Handle<JSRegExp> regexp,
Handle<String> subject,
int index,
Vector<int32_t> registers);
// Execute an Irregexp bytecode pattern.
// On a successful match, the result is a JSArray containing
// captured positions. On a failure, the result is the null value.

View File

@ -7182,12 +7182,6 @@ void RegExpExecStub::Generate(MacroAssembler* masm) {
// Read the value from the static offsets vector buffer and make it a smi.
__ movl(rdi, Operand(rcx, rdx, times_int_size, 0));
__ Integer32ToSmi(rdi, rdi, &runtime);
// Add previous index (from its stack slot) if value is not negative.
Label capture_negative;
// Negative flag set by smi convertion above.
__ j(negative, &capture_negative);
__ SmiAdd(rdi, rdi, rax, &runtime); // Add previous index.
__ bind(&capture_negative);
// Store the smi value in the last match info.
__ movq(FieldOperand(rbx,
rdx,

View File

@ -711,9 +711,15 @@ Handle<Object> RegExpMacroAssemblerX64::GetCode(Handle<String> source) {
__ movq(rdi, Operand(rbp, kInputStart));
// Set up rdi to be negative offset from string end.
__ subq(rdi, rsi);
// Set rax to address of char before start of input
// Set rax to address of char before start of the string
// (effectively string position -1).
__ lea(rax, Operand(rdi, -char_size()));
__ movq(rbx, Operand(rbp, kStartIndex));
__ neg(rbx);
if (mode_ == UC16) {
__ lea(rax, Operand(rdi, rbx, times_2, -char_size()));
} else {
__ lea(rax, Operand(rdi, rbx, times_1, -char_size()));
}
// Store this value in a local variable, for use when clearing
// position registers.
__ movq(Operand(rbp, kInputStartMinusOne), rax);
@ -770,9 +776,15 @@ Handle<Object> RegExpMacroAssemblerX64::GetCode(Handle<String> source) {
__ bind(&success_label_);
if (num_saved_registers_ > 0) {
// copy captures to output
__ movq(rdx, Operand(rbp, kStartIndex));
__ movq(rbx, Operand(rbp, kRegisterOutput));
__ movq(rcx, Operand(rbp, kInputEnd));
__ subq(rcx, Operand(rbp, kInputStart));
if (mode_ == UC16) {
__ lea(rcx, Operand(rcx, rdx, times_2, 0));
} else {
__ addq(rcx, rdx);
}
for (int i = 0; i < num_saved_registers_; i++) {
__ movq(rax, register_location(i));
__ addq(rax, rcx); // Convert to index from start, not end.