// Copyright 2006-2008 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// NOTE(review): this copy of the file looks extraction-damaged: the original
// line breaks are gone and angle-bracketed text is missing (the header name of
// the bare #include below, the template arguments of Handle and Vector, and
// the target types of the named casts). All code tokens are left exactly as
// found -- restore the missing pieces from the upstream file before building.
#define _HAS_EXCEPTIONS 0
#include

#include "v8.h"

#include "ast.h"
#include "execution.h"
#include "factory.h"
#include "jsregexp-inl.h"
#include "platform.h"
#include "runtime.h"
#include "top.h"
#include "compilation-cache.h"
#include "string-stream.h"
#include "parser.h"
#include "regexp-macro-assembler.h"
#include "regexp-macro-assembler-tracer.h"
#include "regexp-macro-assembler-irregexp.h"

#ifdef ARM
#include "regexp-macro-assembler-arm.h"
#else  // IA32
#include "macro-assembler-ia32.h"
#include "regexp-macro-assembler-ia32.h"
#endif

#include "interpreter-irregexp.h"

// Including pcre.h undefines DEBUG to avoid getting debug output from
// the JSCRE implementation. Make sure to redefine it in debug mode
// after having included the header file.
#ifdef DEBUG
#include "third_party/jscre/pcre.h"
#define DEBUG
#else
#include "third_party/jscre/pcre.h"
#endif


namespace v8 { namespace internal {


// Failure recorded by the most recent failing JSREMalloc call. DoCompile
// resets it before compiling and inspects it afterwards.
static Failure* malloc_failure;

// Allocation callback passed to jsRegExpCompile. Backs every request with a
// heap-allocated ByteArray and returns the address of its data area, or NULL
// (after recording the failure in malloc_failure) if the allocation failed.
static void* JSREMalloc(size_t size) {
  Object* obj = Heap::AllocateByteArray(size);

  // If allocation failed, return a NULL pointer to JSRE; jsRegExpCompile
  // will then return NULL to its caller, which performs the GC.
  // Also pass failure information to the caller.
  if (obj->IsFailure()) {
    malloc_failure = Failure::cast(obj);
    return NULL;
  }

  // Note: object is unrooted, the caller of jsRegExpCompile must
  // create a handle for the return value before doing heap allocation.
  return reinterpret_cast(ByteArray::cast(obj)->GetDataStartAddress());
}


// Deallocation callback passed to jsRegExpCompile.
static void JSREFree(void* p) {
  USE(p);  // Do nothing, memory is garbage collected.
}


// Single-entry cache used by CachedStringToTwoByte: the last source string
// that was converted and the two byte string produced for it. Both are raw
// (non-handle) pointers and are reset by the collection prologues below.
String* RegExpImpl::last_ascii_string_ = NULL;
String* RegExpImpl::two_byte_cached_string_ = NULL;


// Invalidates the conversion cache if the cached source string lives in new
// space.
void RegExpImpl::NewSpaceCollectionPrologue() {
  // The two byte string is always in the old space.  The Ascii string may be
  // in either place.  If it is in the old space we don't need to do anything.
  if (Heap::InNewSpace(last_ascii_string_)) {
    // Invalidate the cache.
    last_ascii_string_ = NULL;
    two_byte_cached_string_ = NULL;
  }
}


// Unconditionally invalidates the conversion cache.
void RegExpImpl::OldSpaceCollectionPrologue() {
  last_ascii_string_ = NULL;
  two_byte_cached_string_ = NULL;
}


// Creates a regexp object by calling `constructor` with the two arguments
// (pattern, flags). Returns an empty handle if lazily loading the constructor
// left *has_pending_exception set; otherwise returns whatever Execution::New
// returns (which also reports through has_pending_exception).
Handle RegExpImpl::CreateRegExpLiteral(Handle constructor,
                                       Handle pattern,
                                       Handle flags,
                                       bool* has_pending_exception) {
  // Ensure that the constructor function has been loaded.
  if (!constructor->IsLoaded()) {
    LoadLazy(constructor, has_pending_exception);
    if (*has_pending_exception) return Handle();
  }
  // Call the construct code with 2 arguments.
  Object** argv[2] = { Handle::cast(pattern).location(),
                       Handle::cast(flags).location() };
  return Execution::New(constructor, 2, argv, has_pending_exception);
}


// Cached front end to StringToTwoByte. If `subject` is the very string object
// that was converted by the previous call, the previously produced string is
// returned; otherwise the subject is converted and the cache entry replaced.
Handle RegExpImpl::CachedStringToTwoByte(Handle subject) {
  if (*subject == last_ascii_string_) {
    ASSERT(two_byte_cached_string_ != NULL);
    return Handle(String::cast(two_byte_cached_string_));
  }
  Handle two_byte_string = StringToTwoByte(subject);
  last_ascii_string_ = *subject;
  two_byte_cached_string_ = *two_byte_string;
  return two_byte_string;
}


// Converts a source string to a 16 bit flat string (or a SlicedString
// containing a 16 bit flat string).
// The pattern is flattened first if needed. A flat string that is not in
// ASCII representation is returned as is; otherwise its characters are copied
// into a freshly allocated TENURED raw two byte string.
Handle RegExpImpl::StringToTwoByte(Handle pattern) {
  StringShape shape(*pattern);
  if (!pattern->IsFlat(shape)) {
    FlattenString(pattern);
    shape = StringShape(*pattern);
  }
  // For a cons string the content is taken from its first part.
  Handle flat_string(shape.IsCons() ?
    String::cast(ConsString::cast(*pattern)->first()) :
    *pattern);
  ASSERT(flat_string->IsString());
  StringShape flat_shape(*flat_string);
  ASSERT(!flat_shape.IsCons());
  ASSERT(flat_shape.IsSequential() ||
         flat_shape.IsSliced() ||
         flat_shape.IsExternal());
  if (!flat_shape.IsAsciiRepresentation()) {
    return flat_string;
  }

  int len = flat_string->length(flat_shape);
  Handle two_byte_string =
    Factory::NewRawTwoByteString(len, TENURED);
  uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
  String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
  return two_byte_string;
}


// Translates a flag string into JSRegExp::Flags: 'i' sets IGNORE_CASE, 'g'
// sets GLOBAL and 'm' sets MULTILINE. Any other character is ignored.
static JSRegExp::Flags RegExpFlagsFromString(Handle str) {
  int flags = JSRegExp::NONE;
  StringShape shape(*str);
  for (int i = 0; i < str->length(shape); i++) {
    switch (str->Get(shape, i)) {
      case 'i':
        flags |= JSRegExp::IGNORE_CASE;
        break;
      case 'g':
        flags |= JSRegExp::GLOBAL;
        break;
      case 'm':
        flags |= JSRegExp::MULTILINE;
        break;
    }
  }
  return JSRegExp::Flags(flags);
}


// Throws (via Top::Throw) a syntax error created from `message` and the
// two-element array [pattern, error_text]. `re` is not used.
static inline void ThrowRegExpException(Handle re,
                                        Handle pattern,
                                        Handle error_text,
                                        const char* message) {
  Handle array = Factory::NewJSArray(2);
  SetElement(array, 0, pattern);
  SetElement(array, 1, error_text);
  Handle regexp_err = Factory::NewSyntaxError(message, array);
  Top::Throw(*regexp_err);
}


// Compiles `pattern` with the flags in `flag_str` and stores the result on
// `re`. Lookup order:
//   1. compilation cache hit: the cached data is installed on `re`;
//   2. the parsed pattern is a single atom and case is not ignored: ATOM;
//   3. RegExpEngine::Compile produced data: IRREGEXP;
//   4. otherwise: JSCRE (compiled lazily, see JscreCompile).
// Returns `re`, or an empty handle after throwing if the pattern fails to
// parse.
Handle RegExpImpl::Compile(Handle re,
                           Handle pattern,
                           Handle flag_str) {
  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
  Handle cached = CompilationCache::LookupRegExp(pattern, flags);
  bool in_cache = !cached.is_null();
  Handle result;
  if (in_cache) {
    re->set_data(*cached);
    result = re;
  } else {
    FlattenString(pattern);
    RegExpParseResult parse_result;
    FlatStringReader reader(pattern);
    if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
      // Throw an exception if we fail to parse the pattern.
      ThrowRegExpException(re,
                           pattern,
                           parse_result.error,
                           "malformed_regexp");
      return Handle();
    }
    RegExpAtom* atom = parse_result.tree->AsAtom();
    if (atom != NULL && !flags.is_ignore_case()) {
      if (parse_result.has_character_escapes) {
        // The source text contains escapes, so match against the atom's own
        // character data instead of the pattern source.
        Vector atom_pattern = atom->data();
        Handle atom_string =
            Factory::NewStringFromTwoByte(atom_pattern);
        result = AtomCompile(re, pattern, flags, atom_string);
      } else {
        result = AtomCompile(re, pattern, flags, pattern);
      }
    } else {
      RegExpNode* node = NULL;
      Handle irregexp_data =
          RegExpEngine::Compile(&parse_result,
                                &node,
                                flags.is_ignore_case(),
                                flags.is_multiline());
      if (irregexp_data.is_null()) {
        if (FLAG_disable_jscre) {
          UNIMPLEMENTED();
        }
        result = JscrePrepare(re, pattern, flags);
      } else {
        result = IrregexpPrepare(re, pattern, flags, irregexp_data);
      }
    }
    // NOTE(review): the handle declared inside the if below shadows this raw
    // pointer of the same name.
    Object* data = re->data();
    if (data->IsFixedArray()) {
      // If compilation succeeded then the data is set on the regexp
      // and we can store it in the cache.
      Handle data(FixedArray::cast(re->data()));
      CompilationCache::PutRegExp(pattern, flags, data);
    }
  }

  LOG(RegExpCompileEvent(re, in_cache));
  return result;
}


// Runs `regexp` once against `subject` starting at `index`, dispatching on
// the regexp's type tag to the JSCRE, ATOM or IRREGEXP implementation.
Handle RegExpImpl::Exec(Handle regexp,
                        Handle subject,
                        Handle index) {
  switch (regexp->TypeTag()) {
    case JSRegExp::JSCRE:
      if (FLAG_disable_jscre) {
        UNIMPLEMENTED();
      }
      return JscreExec(regexp, subject, index);
    case JSRegExp::ATOM:
      return AtomExec(regexp, subject, index);
    case JSRegExp::IRREGEXP:
      return IrregexpExec(regexp, subject, index);
    default:
      UNREACHABLE();
      return Handle();
  }
}


// Collects all matches of `regexp` in `subject`, dispatching on the regexp's
// type tag in the same way as Exec.
Handle RegExpImpl::ExecGlobal(Handle regexp,
                              Handle subject) {
  switch (regexp->TypeTag()) {
    case JSRegExp::JSCRE:
      if (FLAG_disable_jscre) {
        UNIMPLEMENTED();
      }
      return JscreExecGlobal(regexp, subject);
    case JSRegExp::ATOM:
      return AtomExecGlobal(regexp, subject);
    case JSRegExp::IRREGEXP:
      return IrregexpExecGlobal(regexp, subject);
    default:
      UNREACHABLE();
      return Handle();
  }
}


// Marks `re` as an ATOM regexp whose data is the literal `match_pattern`.
Handle RegExpImpl::AtomCompile(Handle re,
                               Handle pattern,
                               JSRegExp::Flags flags,
                               Handle match_pattern) {
  Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
  return re;
}


// Searches `subject` for the atom stored on `re`, starting at `index`.
// Returns a two-element JS array [start, end] on success and the null value
// if there is no match.
// NOTE(review): when `index` is not a valid array index this returns the Smi
// -1 rather than null -- confirm that callers expect that.
Handle RegExpImpl::AtomExec(Handle re,
                            Handle subject,
                            Handle index) {
  Handle needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));

  uint32_t start_index;
  if (!Array::IndexFromObject(*index, &start_index)) {
    return Handle(Smi::FromInt(-1));
  }

  LOG(RegExpExecEvent(re, start_index, subject));
  int value = Runtime::StringMatch(subject, needle, start_index);
  if (value == -1) return Factory::null_value();

  Handle array = Factory::NewFixedArray(2);
  array->set(0, Smi::FromInt(value));
  array->set(1, Smi::FromInt(value + needle->length()));
  return Factory::NewJSArrayWithElements(array);
}


// Finds every occurrence of the atom stored on `re` in `subject`, scanning
// left to right and resuming after the end of each match. Returns a JS array
// holding one [start, end] pair per match.
Handle RegExpImpl::AtomExecGlobal(Handle re,
                                  Handle subject) {
  Handle needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
  Handle result = Factory::NewJSArray(1);
  int index = 0;
  int match_count = 0;
  int subject_length = subject->length();
  int needle_length = needle->length();
  while (true) {
    LOG(RegExpExecEvent(re, index, subject));
    int value = -1;
    // Only search while the needle can still fit in the remaining subject.
    if (index + needle_length <= subject_length) {
      value = Runtime::StringMatch(subject, needle, index);
    }
    if (value == -1) break;
    HandleScope scope;
    int end = value + needle_length;

    Handle array = Factory::NewFixedArray(2);
    array->set(0, Smi::FromInt(value));
    array->set(1, Smi::FromInt(end));
    Handle pair = Factory::NewJSArrayWithElements(array);
    SetElement(result, match_count, pair);
    match_count++;
    index = end;
    // Step past an empty match so the loop makes progress.
    if (needle_length == 0) index++;
  }
  return result;
}


// Marks `re` as a JSCRE regexp with undefined data.
HandleRegExpImpl::JscrePrepare(Handle re,
                               Handle pattern,
                               JSRegExp::Flags flags) {
  Handle value(Heap::undefined_value());
  Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
  return re;
}


// Marks `re` as an IRREGEXP regexp whose data is `irregexp_data`.
HandleRegExpImpl::IrregexpPrepare(Handle re,
                                  Handle pattern,
                                  JSRegExp::Flags flags,
                                  Handle irregexp_data) {
  Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);
  return re;
}


// Runs jsRegExpCompile on `pattern`, using JSREMalloc/JSREFree as allocator
// callbacks, and stores the compiled regexp in *code. Returns the recorded
// allocation failure if compilation produced no code and that failure is a
// retry-after-GC or out-of-memory failure; otherwise returns a non-failure
// object.
static inline Object* DoCompile(String* pattern,
                                JSRegExp::Flags flags,
                                unsigned* number_of_captures,
                                const char** error_message,
                                v8::jscre::JscreRegExp** code) {
  v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
    ? v8::jscre::JSRegExpIgnoreCase
    : v8::jscre::JSRegExpDoNotIgnoreCase;
  v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
    ? v8::jscre::JSRegExpMultiline
    : v8::jscre::JSRegExpSingleLine;
  *error_message = NULL;
  // Reset the recorded failure to one that matches neither test below, so a
  // NULL result counts as an allocation failure only if JSREMalloc recorded
  // one during this call.
  malloc_failure = Failure::Exception();
  *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
                                     pattern->length(),
                                     case_option,
                                     multiline_option,
                                     number_of_captures,
                                     error_message,
                                     &JSREMalloc,
                                     &JSREFree);
  if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
                        malloc_failure->IsOutOfMemoryFailure())) {
    return malloc_failure;
  } else {
    // It doesn't matter which object we return here, we just need to return
    // a non-failure to indicate to the GC-retry code that there was no
    // allocation failure.
    return pattern;
  }
}


// Calls DoCompile through CALL_HEAP_FUNCTION_VOID.
void CompileWithRetryAfterGC(Handle pattern,
                             JSRegExp::Flags flags,
                             unsigned* number_of_captures,
                             const char** error_message,
                             v8::jscre::JscreRegExp** code) {
  CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,
                                    flags,
                                    number_of_captures,
                                    error_message,
                                    code));
}


// Compiles a JSCRE regexp whose data is still undefined. On success the data
// becomes a fixed array holding the capture count and the ByteArray that
// contains the compiled code, and `re` is returned. On failure a
// "malformed_regexp" syntax error is thrown and an empty handle is returned.
Handle RegExpImpl::JscreCompile(Handle re) {
  ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
  ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());

  Handle pattern(re->Pattern());
  JSRegExp::Flags flags = re->GetFlags();

  Handle two_byte_pattern = StringToTwoByte(pattern);

  unsigned number_of_captures;
  const char* error_message = NULL;

  v8::jscre::JscreRegExp* code = NULL;
  // NOTE(review): StringToTwoByte above already flattens a non-flat pattern,
  // so this call looks redundant -- confirm before removing.
  FlattenString(pattern);

  CompileWithRetryAfterGC(two_byte_pattern,
                          flags,
                          &number_of_captures,
                          &error_message,
                          &code);

  if (code == NULL) {
    // Throw an exception.
    Handle array = Factory::NewJSArray(2);
    SetElement(array, 0, pattern);
    SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(
        (error_message == NULL) ? "Unknown regexp error" : error_message)));
    Handle regexp_err =
        Factory::NewSyntaxError("malformed_regexp", array);
    Top::Throw(*regexp_err);
    return Handle();
  }

  // Convert the return address to a ByteArray pointer.
  Handle internal(
      ByteArray::FromDataStartAddress(reinterpret_cast(code)));

  Handle value = Factory::NewFixedArray(kJscreDataLength);
  value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
  value->set(kJscreInternalIndex, *internal);
  Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);

  return re;
}


// Runs an IRREGEXP regexp once against a flat two byte subject, starting at
// previous_index. offsets_vector receives the capture positions. Returns the
// null value if there is no match, otherwise a JS array holding
// 2 * (num_captures + 1) capture offsets.
Handle RegExpImpl::IrregexpExecOnce(Handle regexp,
                                    int num_captures,
                                    Handle two_byte_subject,
                                    int previous_index,
                                    int* offsets_vector,
                                    int offsets_vector_length) {
#ifdef DEBUG
  if (FLAG_trace_regexp_bytecodes) {
    String* pattern = regexp->Pattern();
    PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
    PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));
  }
#endif
  ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());
  ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));
  bool rc;

  // Mark every capture slot as "no value".
  for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
    offsets_vector[i] = -1;
  }

  LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));

  FixedArray* irregexp =
      FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));
  int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();

  switch (tag) {
    case RegExpMacroAssembler::kIA32Implementation: {
#ifndef ARM
      Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));
      Address start_addr =
          Handle::cast(two_byte_subject)->GetCharsAddress();
      // Offsets handed to the generated code are byte offsets measured from
      // the string object itself.
      int string_offset =
          start_addr - reinterpret_cast(*two_byte_subject);
      int start_offset = string_offset + previous_index * sizeof(uc16);
      int end_offset =
          string_offset + two_byte_subject->length() * sizeof(uc16);
      rc = RegExpMacroAssemblerIA32::Execute(code,
                                             two_byte_subject.location(),
                                             start_offset,
                                             end_offset,
                                             offsets_vector,
                                             previous_index == 0);
      if (rc) {
        // Capture values are relative to start_offset only.
        // NOTE(review): this walks all offsets_vector_length slots although
        // only the first 2 * (num_captures + 1) were initialised above.
        for (int i = 0; i < offsets_vector_length; i++) {
          if (offsets_vector[i] >= 0) {
            offsets_vector[i] += previous_index;
          }
        }
      }
      break;
#else
      UNIMPLEMENTED();
      rc = false;
      break;
#endif
    }
    case RegExpMacroAssembler::kBytecodeImplementation: {
      Handle byte_codes = IrregexpCode(regexp);

      rc = IrregexpInterpreter::Match(byte_codes,
                                      two_byte_subject,
                                      offsets_vector,
                                      previous_index);
      break;
    }
    case RegExpMacroAssembler::kARMImplementation:
    default:
      UNREACHABLE();
      rc = false;
      break;
  }

  if (!rc) {
    return Factory::null_value();
  }

  Handle array = Factory::NewFixedArray(2 * (num_captures+1));
  // The captures come in (start, end+1) pairs.
  for (int i = 0; i < 2 * (num_captures+1); i += 2) {
    array->set(i, Smi::FromInt(offsets_vector[i]));
    array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
  }
  return Factory::NewJSArrayWithElements(array);
}


// Runs a compiled JSCRE regexp once via jsRegExpExecute, starting at
// previous_index. Returns the null value for "no match" and "hit limit", the
// result of throwing a "jsre_error" type error for any other negative return
// code, and otherwise a JS array holding 2 * (num_captures + 1) capture
// offsets.
Handle RegExpImpl::JscreExecOnce(Handle regexp,
                                 int num_captures,
                                 Handle subject,
                                 int previous_index,
                                 const uc16* two_byte_subject,
                                 int* offsets_vector,
                                 int offsets_vector_length) {
  int rc;
  {
    // `internal` and `js_regexp` are raw pointers into the heap, hence the
    // no-allocation scope around their use.
    AssertNoAllocation a;
    ByteArray* internal = JscreInternal(regexp);
    const v8::jscre::JscreRegExp* js_regexp =
        reinterpret_cast(
            internal->GetDataStartAddress());

    LOG(RegExpExecEvent(regexp, previous_index, subject));

    rc = v8::jscre::jsRegExpExecute(js_regexp,
                                    two_byte_subject,
                                    subject->length(),
                                    previous_index,
                                    offsets_vector,
                                    offsets_vector_length);
  }

  // The KJS JavaScript engine returns null (ie, a failed match) when
  // JSRE's internal match limit is exceeded.  We duplicate that behavior here.
  if (rc == v8::jscre::JSRegExpErrorNoMatch
      || rc == v8::jscre::JSRegExpErrorHitLimit) {
    return Factory::null_value();
  }

  // Other JSRE errors:
  if (rc < 0) {
    // Throw an exception.
    Handle code(Smi::FromInt(rc));
    Handle args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
    Handle regexp_err(
        Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
    return Handle(Top::Throw(*regexp_err));
  }

  Handle array = Factory::NewFixedArray(2 * (num_captures+1));
  // The captures come in (start, end+1) pairs.
  for (int i = 0; i < 2 * (num_captures+1); i += 2) {
    array->set(i, Smi::FromInt(offsets_vector[i]));
    array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
  }
  return Factory::NewJSArrayWithElements(array);
}


// Scratch buffer for capture offsets. Requests of up to
// kStaticOffsetsVectorSize ints use one class-wide static array; larger
// requests get an array from NewArray that the destructor releases with
// DeleteArray.
// NOTE(review): every small instance aliases the same static array, so two
// such instances alive at once would share storage -- confirm that cannot
// happen.
class OffsetsVector {
 public:
  inline OffsetsVector(int num_registers)
      : offsets_vector_length_(num_registers) {
    if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
      vector_ = NewArray(offsets_vector_length_);
    } else {
      vector_ = static_offsets_vector_;
    }
  }


  inline ~OffsetsVector() {
    // Only the dynamically allocated variant owns its storage.
    if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
      DeleteArray(vector_);
      vector_ = NULL;
    }
  }


  inline int* vector() {
    return vector_;
  }


  inline int length() {
    return offsets_vector_length_;
  }

 private:
  int* vector_;
  int offsets_vector_length_;
  static const int kStaticOffsetsVectorSize = 50;
  static int static_offsets_vector_[kStaticOffsetsVectorSize];
};


int OffsetsVector::static_offsets_vector_[
    OffsetsVector::kStaticOffsetsVectorSize];


// Single-match entry point for IRREGEXP regexps: sizes the offsets buffer
// from the register count stored on the regexp, converts the subject to two
// byte form and runs IrregexpExecOnce from `index`.
Handle RegExpImpl::IrregexpExec(Handle regexp,
                                Handle subject,
                                Handle index) {
  ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
  ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());

  // Prepare space for the return values.
  int number_of_registers = IrregexpNumberOfRegisters(regexp);
  OffsetsVector offsets(number_of_registers);

  int num_captures = IrregexpNumberOfCaptures(regexp);

  int previous_index = static_cast(DoubleToInteger(index->Number()));

  Handle subject16 = CachedStringToTwoByte(subject);

  Handle result(IrregexpExecOnce(regexp,
                                 num_captures,
                                 subject16,
                                 previous_index,
                                 offsets.vector(),
                                 offsets.length()));
  return result;
}


// Single-match entry point for JSCRE regexps. Compiles the regexp first if
// its data is still undefined (returning the empty compile result on
// failure), then runs JscreExecOnce from `index` with an offsets buffer of
// (num_captures + 1) * 3 ints.
Handle RegExpImpl::JscreExec(Handle regexp,
                             Handle subject,
                             Handle index) {
  ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
  if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
    Handle compile_result = JscreCompile(regexp);
    if (compile_result.is_null()) return compile_result;
  }
  ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());

  int num_captures = JscreNumberOfCaptures(regexp);

  OffsetsVector offsets((num_captures + 1) * 3);

  int previous_index = static_cast(DoubleToInteger(index->Number()));

  Handle subject16 = CachedStringToTwoByte(subject);

  Handle result(JscreExecOnce(regexp,
                              num_captures,
                              subject,
                              previous_index,
                              subject16->GetTwoByteData(),
                              offsets.vector(),
                              offsets.length()));

  return result;
}


// Collects all matches of an IRREGEXP regexp in `subject`. Each iteration
// resumes at the end of the previous match (one further after an empty
// match). Returns the array of per-match capture arrays, or the non-array
// value that ended the loop if that value is not null.
Handle RegExpImpl::IrregexpExecGlobal(Handle regexp,
                                      Handle subject) {
  ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
  ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());

  // Prepare space for the return values.
  int number_of_registers = IrregexpNumberOfRegisters(regexp);
  OffsetsVector offsets(number_of_registers);

  int previous_index = 0;

  Handle result = Factory::NewJSArray(0);
  int i = 0;
  Handle matches;

  Handle subject16 = CachedStringToTwoByte(subject);

  do {
    if (previous_index > subject->length() || previous_index < 0) {
      // Per ECMA-262 15.10.6.2, if the previous index is greater than the
      // string length, there is no match.
      matches = Factory::null_value();
    } else {
      matches = IrregexpExecOnce(regexp,
                                 IrregexpNumberOfCaptures(regexp),
                                 subject16,
                                 previous_index,
                                 offsets.vector(),
                                 offsets.length());

      if (matches->IsJSArray()) {
        SetElement(result, i, matches);
        i++;
        previous_index = offsets.vector()[1];
        // Advance past an empty match so the loop makes progress.
        if (offsets.vector()[0] == offsets.vector()[1]) {
          previous_index++;
        }
      }
    }
  } while (matches->IsJSArray());

  // If we exited the loop with an exception, throw it.
  if (matches->IsNull()) {
    // Exited loop normally.
    return result;
  } else {
    // Exited loop with the exception in matches.
    return matches;
  }
}


// Collects all matches of a JSCRE regexp in `subject`. Compiles the regexp
// first if its data is still undefined; otherwise follows the same loop as
// IrregexpExecGlobal, using JscreExecOnce.
Handle RegExpImpl::JscreExecGlobal(Handle regexp,
                                   Handle subject) {
  ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
  if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
    Handle compile_result = JscreCompile(regexp);
    if (compile_result.is_null()) return compile_result;
  }
  ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());

  // Prepare space for the return values.
  int num_captures = JscreNumberOfCaptures(regexp);

  OffsetsVector offsets((num_captures + 1) * 3);

  int previous_index = 0;

  Handle result = Factory::NewJSArray(0);
  int i = 0;
  Handle matches;

  Handle subject16 = CachedStringToTwoByte(subject);

  do {
    if (previous_index > subject->length() || previous_index < 0) {
      // Per ECMA-262 15.10.6.2, if the previous index is greater than the
      // string length, there is no match.
      matches = Factory::null_value();
    } else {
      matches = JscreExecOnce(regexp,
                              num_captures,
                              subject,
                              previous_index,
                              subject16->GetTwoByteData(),
                              offsets.vector(),
                              offsets.length());

      if (matches->IsJSArray()) {
        SetElement(result, i, matches);
        i++;
        previous_index = offsets.vector()[1];
        // Advance past an empty match so the loop makes progress.
        if (offsets.vector()[0] == offsets.vector()[1]) {
          previous_index++;
        }
      }
    }
  } while (matches->IsJSArray());

  // If we exited the loop with an exception, throw it.
  if (matches->IsNull()) {
    // Exited loop normally.
    return result;
  } else {
    // Exited loop with the exception in matches.
    return matches;
  }
}


// Reads the capture count stored in the JSCRE data array of `re`.
int RegExpImpl::JscreNumberOfCaptures(Handle re) {
  FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
  return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
}


// Reads the ByteArray holding the compiled JSCRE code of `re`. The result is
// a raw pointer, not a handle.
ByteArray* RegExpImpl::JscreInternal(Handle re) {
  FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
  return ByteArray::cast(value->get(kJscreInternalIndex));
}


// Reads the capture count stored in the Irregexp data array of `re`.
int RegExpImpl::IrregexpNumberOfCaptures(Handle re) {
  FixedArray* value =
      FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
  return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();
}


// Reads the register count stored in the Irregexp data array of `re`.
int RegExpImpl::IrregexpNumberOfRegisters(Handle re) {
  FixedArray* value =
      FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
  return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();
}


// Reads the code entry of the Irregexp data array of `re` as a ByteArray.
Handle RegExpImpl::IrregexpCode(Handle re) {
  FixedArray* value =
      FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
  return Handle(ByteArray::cast(value->get(kIrregexpCodeIndex)));
}


// -------------------------------------------------------------------
// Implementation of the Irregexp regular expression engine.
void RegExpTree::AppendToText(RegExpText* text) { UNREACHABLE(); } void RegExpAtom::AppendToText(RegExpText* text) { text->AddElement(TextElement::Atom(this)); } void RegExpCharacterClass::AppendToText(RegExpText* text) { text->AddElement(TextElement::CharClass(this)); } void RegExpText::AppendToText(RegExpText* text) { for (int i = 0; i < elements()->length(); i++) text->AddElement(elements()->at(i)); } TextElement TextElement::Atom(RegExpAtom* atom) { TextElement result = TextElement(ATOM); result.data.u_atom = atom; return result; } TextElement TextElement::CharClass( RegExpCharacterClass* char_class) { TextElement result = TextElement(CHAR_CLASS); result.data.u_char_class = char_class; return result; } DispatchTable* ChoiceNode::GetTable(bool ignore_case) { if (table_ == NULL) { table_ = new DispatchTable(); DispatchTableConstructor cons(table_, ignore_case); cons.BuildTable(this); } return table_; } class RegExpCompiler { public: RegExpCompiler(int capture_count, bool ignore_case); int AllocateRegister() { return next_register_++; } Handle Assemble(RegExpMacroAssembler* assembler, RegExpNode* start, int capture_count); inline void AddWork(RegExpNode* node) { work_list_->Add(node); } static const int kImplementationOffset = 0; static const int kNumberOfRegistersOffset = 0; static const int kCodeOffset = 1; RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } EndNode* accept() { return accept_; } EndNode* backtrack() { return backtrack_; } static const int kMaxRecursion = 100; inline int recursion_depth() { return recursion_depth_; } inline void IncrementRecursionDepth() { recursion_depth_++; } inline void DecrementRecursionDepth() { recursion_depth_--; } inline bool ignore_case() { return ignore_case_; } private: EndNode* accept_; EndNode* backtrack_; int next_register_; List* work_list_; int recursion_depth_; RegExpMacroAssembler* macro_assembler_; bool ignore_case_; }; // Attempts to compile the regexp using an Irregexp code generator. 
Returns // a fixed array or a null handle depending on whether it succeeded. RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) : next_register_(2 * (capture_count + 1)), work_list_(NULL), recursion_depth_(0), ignore_case_(ignore_case) { accept_ = new EndNode(EndNode::ACCEPT); backtrack_ = new EndNode(EndNode::BACKTRACK); } Handle RegExpCompiler::Assemble( RegExpMacroAssembler* macro_assembler, RegExpNode* start, int capture_count) { #ifdef DEBUG if (FLAG_trace_regexp_assembler) macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); else #endif macro_assembler_ = macro_assembler; List work_list(0); work_list_ = &work_list; Label fail; macro_assembler_->PushBacktrack(&fail); if (!start->GoTo(this)) { fail.Unuse(); return Handle::null(); } while (!work_list.is_empty()) { if (!work_list.RemoveLast()->GoTo(this)) { fail.Unuse(); return Handle::null(); } } macro_assembler_->Bind(&fail); macro_assembler_->Fail(); Handle array = Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength); array->set(RegExpImpl::kIrregexpImplementationIndex, Smi::FromInt(macro_assembler_->Implementation())); array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex, Smi::FromInt(next_register_)); array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex, Smi::FromInt(capture_count)); Handle code = macro_assembler_->GetCode(); array->set(RegExpImpl::kIrregexpCodeIndex, *code); work_list_ = NULL; #ifdef DEBUG if (FLAG_trace_regexp_assembler) { delete macro_assembler_; } #endif return array; } bool RegExpNode::GoTo(RegExpCompiler* compiler) { // TODO(erikcorry): Implement support. 
if (info_.follows_word_interest || info_.follows_newline_interest || info_.follows_start_interest) { return false; } if (label_.is_bound()) { compiler->macro_assembler()->GoTo(&label_); return true; } else { if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) { compiler->macro_assembler()->GoTo(&label_); compiler->AddWork(this); return true; } else { compiler->IncrementRecursionDepth(); bool how_it_went = Emit(compiler); compiler->DecrementRecursionDepth(); return how_it_went; } } } // EndNodes are special. Because they can be very common and they are very // short we normally inline them. That is, if we are asked to emit a GoTo // we just emit the entire node. Since they don't have successors this // works. bool EndNode::GoTo(RegExpCompiler* compiler) { if (info()->follows_word_interest || info()->follows_newline_interest || info()->follows_start_interest) { return false; } return Emit(compiler); } Label* RegExpNode::label() { return &label_; } bool EndNode::Emit(RegExpCompiler* compiler) { RegExpMacroAssembler* macro = compiler->macro_assembler(); switch (action_) { case ACCEPT: if (!label()->is_bound()) Bind(macro); if (info()->at_end) { Label succeed; // LoadCurrentCharacter will go to the label if we are at the end of the // input string. 
macro->LoadCurrentCharacter(0, &succeed); macro->Backtrack(); macro->Bind(&succeed); } macro->Succeed(); return true; case BACKTRACK: if (!label()->is_bound()) Bind(macro); ASSERT(!info()->at_end); macro->Backtrack(); return true; } return false; } void GuardedAlternative::AddGuard(Guard* guard) { if (guards_ == NULL) guards_ = new ZoneList(1); guards_->Add(guard); } ActionNode* ActionNode::StoreRegister(int reg, int val, RegExpNode* on_success) { ActionNode* result = new ActionNode(STORE_REGISTER, on_success); result->data_.u_store_register.reg = reg; result->data_.u_store_register.value = val; return result; } ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); result->data_.u_increment_register.reg = reg; return result; } ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) { ActionNode* result = new ActionNode(STORE_POSITION, on_success); result->data_.u_position_register.reg = reg; return result; } ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) { ActionNode* result = new ActionNode(RESTORE_POSITION, on_success); result->data_.u_position_register.reg = reg; return result; } ActionNode* ActionNode::BeginSubmatch(int stack_reg, int position_reg, RegExpNode* on_success) { ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); result->data_.u_submatch.stack_pointer_register = stack_reg; result->data_.u_submatch.current_position_register = position_reg; return result; } ActionNode* ActionNode::EscapeSubmatch(int stack_reg, bool restore_position, int position_reg, RegExpNode* on_success) { ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success); result->data_.u_submatch.stack_pointer_register = stack_reg; if (restore_position) { result->data_.u_submatch.current_position_register = position_reg; } else { result->data_.u_submatch.current_position_register = -1; } return result; } #define DEFINE_ACCEPT(Type) \ void 
Type##Node::Accept(NodeVisitor* visitor) { \ visitor->Visit##Type(this); \ } FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) #undef DEFINE_ACCEPT // ------------------------------------------------------------------- // Emit code. void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, Label* on_failure) { switch (guard->op()) { case Guard::LT: macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure); break; case Guard::GEQ: macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure); break; } } static unibrow::Mapping uncanonicalize; static unibrow::Mapping canonrange; static inline void EmitAtomNonLetters( RegExpMacroAssembler* macro_assembler, TextElement elm, Vector quarks, Label* on_failure, int cp_offset) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; for (int i = quarks.length() - 1; i >= 0; i--) { uc16 c = quarks[i]; int length = uncanonicalize.get(c, '\0', chars); if (length <= 1) { macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); macro_assembler->CheckNotCharacter(c, on_failure); } } } static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, uc16 c1, uc16 c2, Label* on_failure) { uc16 exor = c1 ^ c2; // Check whether exor has only one bit set. if (((exor - 1) & exor) == 0) { // If c1 and c2 differ only by one bit. // Ecma262UnCanonicalize always gives the highest number last. ASSERT(c2 > c1); macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure); return true; } ASSERT(c2 > c1); uc16 diff = c2 - c1; if (((diff - 1) & diff) == 0 && c1 >= diff) { // If the characters differ by 2^n but don't differ by one bit then // subtract the difference from the found character, then do the or // trick. We avoid the theoretical case where negative numbers are // involved in order to simplify code generation. 
macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff, diff, on_failure); return true; } return false; } static inline void EmitAtomLetters( RegExpMacroAssembler* macro_assembler, TextElement elm, Vector quarks, Label* on_failure, int cp_offset) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; for (int i = quarks.length() - 1; i >= 0; i--) { uc16 c = quarks[i]; int length = uncanonicalize.get(c, '\0', chars); if (length <= 1) continue; macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); Label ok; ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); switch (length) { case 2: { if (ShortCutEmitCharacterPair(macro_assembler, chars[0], chars[1], on_failure)) { ok.Unuse(); } else { macro_assembler->CheckCharacter(chars[0], &ok); macro_assembler->CheckNotCharacter(chars[1], on_failure); macro_assembler->Bind(&ok); } break; } case 4: macro_assembler->CheckCharacter(chars[3], &ok); // Fall through! case 3: macro_assembler->CheckCharacter(chars[0], &ok); macro_assembler->CheckCharacter(chars[1], &ok); macro_assembler->CheckNotCharacter(chars[2], on_failure); macro_assembler->Bind(&ok); break; default: UNREACHABLE(); break; } } } static void EmitCharClass(RegExpMacroAssembler* macro_assembler, RegExpCharacterClass* cc, int cp_offset, Label* on_failure) { macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); cp_offset++; ZoneList* ranges = cc->ranges(); Label success; Label* char_is_in_class = cc->is_negated() ? 
on_failure : &success;

  int range_count = ranges->length();

  if (range_count == 0) {
    // An empty non-negated class matches nothing; an empty negated class
    // matches every character, so nothing more is emitted for it.
    if (!cc->is_negated()) {
      macro_assembler->GoTo(on_failure);
    }
    return;
  }

  // All ranges but the last: jump to |char_is_in_class| on a hit, fall
  // through to the next range on a miss.
  for (int i = 0; i < range_count - 1; i++) {
    CharacterRange& range = ranges->at(i);
    Label next_range;
    uc16 from = range.from();
    uc16 to = range.to();
    if (to == from) {
      macro_assembler->CheckCharacter(to, char_is_in_class);
    } else {
      // No lower-bound test is needed when the range starts at 0.
      if (from != 0) {
        macro_assembler->CheckCharacterLT(from, &next_range);
      }
      // to + 1 would not be a valid character when to is 0xffff; in that
      // case anything that reached here is in the range.
      if (to != 0xffff) {
        macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
      } else {
        macro_assembler->GoTo(char_is_in_class);
      }
    }
    macro_assembler->Bind(&next_range);
  }

  // The last range decides the outcome: a miss here is final, so the tests
  // are arranged to fall through on overall success and jump to
  // |on_failure| on overall failure.
  CharacterRange& range = ranges->at(range_count - 1);
  uc16 from = range.from();
  uc16 to = range.to();

  if (to == from) {
    if (cc->is_negated()) {
      macro_assembler->CheckCharacter(to, on_failure);
    } else {
      macro_assembler->CheckNotCharacter(to, on_failure);
    }
  } else {
    if (from != 0) {
      if (cc->is_negated()) {
        // Below the range means outside it, which is a match when negated.
        macro_assembler->CheckCharacterLT(from, &success);
      } else {
        macro_assembler->CheckCharacterLT(from, on_failure);
      }
    }
    if (to != 0xffff) {
      if (cc->is_negated()) {
        macro_assembler->CheckCharacterLT(to + 1, on_failure);
      } else {
        macro_assembler->CheckCharacterGT(to, on_failure);
      }
    } else {
      // The range extends to 0xffff, so the character is inside it.
      if (cc->is_negated()) {
        macro_assembler->GoTo(on_failure);
      }
    }
  }
  macro_assembler->Bind(&success);
}


// Emits the code for this text node: first the exact character tests of
// the atoms, then (when ignoring case) the case-independent letter tests,
// then the character classes, and finally the advance over the matched
// text followed by the success continuation.
bool TextNode::Emit(RegExpCompiler* compiler) {
  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
  Bind(macro_assembler);
  int element_count = elms_->length();
  ASSERT(element_count != 0);
  int cp_offset = 0;
  // A text node has at least one element (asserted above), so when the
  // node info says at_end only a backtrack is emitted.
  if (info()->at_end) {
    macro_assembler->Backtrack();
    return true;
  }
  // First, handle straight character matches.
for (int i = 0; i < element_count; i++) { TextElement elm = elms_->at(i); if (elm.type == TextElement::ATOM) { Vector quarks = elm.data.u_atom->data(); if (compiler->ignore_case()) { EmitAtomNonLetters(macro_assembler, elm, quarks, on_failure_->label(), cp_offset); } else { macro_assembler->CheckCharacters(quarks, cp_offset, on_failure_->label()); } cp_offset += quarks.length(); } else { ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); cp_offset++; } } // Second, handle case independent letter matches if any. if (compiler->ignore_case()) { cp_offset = 0; for (int i = 0; i < element_count; i++) { TextElement elm = elms_->at(i); if (elm.type == TextElement::ATOM) { Vector quarks = elm.data.u_atom->data(); EmitAtomLetters(macro_assembler, elm, quarks, on_failure_->label(), cp_offset); cp_offset += quarks.length(); } else { cp_offset++; } } } // If the fast character matches passed then do the character classes. cp_offset = 0; for (int i = 0; i < element_count; i++) { TextElement elm = elms_->at(i); if (elm.type == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.data.u_char_class; EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label()); cp_offset++; } else { cp_offset += elm.data.u_atom->data().length(); } } compiler->AddWork(on_failure_); macro_assembler->AdvanceCurrentPosition(cp_offset); return on_success()->GoTo(compiler); } void TextNode::MakeCaseIndependent() { int element_count = elms_->length(); for (int i = 0; i < element_count; i++) { TextElement elm = elms_->at(i); if (elm.type == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.data.u_char_class; ZoneList* ranges = cc->ranges(); int range_count = ranges->length(); for (int i = 0; i < range_count; i++) { ranges->at(i).AddCaseEquivalents(ranges); } } } } bool ChoiceNode::Emit(RegExpCompiler* compiler) { int choice_count = alternatives_->length(); RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); Bind(macro_assembler); // For now we just call all choices one after 
the other. The idea ultimately // is to use the Dispatch table to try only the relevant ones. for (int i = 0; i < choice_count - 1; i++) { GuardedAlternative alternative = alternatives_->at(i); Label after; Label after_no_pop_cp; ZoneList* guards = alternative.guards(); if (guards != NULL) { int guard_count = guards->length(); for (int j = 0; j < guard_count; j++) { GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp); } } macro_assembler->PushCurrentPosition(); macro_assembler->PushBacktrack(&after); if (!alternative.node()->GoTo(compiler)) { after.Unuse(); after_no_pop_cp.Unuse(); return false; } macro_assembler->Bind(&after); macro_assembler->PopCurrentPosition(); macro_assembler->Bind(&after_no_pop_cp); } GuardedAlternative alternative = alternatives_->at(choice_count - 1); ZoneList* guards = alternative.guards(); if (guards != NULL) { int guard_count = guards->length(); for (int j = 0; j < guard_count; j++) { GenerateGuard(macro_assembler, guards->at(j), on_failure_->label()); } } if (!on_failure_->IsBacktrack()) { ASSERT_NOT_NULL(on_failure_ -> label()); macro_assembler->PushBacktrack(on_failure_->label()); compiler->AddWork(on_failure_); } if (!alternative.node()->GoTo(compiler)) { return false; } return true; } bool ActionNode::Emit(RegExpCompiler* compiler) { RegExpMacroAssembler* macro = compiler->macro_assembler(); Bind(macro); switch (type_) { case STORE_REGISTER: macro->SetRegister(data_.u_store_register.reg, data_.u_store_register.value); break; case INCREMENT_REGISTER: { Label undo; macro->PushBacktrack(&undo); macro->AdvanceRegister(data_.u_increment_register.reg, 1); bool ok = on_success()->GoTo(compiler); if (!ok) { undo.Unuse(); return false; } macro->Bind(&undo); macro->AdvanceRegister(data_.u_increment_register.reg, -1); macro->Backtrack(); break; } case STORE_POSITION: { Label undo; macro->PushRegister(data_.u_position_register.reg); macro->PushBacktrack(&undo); macro->WriteCurrentPositionToRegister(data_.u_position_register.reg); 
bool ok = on_success()->GoTo(compiler);
      if (!ok) {
        undo.Unuse();
        return false;
      }
      // Reached by backtracking: restore the saved register value and keep
      // going back.
      macro->Bind(&undo);
      macro->PopRegister(data_.u_position_register.reg);
      macro->Backtrack();
      break;
    }
    case RESTORE_POSITION:
      macro->ReadCurrentPositionFromRegister(
          data_.u_position_register.reg);
      break;
    case BEGIN_SUBMATCH:
      // Record the position and the backtrack stack pointer so the matching
      // ESCAPE_SUBMATCH can restore them.
      macro->WriteCurrentPositionToRegister(
          data_.u_submatch.current_position_register);
      macro->WriteStackPointerToRegister(
          data_.u_submatch.stack_pointer_register);
      break;
    case ESCAPE_SUBMATCH:
      if (info()->at_end) {
        Label at_end;
        // Load current character jumps to the label if we are beyond the string
        // end.
        macro->LoadCurrentCharacter(0, &at_end);
        macro->Backtrack();
        macro->Bind(&at_end);
      }
      // A register value of -1 means the position is not restored.
      if (data_.u_submatch.current_position_register != -1) {
        macro->ReadCurrentPositionFromRegister(
            data_.u_submatch.current_position_register);
      }
      macro->ReadStackPointerFromRegister(
          data_.u_submatch.stack_pointer_register);
      break;
    default:
      UNREACHABLE();
      return false;
  }
  // NOTE(review): the INCREMENT_REGISTER and STORE_POSITION cases have
  // already emitted on_success() and end in Backtrack() before they break
  // to here, so for them this GoTo is issued a second time -- confirm that
  // is intended.
  return on_success()->GoTo(compiler);
}


// Emits the test for a back reference to the capture whose registers are
// start_reg_ and end_reg_ (asserted to be consecutive), followed by the
// success continuation.
bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
  RegExpMacroAssembler* macro = compiler->macro_assembler();
  Bind(macro);
  // Check whether the registers are uninitialized and always
  // succeed if they are.
  macro->IfRegisterLT(start_reg_, 0, on_success()->label());
  macro->IfRegisterLT(end_reg_, 0, on_success()->label());
  ASSERT_EQ(start_reg_ + 1, end_reg_);
  if (info()->at_end) {
    // If we are constrained to match at the end of the input then succeed
    // iff the back reference is empty.
macro->CheckNotRegistersEqual(start_reg_, end_reg_, on_failure_->label()); } else { if (compiler->ignore_case()) { macro->CheckNotBackReferenceIgnoreCase(start_reg_, on_failure_->label()); } else { macro->CheckNotBackReference(start_reg_, on_failure_->label()); } } return on_success()->GoTo(compiler); } // ------------------------------------------------------------------- // Dot/dotty output #ifdef DEBUG class DotPrinter: public NodeVisitor { public: explicit DotPrinter(bool ignore_case) : ignore_case_(ignore_case), stream_(&alloc_) { } void PrintNode(const char* label, RegExpNode* node); void Visit(RegExpNode* node); void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure); void PrintAttributes(RegExpNode* from); StringStream* stream() { return &stream_; } #define DECLARE_VISIT(Type) \ virtual void Visit##Type(Type##Node* that); FOR_EACH_NODE_TYPE(DECLARE_VISIT) #undef DECLARE_VISIT private: bool ignore_case_; HeapStringAllocator alloc_; StringStream stream_; std::set seen_; }; void DotPrinter::PrintNode(const char* label, RegExpNode* node) { stream()->Add("digraph G {\n graph [label=\""); for (int i = 0; label[i]; i++) { switch (label[i]) { case '\\': stream()->Add("\\\\"); break; case '"': stream()->Add("\""); break; default: stream()->Put(label[i]); break; } } stream()->Add("\"];\n"); Visit(node); stream()->Add("}\n"); printf("%s", *(stream()->ToCString())); } void DotPrinter::Visit(RegExpNode* node) { if (seen_.find(node) != seen_.end()) return; seen_.insert(node); node->Accept(this); } void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { if (on_failure->IsBacktrack()) return; stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); Visit(on_failure); } class TableEntryBodyPrinter { public: TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) : stream_(stream), choice_(choice) { } void Call(uc16 from, DispatchTable::Entry entry) { OutSet* out_set = entry.out_set(); for (unsigned i = 0; i < OutSet::kFirstLimit; 
i++) { if (out_set->Get(i)) { stream()->Add(" n%p:s%io%i -> n%p;\n", choice(), from, i, choice()->alternatives()->at(i).node()); } } } private: StringStream* stream() { return stream_; } ChoiceNode* choice() { return choice_; } StringStream* stream_; ChoiceNode* choice_; }; class TableEntryHeaderPrinter { public: explicit TableEntryHeaderPrinter(StringStream* stream) : first_(true), stream_(stream) { } void Call(uc16 from, DispatchTable::Entry entry) { if (first_) { first_ = false; } else { stream()->Add("|"); } stream()->Add("{\\%k-\\%k|{", from, entry.to()); OutSet* out_set = entry.out_set(); int priority = 0; for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { if (out_set->Get(i)) { if (priority > 0) stream()->Add("|"); stream()->Add(" %i", from, i, priority); priority++; } } stream()->Add("}}"); } private: bool first_; StringStream* stream() { return stream_; } StringStream* stream_; }; class AttributePrinter { public: explicit AttributePrinter(DotPrinter* out) : out_(out), first_(true) { } void PrintSeparator() { if (first_) { first_ = false; } else { out_->stream()->Add("|"); } } void PrintBit(const char* name, bool value) { if (!value) return; PrintSeparator(); out_->stream()->Add("{%s}", name); } void PrintPositive(const char* name, int value) { if (value < 0) return; PrintSeparator(); out_->stream()->Add("{%s|%x}", name, value); } private: DotPrinter* out_; bool first_; }; void DotPrinter::PrintAttributes(RegExpNode* that) { stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " "margin=0.1, fontsize=10, label=\"{", that); AttributePrinter printer(this); NodeInfo* info = that->info(); printer.PrintBit("NI", info->follows_newline_interest); printer.PrintBit("WI", info->follows_word_interest); printer.PrintBit("SI", info->follows_start_interest); printer.PrintBit("DN", info->determine_newline); printer.PrintBit("DW", info->determine_word); printer.PrintBit("DS", info->determine_start); printer.PrintBit("DDN", info->does_determine_newline); 
printer.PrintBit("DDW", info->does_determine_word); printer.PrintBit("DDS", info->does_determine_start); printer.PrintPositive("IW", info->is_word); printer.PrintPositive("IN", info->is_newline); printer.PrintPositive("FN", info->follows_newline); printer.PrintPositive("FW", info->follows_word); printer.PrintPositive("FS", info->follows_start); Label* label = that->label(); if (label->is_bound()) printer.PrintPositive("@", label->pos()); stream()->Add("}\"];\n"); stream()->Add(" a%p -> n%p [style=dashed, color=grey, " "arrowhead=none];\n", that, that); } static const bool kPrintDispatchTable = false; void DotPrinter::VisitChoice(ChoiceNode* that) { if (kPrintDispatchTable) { stream()->Add(" n%p [shape=Mrecord, label=\"", that); TableEntryHeaderPrinter header_printer(stream()); that->GetTable(ignore_case_)->ForEach(&header_printer); stream()->Add("\"]\n", that); PrintAttributes(that); TableEntryBodyPrinter body_printer(stream(), that); that->GetTable(ignore_case_)->ForEach(&body_printer); PrintOnFailure(that, that->on_failure()); } else { stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); for (int i = 0; i < that->alternatives()->length(); i++) { GuardedAlternative alt = that->alternatives()->at(i); stream()->Add(" n%p -> n%p;\n", that, alt.node()); } } for (int i = 0; i < that->alternatives()->length(); i++) { GuardedAlternative alt = that->alternatives()->at(i); alt.node()->Accept(this); } } void DotPrinter::VisitText(TextNode* that) { stream()->Add(" n%p [label=\"", that); for (int i = 0; i < that->elements()->length(); i++) { if (i > 0) stream()->Add(" "); TextElement elm = that->elements()->at(i); switch (elm.type) { case TextElement::ATOM: { stream()->Add("'%w'", elm.data.u_atom->data()); break; } case TextElement::CHAR_CLASS: { RegExpCharacterClass* node = elm.data.u_char_class; stream()->Add("["); if (node->is_negated()) stream()->Add("^"); for (int j = 0; j < node->ranges()->length(); j++) { CharacterRange range = node->ranges()->at(j); 
stream()->Add("%k-%k", range.from(), range.to());
        }
        stream()->Add("]");
        break;
      }
      default:
        UNREACHABLE();
    }
  }
  stream()->Add("\", shape=box, peripheries=2];\n");
  PrintAttributes(that);
  // Success edge, then the failure edge (if it is not plain backtracking).
  stream()->Add(" n%p -> n%p;\n", that, that->on_success());
  Visit(that->on_success());
  PrintOnFailure(that, that->on_failure());
}


// Prints a back reference node labelled with its start and end registers,
// then its attributes, success edge and failure edge.
void DotPrinter::VisitBackReference(BackReferenceNode* that) {
  stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
                that,
                that->start_register(),
                that->end_register());
  PrintAttributes(that);
  stream()->Add(" n%p -> n%p;\n", that, that->on_success());
  Visit(that->on_success());
  PrintOnFailure(that, that->on_failure());
}


// Prints an end node as a bold point.  End nodes have no outgoing edges.
void DotPrinter::VisitEnd(EndNode* that) {
  stream()->Add(" n%p [style=bold, shape=point];\n", that);
  PrintAttributes(that);
}


// Prints an action node with a label describing its side effect, then its
// attributes and success edge.  No failure edge is printed for actions.
void DotPrinter::VisitAction(ActionNode* that) {
  stream()->Add(" n%p [", that);
  switch (that->type_) {
    case ActionNode::STORE_REGISTER:
      stream()->Add("label=\"$%i:=%i\", shape=octagon",
                    that->data_.u_store_register.reg,
                    that->data_.u_store_register.value);
      break;
    case ActionNode::INCREMENT_REGISTER:
      stream()->Add("label=\"$%i++\", shape=octagon",
                    that->data_.u_increment_register.reg);
      break;
    case ActionNode::STORE_POSITION:
      stream()->Add("label=\"$%i:=$pos\", shape=octagon",
                    that->data_.u_position_register.reg);
      break;
    case ActionNode::RESTORE_POSITION:
      stream()->Add("label=\"$pos:=$%i\", shape=octagon",
                    that->data_.u_position_register.reg);
      break;
    case ActionNode::BEGIN_SUBMATCH:
      stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
                    that->data_.u_submatch.current_position_register);
      break;
    case ActionNode::ESCAPE_SUBMATCH:
      stream()->Add("label=\"escape\", shape=septagon");
      break;
  }
  stream()->Add("];\n");
  PrintAttributes(that);
  stream()->Add(" n%p -> n%p;\n", that, that->on_success());
  Visit(that->on_success());
}


// Dispatch-table callback that writes one text line per table entry to a
// StringStream; used by DispatchTable::Dump.
class DispatchTableDumper {
 public:
  explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
  void Call(uc16 key, DispatchTable::Entry entry);
  StringStream* stream() { return
stream_; }
 private:
  StringStream* stream_;
};


// Writes "[from-to]: {i, j, ...}" for one entry, listing the members of
// the entry's out set that are below OutSet::kFirstLimit.
void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
  stream()->Add("[%k-%k]: {", key, entry.to());
  OutSet* set = entry.out_set();
  bool first = true;
  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
    if (set->Get(i)) {
      // Comma-separate the members; no comma before the first one.
      if (first) {
        first = false;
      } else {
        stream()->Add(", ");
      }
      stream()->Add("%i", i);
    }
  }
  stream()->Add("}\n");
}


// Prints every entry of this table through OS::PrintError.
void DispatchTable::Dump() {
  HeapStringAllocator alloc;
  StringStream stream(&alloc);
  DispatchTableDumper dumper(&stream);
  tree()->ForEach(&dumper);
  OS::PrintError("%s", *stream.ToCString());
}


// Prints the node graph rooted at |node| in dot syntax, titled |label|.
void RegExpEngine::DotPrint(const char* label,
                            RegExpNode* node,
                            bool ignore_case) {
  DotPrinter printer(ignore_case);
  printer.PrintNode(label, node);
}


#endif  // DEBUG


// -------------------------------------------------------------------
// Tree to graph conversion


// An atom becomes a text node holding that single atom element.
RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success,
                               RegExpNode* on_failure) {
  ZoneList* elms = new ZoneList(1);
  elms->Add(TextElement::Atom(this));
  return new TextNode(elms, on_success, on_failure);
}


// A text tree becomes a text node over its existing element list.
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success,
                               RegExpNode* on_failure) {
  return new TextNode(elements(), on_success, on_failure);
}


// A character class becomes a text node holding that single class element.
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
                                         RegExpNode* on_success,
                                         RegExpNode* on_failure) {
  ZoneList* elms = new ZoneList(1);
  elms->Add(TextElement::CharClass(this));
  return new TextNode(elms, on_success, on_failure);
}


// A disjunction becomes a choice node with one unguarded alternative per
// sub-tree, added in source order.  Every alternative gets the same
// success and failure continuations.
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success,
                                      RegExpNode* on_failure) {
  ZoneList* alternatives = this->alternatives();
  int length = alternatives->length();
  ChoiceNode* result = new ChoiceNode(length, on_failure);
  for (int i = 0; i < length; i++) {
    GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
                                                               on_success,
                                                               on_failure));
    result->AddAlternative(alternative);
  }
  return result;
}


RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler*
compiler,
                                     RegExpNode* on_success,
                                     RegExpNode* on_failure) {
  // Forward this quantifier's own bounds, greediness and body to the
  // static builder below.
  return ToNode(min(),
                max(),
                is_greedy(),
                body(),
                compiler,
                on_success,
                on_failure);
}


// Builds the node graph for body{min,max}.  A counter register is
// allocated only when there is a lower bound (min > 0) or an upper bound
// (max < kInfinity); without one the loop is a bare two-way choice.
RegExpNode* RegExpQuantifier::ToNode(int min,
                                     int max,
                                     bool is_greedy,
                                     RegExpTree* body,
                                     RegExpCompiler* compiler,
                                     RegExpNode* on_success,
                                     RegExpNode* on_failure) {
  // x{f, t} becomes this:
  //
  //             (r++)<-.
  //               |     `
  //               |     (x)
  //               v     ^
  //      (r=0)-->(?)---/ [if r < t]
  //               |
  //   [if r >= f] \----> ...
  //
  //
  // TODO(someone): clear captures on repetition and handle empty
  //   matches.
  bool has_min = min > 0;
  bool has_max = max < RegExpQuantifier::kInfinity;
  bool needs_counter = has_min || has_max;
  int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
  ChoiceNode* center = new ChoiceNode(2, on_failure);
  // Where the body continues after a match: back to the choice, through
  // the counter increment when a counter is in use.
  RegExpNode* loop_return = needs_counter
      ? static_cast(ActionNode::IncrementRegister(reg_ctr, center))
      : static_cast(center);
  RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);
  GuardedAlternative body_alt(body_node);
  if (has_max) {
    // Another iteration is allowed only while the counter is below max.
    Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
    body_alt.AddGuard(body_guard);
  }
  GuardedAlternative rest_alt(on_success);
  if (has_min) {
    // Leaving the loop is allowed only once the counter has reached min.
    Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
    rest_alt.AddGuard(rest_guard);
  }
  // The alternative added first is tried first: the body for a greedy
  // quantifier, the continuation for a non-greedy one.
  if (is_greedy) {
    center->AddAlternative(body_alt);
    center->AddAlternative(rest_alt);
  } else {
    center->AddAlternative(rest_alt);
    center->AddAlternative(body_alt);
  }
  if (needs_counter) {
    // Enter the loop through an action that zeroes the counter.
    return ActionNode::StoreRegister(reg_ctr, 0, center);
  } else {
    return center;
  }
}


// An assertion creates no node of its own.  It records what it requires in
// a NodeInfo and propagates that forward from |on_success|, returning the
// node PropagateForward yields.  |compiler| and |on_failure| are not used.
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
                                    RegExpNode* on_success,
                                    RegExpNode* on_failure) {
  NodeInfo info;
  switch (type()) {
    case START_OF_LINE:
      info.follows_newline_interest = true;
      break;
    case START_OF_INPUT:
      info.follows_start_interest = true;
      break;
    case BOUNDARY: case NON_BOUNDARY:
      info.follows_word_interest = true;
      break;
    case END_OF_INPUT:
      info.at_end = true;
      break;
    case END_OF_LINE:
      // This is wrong but has the effect of making the compiler abort.
info.at_end = true;
  }
  return on_success->PropagateForward(&info);
}


// A back reference becomes a BackReferenceNode over the start and end
// registers of the capture with this index.
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
                                        RegExpNode* on_success,
                                        RegExpNode* on_failure) {
  return new BackReferenceNode(RegExpCapture::StartRegister(index()),
                               RegExpCapture::EndRegister(index()),
                               on_success,
                               on_failure);
}


// The empty regexp adds no node; matching continues at |on_success|.
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
                                RegExpNode* on_success,
                                RegExpNode* on_failure) {
  return on_success;
}


// Builds the graph for a lookahead.  Two registers are allocated, one for
// the backtrack stack pointer and one for the input position, and the body
// is wrapped in a BeginSubmatch ... EscapeSubmatch pair as sketched in the
// comments of each branch.
RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
                                    RegExpNode* on_success,
                                    RegExpNode* on_failure) {
  int stack_pointer_register = compiler->AllocateRegister();
  int position_register = compiler->AllocateRegister();
  if (is_positive()) {
    // begin submatch scope
    // $reg = $pos
    // if [body]
    // then
    //   $pos = $reg
    //   escape submatch scope (drop all backtracks created in scope)
    //   succeed
    // else
    //   end submatch scope (nothing to clean up, just exit the scope)
    //   fail
    return ActionNode::BeginSubmatch(
        stack_pointer_register,
        position_register,
        body()->ToNode(
            compiler,
            ActionNode::EscapeSubmatch(
                stack_pointer_register,
                true,  // Also restore input position.
                position_register,
                on_success),
            on_failure));
  } else {
    // begin submatch scope
    // try
    // first if (body)
    //       then
    //         escape submatch scope
    //         fail
    //       else
    //         backtrack
    // second
    //       end submatch scope
    //       restore current position
    //       succeed
    ChoiceNode* try_node =
        new ChoiceNode(1, ActionNode::RestorePosition(position_register,
                                                      on_success));
    RegExpNode* body_node = body()->ToNode(
        compiler,
        ActionNode::EscapeSubmatch(stack_pointer_register,
                                   false,  // Don't also restore position
                                   0,  // Unused arguments.
on_failure),
        compiler->backtrack());
    GuardedAlternative body_alt(body_node);
    try_node->AddAlternative(body_alt);
    return ActionNode::BeginSubmatch(stack_pointer_register,
                                     position_register,
                                     try_node);
  }
}


// Builds the capture graph for this node's own body and index.
RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
                                  RegExpNode* on_success,
                                  RegExpNode* on_failure) {
  return ToNode(body(), index(), compiler, on_success, on_failure);
}


// Wraps |body| between two StorePosition actions: the capture's start
// register is written before the body and its end register after it.
RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
                                  int index,
                                  RegExpCompiler* compiler,
                                  RegExpNode* on_success,
                                  RegExpNode* on_failure) {
  int start_reg = RegExpCapture::StartRegister(index);
  int end_reg = RegExpCapture::EndRegister(index);
  // Built back to front, because each node needs its successor.
  RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
  RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);
  return ActionNode::StorePosition(start_reg, body_node);
}


// A sequence is converted from its last child to its first, so that each
// child's success continuation is the node built for the child after it.
RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success,
                                      RegExpNode* on_failure) {
  ZoneList* children = nodes();
  RegExpNode* current = on_success;
  for (int i = children->length() - 1; i >= 0; i--) {
    current = children->at(i)->ToNode(compiler, current, on_failure);
  }
  return current;
}


// Character tables for the class escapes handled by
// CharacterRange::AddClassEscape.  Each table is a flat list of inclusive
// (from, to) pairs in increasing order; the count is the number of array
// elements, i.e. twice the number of ranges.
static const int kSpaceRangeCount = 20;
static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
    0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
    0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };

static const int kWordRangeCount = 8;
static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
    '_', 'a', 'z' };

static const int kDigitRangeCount = 2;
static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };

static const int kLineTerminatorRangeCount = 6;
static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
    0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };


// Appends each (from, to) pair of the table |elmv| (|elmc| elements) to
// |ranges| as a CharacterRange.
static void AddClass(const uc16* elmv,
                     int elmc,
                     ZoneList* ranges) {
  for (int i = 0; i < elmc; i += 2) {
    ASSERT(elmv[i] <= elmv[i + 1]);
    ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
  }
}


static
void AddClassNegated(const uc16 *elmv,
                     int elmc,
                     ZoneList* ranges) {
  // Appends the complement of the table |elmv| (|elmc| elements, laid out
  // as inclusive (from, to) pairs) to |ranges|: one range for each gap
  // before a pair plus a final range up to 0xFFFF.  The asserts require
  // that the table neither starts at 0x0000 nor ends at 0xFFFF, so the
  // "- 1" and "+ 1" below cannot wrap at those two ends.
  ASSERT(elmv[0] != 0x0000);
  ASSERT(elmv[elmc-1] != 0xFFFF);
  // First character not yet covered by an emitted or skipped range.
  uc16 last = 0x0000;
  for (int i = 0; i < elmc; i += 2) {
    ASSERT(last <= elmv[i] - 1);
    ASSERT(elmv[i] <= elmv[i + 1]);
    ranges->Add(CharacterRange(last, elmv[i] - 1));
    last = elmv[i + 1] + 1;
  }
  ranges->Add(CharacterRange(last, 0xFFFF));
}


// Appends to |ranges| the character ranges denoted by the class escape
// |type|.  A lower-case letter selects a table, the matching upper-case
// letter its complement; '.' is everything except the line terminators.
void CharacterRange::AddClassEscape(uc16 type,
                                    ZoneList* ranges) {
  switch (type) {
    case 's':
      AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
      break;
    case 'S':
      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
      break;
    case 'w':
      AddClass(kWordRanges, kWordRangeCount, ranges);
      break;
    case 'W':
      AddClassNegated(kWordRanges, kWordRangeCount, ranges);
      break;
    case 'd':
      AddClass(kDigitRanges, kDigitRangeCount, ranges);
      break;
    case 'D':
      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
      break;
    case '.':
      AddClassNegated(kLineTerminatorRanges,
                      kLineTerminatorRangeCount,
                      ranges);
      break;
    // This is not a character range as defined by the spec but a
    // convenient shorthand for a character class that matches any
    // character.
    case '*':
      ranges->Add(CharacterRange::Everything());
      break;
    default:
      UNREACHABLE();
  }
}


// Returns the word-character table (kWordRanges) as a vector of
// (from, to) pairs.
Vector CharacterRange::GetWordBounds() {
  return Vector(kWordRanges, kWordRangeCount);
}


// Dispatch-table callback used by CharacterRange::Split.  Entries that
// belong to the base set are appended to *included when they also belong
// to the overlay and to *excluded otherwise.
class CharacterRangeSplitter {
 public:
  CharacterRangeSplitter(ZoneList** included,
                          ZoneList** excluded)
      : included_(included),
        excluded_(excluded) { }
  void Call(uc16 from, DispatchTable::Entry entry);

  // Out-set values used to tag the two inputs in the dispatch table.
  static const int kInBase = 0;
  static const int kInOverlay = 1;

 private:
  ZoneList** included_;
  ZoneList** excluded_;
};


// Routes one table entry to the included or the excluded list.  Entries
// that are not part of the base set are ignored.
void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
  if (!entry.out_set()->Get(kInBase)) return;
  ZoneList** target = entry.out_set()->Get(kInOverlay)
    ?
included_
    : excluded_;
  // The output lists are created lazily, on the first range they receive.
  if (*target == NULL) *target = new ZoneList(2);
  (*target)->Add(CharacterRange(entry.from(), entry.to()));
}


// Splits the ranges in |base| into the parts that overlap |overlay| (a flat
// list of (from, to) pairs) and the parts that do not.  Both inputs are
// entered into one dispatch table, tagged kInBase / kInOverlay, and the
// resulting entries are sorted into *included and *excluded.  Both outputs
// must be NULL on entry; each stays NULL if it receives no range.
void CharacterRange::Split(ZoneList* base,
                           Vector overlay,
                           ZoneList** included,
                           ZoneList** excluded) {
  ASSERT_EQ(NULL, *included);
  ASSERT_EQ(NULL, *excluded);
  DispatchTable table;
  for (int i = 0; i < base->length(); i++)
    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
  for (int i = 0; i < overlay.length(); i += 2) {
    table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
                   CharacterRangeSplitter::kInOverlay);
  }
  CharacterRangeSplitter callback(included, excluded);
  table.ForEach(&callback);
}


// Appends to |ranges| the case equivalents of this range that the range
// does not already contain.  Ranges with an end point above
// kRangeCanonicalizeMax are currently left unexpanded (see the TODO in the
// final branch).
void CharacterRange::AddCaseEquivalents(ZoneList* ranges) {
  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
  if (IsSingleton()) {
    // If this is a singleton we just expand the one character.
    int length = uncanonicalize.get(from(), '\0', chars);
    for (int i = 0; i < length; i++) {
      uc32 chr = chars[i];
      if (chr != from()) {
        ranges->Add(CharacterRange::Singleton(chars[i]));
      }
    }
  } else if (from() <= kRangeCanonicalizeMax &&
             to() <= kRangeCanonicalizeMax) {
    // If this is a range we expand the characters block by block,
    // expanding contiguous subranges (blocks) one at a time.
    // The approach is as follows.  For a given start character we
    // look up the block that contains it, for instance 'a' if the
    // start character is 'c'.  A block is characterized by the property
    // that all characters uncanonicalize in the same way as the first
    // element, except that each entry in the result is incremented
    // by the distance from the first element.  So a-z is a block
    // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
    // uncanonicalizes to ['a' + k, 'A' + k].
    // Once we've found the start point we look up its uncanonicalization
    // and produce a range for each element.  For instance for [c-f]
    // we look up ['a', 'A'] and produce [c-f] and [C-F].  We then only
    // add a range if it is not already contained in the input, so [c-f]
    // will be skipped but [C-F] will be added.  If this range is not
    // completely contained in a block we do this for all the blocks
    // covered by the range.
    unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    // First, look up the block that contains the 'from' character.
    int length = canonrange.get(from(), '\0', range);
    if (length == 0) {
      range[0] = from();
    } else {
      ASSERT_EQ(1, length);
    }
    int pos = from();
    // The start of the current block.  Note that except for the first
    // iteration 'start' is always equal to 'pos'.
    int start;
    // If it is not the start point of a block the entry contains the
    // offset of the character from the start point.
    if ((range[0] & kStartMarker) == 0) {
      start = pos - range[0];
    } else {
      start = pos;
    }
    // Then we add the ranges on at a time, incrementing the current
    // position to be after the last block each time.  The position
    // always points to the start of a block.
    // NOTE(review): the loop stops as soon as pos == to(), so a block that
    // begins exactly at to() is not expanded -- confirm whether the bound
    // should be inclusive.
    while (pos < to()) {
      length = canonrange.get(start, '\0', range);
      if (length == 0) {
        range[0] = start;
      } else {
        ASSERT_EQ(1, length);
      }
      ASSERT((range[0] & kStartMarker) != 0);
      // The start point of a block contains the distance to the end
      // of the range.
      int block_end = start + (range[0] & kPayloadMask) - 1;
      // Clip the block to the end of this range.
      int end = (block_end > to()) ? to() : block_end;
      length = uncanonicalize.get(start, '\0', range);
      for (int i = 0; i < length; i++) {
        uc32 c = range[i];
        // Shift each variant of the block's first character by the
        // distance of pos / end from the block start.
        uc16 range_from = c + (pos - start);
        uc16 range_to = c + (end - start);
        if (!(from() <= range_from && range_to <= to())) {
          ranges->Add(CharacterRange(range_from, range_to));
        }
      }
      start = pos = block_end + 1;
    }
  } else {
    // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
}
}


// -------------------------------------------------------------------
// Interest propagation


// Returns the sibling of this node whose info matches |info|, or NULL if
// there is none.
RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
  for (int i = 0; i < siblings_.length(); i++) {
    RegExpNode* sibling = siblings_.Get(i);
    if (sibling->info()->Matches(info)) return sibling;
  }
  return NULL;
}


// Returns a sibling of this node whose info matches |info|, cloning this
// node if no such sibling exists yet.  *cloned must be false on entry and
// is set to true only when a new clone was created, in which case the
// caller still has to fix up the clone's successors.
RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
  ASSERT_EQ(false, *cloned);
  ASSERT(!info->HasAssertions());
  siblings_.Ensure(this);
  RegExpNode* result = TryGetSibling(info);
  if (result != NULL) return result;
  result = this->Clone();
  // The clone starts with a fresh compilation state plus the propagated
  // info, and is registered as a sibling so later lookups find it.
  NodeInfo* new_info = result->info();
  new_info->ResetCompilationState();
  new_info->AddFromPreceding(info);
  AddSibling(result);
  *cloned = true;
  return result;
}


// Shared PropagateForward implementation for node types that do not pass
// the info on to a successor: combine the node's info with |info| and
// return the matching sibling.
// NOTE(review): "template" appears here without a parameter list although
// the signature uses C; the list looks lost in extraction -- confirm
// against the original file.
template
static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
  NodeInfo full_info(*node->info());
  full_info.AddFromPreceding(info);
  bool cloned = false;
  return RegExpNode::EnsureSibling(node, &full_info, &cloned);
}


// Returns the sibling of this action that carries |info|.  A freshly
// cloned sibling also gets |info| propagated to its success node, except
// for ESCAPE_SUBMATCH actions, where propagation stops.
RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
  NodeInfo full_info(*this->info());
  full_info.AddFromPreceding(info);
  bool cloned = false;
  ActionNode* action = EnsureSibling(this, &full_info, &cloned);
  if (cloned && type_ != ESCAPE_SUBMATCH) {
    action->set_on_success(action->on_success()->PropagateForward(info));
  }
  return action;
}


// Returns the sibling of this choice that carries |info|.  A freshly
// cloned sibling gets its own alternatives list in which every alternative
// node (and the failure node, unless it is the backtrack node) has had
// |info| propagated to it.
RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
  NodeInfo full_info(*this->info());
  full_info.AddFromPreceding(info);
  bool cloned = false;
  ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
  if (cloned) {
    ZoneList* old_alternatives = alternatives();
    int count = old_alternatives->length();
    choice->alternatives_ = new ZoneList(count);
    for (int i = 0; i < count; i++) {
      // The copy keeps the guards; only the target node is replaced.
      GuardedAlternative alternative = old_alternatives->at(i);
      alternative.set_node(alternative.node()->PropagateForward(info));
      choice->alternatives()->Add(alternative);
    }
    if (!choice->on_failure_->IsBacktrack()) {
      choice->on_failure_ = choice->on_failure_->PropagateForward(info);
    }
  }
  return choice;
}


RegExpNode*
EndNode::PropagateForward(NodeInfo* info) {
  // End nodes do not pass the info on to any successor.
  return PropagateToEndpoint(this, info);
}


// Returns the sibling of this back reference that carries |info|.  A
// freshly cloned sibling also gets |info| propagated to its success node.
RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
  NodeInfo full_info(*this->info());
  full_info.AddFromPreceding(info);
  bool cloned = false;
  BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
  if (cloned) {
    // TODO(erikcorry): A back reference has to have two successors (by default
    // the same node).  The first is used if the back reference matches a non-
    // empty back reference, the second if it matches an empty one.  This
    // doesn't matter for at_end, which is the only one implemented right now,
    // but it will matter for other pieces of info.
    back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
  }
  return back_ref;
}


// Text nodes do not pass the info on to their successors.
RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
  return PropagateToEndpoint(this, info);
}


// -------------------------------------------------------------------
// Splay tree


// Returns an out set that contains |value| in addition to this set's
// members.  If |value| is already a member this set itself is returned;
// otherwise a previously created successor containing |value| is reused,
// or a new successor is created and remembered.
OutSet* OutSet::Extend(unsigned value) {
  if (Get(value))
    return this;
  if (successors() != NULL) {
    for (int i = 0; i < successors()->length(); i++) {
      OutSet* successor = successors()->at(i);
      if (successor->Get(value))
        return successor;
    }
  } else {
    successors_ = new ZoneList(2);
  }
  // NOTE(review): the new set is constructed from this set's remaining_
  // pointer; if the constructor stores that pointer rather than copying
  // the list, Set() below would also add a value >= kFirstLimit to this
  // set -- confirm against the OutSet constructor.
  OutSet* result = new OutSet(first_, remaining_);
  result->Set(value);
  successors()->Add(result);
  return result;
}


// Adds |value| to this set.  Values below kFirstLimit are kept as bits of
// first_; larger values go into the remaining_ list, which is created on
// demand and holds each value at most once.
void OutSet::Set(unsigned value) {
  if (value < kFirstLimit) {
    first_ |= (1 << value);
  } else {
    if (remaining_ == NULL)
      remaining_ = new ZoneList(1);
    if (remaining_->is_empty() || !remaining_->Contains(value))
      remaining_->Add(value);
  }
}


// Returns whether |value| is a member of this set.
bool OutSet::Get(unsigned value) {
  if (value < kFirstLimit) {
    return (first_ & (1 << value)) != 0;
  } else if (remaining_ == NULL) {
    return false;
  } else {
    return remaining_->Contains(value);
  }
}


// Key and value definitions for the splay tree configuration used by the
// dispatch table.
const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
const DispatchTable::Entry DispatchTable::Config::kNoValue;


// Records that every character in |full_range| has |value| in its out set.
// The table is kept as a set of non-overlapping intervals keyed by their
// start point; existing intervals that partially overlap the new range are
// split so that |value| is added to exactly the overlapping part.
void DispatchTable::AddRange(CharacterRange full_range, int value) {
  CharacterRange current =
full_range; if (tree()->is_empty()) { // If this is the first range we just insert into the table. ZoneSplayTree::Locator loc; ASSERT_RESULT(tree()->Insert(current.from(), &loc)); loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); return; } // First see if there is a range to the left of this one that // overlaps. ZoneSplayTree::Locator loc; if (tree()->FindGreatestLessThan(current.from(), &loc)) { Entry* entry = &loc.value(); // If we've found a range that overlaps with this one, and it // starts strictly to the left of this one, we have to fix it // because the following code only handles ranges that start on // or after the start point of the range we're adding. if (entry->from() < current.from() && entry->to() >= current.from()) { // Snap the overlapping range in half around the start point of // the range we're adding. CharacterRange left(entry->from(), current.from() - 1); CharacterRange right(current.from(), entry->to()); // The left part of the overlapping range doesn't overlap. // Truncate the whole entry to be just the left part. entry->set_to(left.to()); // The right part is the one that overlaps. We add this part // to the map and let the next step deal with merging it with // the range we're adding. ZoneSplayTree::Locator loc; ASSERT_RESULT(tree()->Insert(right.from(), &loc)); loc.set_value(Entry(right.from(), right.to(), entry->out_set())); } } while (current.is_valid()) { if (tree()->FindLeastGreaterThan(current.from(), &loc) && (loc.value().from() <= current.to()) && (loc.value().to() >= current.from())) { Entry* entry = &loc.value(); // We have overlap. If there is space between the start point of // the range we're adding and where the overlapping range starts // then we have to add a range covering just that space. 
if (current.from() < entry->from()) { ZoneSplayTree::Locator ins; ASSERT_RESULT(tree()->Insert(current.from(), &ins)); ins.set_value(Entry(current.from(), entry->from() - 1, empty()->Extend(value))); current.set_from(entry->from()); } ASSERT_EQ(current.from(), entry->from()); // If the overlapping range extends beyond the one we want to add // we have to snap the right part off and add it separately. if (entry->to() > current.to()) { ZoneSplayTree::Locator ins; ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set())); entry->set_to(current.to()); } ASSERT(entry->to() <= current.to()); // The overlapping range is now completely contained by the range // we're adding so we can just update it and move the start point // of the range we're adding just past it. entry->AddValue(value); // Bail out if the last interval ended at 0xFFFF since otherwise // adding 1 will wrap around to 0. if (entry->to() == 0xFFFF) break; ASSERT(entry->to() + 1 > current.from()); current.set_from(entry->to() + 1); } else { // There is no overlap so we can just add the range ZoneSplayTree::Locator ins; ASSERT_RESULT(tree()->Insert(current.from(), &ins)); ins.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); break; } } } OutSet* DispatchTable::Get(uc16 value) { ZoneSplayTree::Locator loc; if (!tree()->FindGreatestLessThan(value, &loc)) return empty(); Entry* entry = &loc.value(); if (value <= entry->to()) return entry->out_set(); else return empty(); } // ------------------------------------------------------------------- // Analysis void Analysis::EnsureAnalyzed(RegExpNode* that) { if (that->info()->been_analyzed || that->info()->being_analyzed) return; that->info()->being_analyzed = true; that->Accept(this); that->info()->being_analyzed = false; that->info()->been_analyzed = true; } void Analysis::VisitEnd(EndNode* that) { // nothing to do } void Analysis::VisitText(TextNode* that) { if (ignore_case_) { 
that->MakeCaseIndependent(); } EnsureAnalyzed(that->on_success()); EnsureAnalyzed(that->on_failure()); NodeInfo* info = that->info(); NodeInfo* next_info = that->on_success()->info(); // If the following node is interested in what it follows then this // node must determine it. info->determine_newline = next_info->follows_newline_interest; info->determine_word = next_info->follows_word_interest; info->determine_start = next_info->follows_start_interest; } void Analysis::VisitAction(ActionNode* that) { EnsureAnalyzed(that->on_success()); // If the next node is interested in what it follows then this node // has to be interested too so it can pass the information on. that->info()->AddFromFollowing(that->on_success()->info()); } void Analysis::VisitChoice(ChoiceNode* that) { NodeInfo* info = that->info(); for (int i = 0; i < that->alternatives()->length(); i++) { RegExpNode* node = that->alternatives()->at(i).node(); EnsureAnalyzed(node); // Anything the following nodes need to know has to be known by // this node also, so it can pass it on. info->AddFromFollowing(node->info()); } EnsureAnalyzed(that->on_failure()); } void Analysis::VisitBackReference(BackReferenceNode* that) { EnsureAnalyzed(that->on_success()); EnsureAnalyzed(that->on_failure()); } // ------------------------------------------------------------------- // Assumption expansion RegExpNode* RegExpNode::EnsureExpanded(NodeInfo* info) { siblings_.Ensure(this); NodeInfo new_info = *this->info(); if (new_info.follows_word_interest) new_info.follows_word = info->follows_word; if (new_info.follows_newline_interest) new_info.follows_newline = info->follows_newline; // If the following node should determine something we need to get // a sibling that determines it. 
new_info.does_determine_newline = new_info.determine_newline; new_info.does_determine_word = new_info.determine_word; new_info.does_determine_start = new_info.determine_start; RegExpNode* sibling = TryGetSibling(&new_info); if (sibling == NULL) { sibling = ExpandLocal(&new_info); siblings_.Add(sibling); sibling->info()->being_expanded = true; sibling->ExpandChildren(); sibling->info()->being_expanded = false; sibling->info()->been_expanded = true; } else { NodeInfo* sib_info = sibling->info(); if (!sib_info->been_expanded && !sib_info->being_expanded) { sibling->info()->being_expanded = true; sibling->ExpandChildren(); sibling->info()->being_expanded = false; sibling->info()->been_expanded = true; } } return sibling; } RegExpNode* ChoiceNode::ExpandLocal(NodeInfo* info) { ChoiceNode* clone = this->Clone(); clone->info()->ResetCompilationState(); clone->info()->AddAssumptions(info); return clone; } void ChoiceNode::ExpandChildren() { ZoneList* alts = alternatives(); ZoneList* new_alts = new ZoneList(alts->length()); for (int i = 0; i < alts->length(); i++) { GuardedAlternative next = alts->at(i); next.set_node(next.node()->EnsureExpanded(info())); new_alts->Add(next); } alternatives_ = new_alts; } RegExpNode* TextNode::ExpandLocal(NodeInfo* info) { TextElement last = elements()->last(); if (last.type == TextElement::CHAR_CLASS) { RegExpCharacterClass* char_class = last.data.u_char_class; if (info->does_determine_word) { ZoneList* word = NULL; ZoneList* non_word = NULL; CharacterRange::Split(char_class->ranges(), CharacterRange::GetWordBounds(), &word, &non_word); if (non_word == NULL) { // This node contains no non-word characters so it must be // all word. this->info()->is_word = NodeInfo::TRUE; } else if (word == NULL) { // Vice versa. this->info()->is_word = NodeInfo::FALSE; } else { // If this character class contains both word and non-word // characters we need to split it into two. 
ChoiceNode* result = new ChoiceNode(2, on_failure()); // Welcome to the family, son! result->set_siblings(this->siblings()); *result->info() = *this->info(); result->info()->ResetCompilationState(); result->info()->AddAssumptions(info); RegExpNode* word_node = new TextNode(new RegExpCharacterClass(word, false), on_success(), on_failure()); word_node->info()->determine_word = true; word_node->info()->does_determine_word = true; word_node->info()->is_word = NodeInfo::TRUE; result->alternatives()->Add(GuardedAlternative(word_node)); RegExpNode* non_word_node = new TextNode(new RegExpCharacterClass(non_word, false), on_success(), on_failure()); non_word_node->info()->determine_word = true; non_word_node->info()->does_determine_word = true; non_word_node->info()->is_word = NodeInfo::FALSE; result->alternatives()->Add(GuardedAlternative(non_word_node)); return result; } } } TextNode* clone = this->Clone(); clone->info()->ResetCompilationState(); clone->info()->AddAssumptions(info); return clone; } void TextNode::ExpandAtomChildren(RegExpAtom* that) { NodeInfo new_info = *info(); uc16 last = that->data()[that->data().length() - 1]; if (info()->determine_word) { new_info.follows_word = IsRegExpWord(last) ? NodeInfo::TRUE : NodeInfo::FALSE; } else { new_info.follows_word = NodeInfo::UNKNOWN; } if (info()->determine_newline) { new_info.follows_newline = IsRegExpNewline(last) ? 
NodeInfo::TRUE : NodeInfo::FALSE; } else { new_info.follows_newline = NodeInfo::UNKNOWN; } if (info()->determine_start) { new_info.follows_start = NodeInfo::FALSE; } else { new_info.follows_start = NodeInfo::UNKNOWN; } set_on_success(on_success()->EnsureExpanded(&new_info)); } void TextNode::ExpandCharClassChildren(RegExpCharacterClass* that) { if (info()->does_determine_word) { // ASSERT(info()->is_word != NodeInfo::UNKNOWN); NodeInfo next_info = *on_success()->info(); next_info.follows_word = info()->is_word; set_on_success(on_success()->EnsureExpanded(&next_info)); } else { set_on_success(on_success()->EnsureExpanded(info())); } } void TextNode::ExpandChildren() { TextElement last = elements()->last(); switch (last.type) { case TextElement::ATOM: ExpandAtomChildren(last.data.u_atom); break; case TextElement::CHAR_CLASS: ExpandCharClassChildren(last.data.u_char_class); break; default: UNREACHABLE(); } } RegExpNode* ActionNode::ExpandLocal(NodeInfo* info) { ActionNode* clone = this->Clone(); clone->info()->ResetCompilationState(); clone->info()->AddAssumptions(info); return clone; } void ActionNode::ExpandChildren() { set_on_success(on_success()->EnsureExpanded(info())); } RegExpNode* BackReferenceNode::ExpandLocal(NodeInfo* info) { BackReferenceNode* clone = this->Clone(); clone->info()->ResetCompilationState(); clone->info()->AddAssumptions(info); return clone; } void BackReferenceNode::ExpandChildren() { set_on_success(on_success()->EnsureExpanded(info())); } RegExpNode* EndNode::ExpandLocal(NodeInfo* info) { EndNode* clone = this->Clone(); clone->info()->ResetCompilationState(); clone->info()->AddAssumptions(info); return clone; } void EndNode::ExpandChildren() { // nothing to do } // ------------------------------------------------------------------- // Dispatch table construction void DispatchTableConstructor::VisitEnd(EndNode* that) { AddRange(CharacterRange::Everything()); } void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 
node->set_being_calculated(true); ZoneList* alternatives = node->alternatives(); for (int i = 0; i < alternatives->length(); i++) { set_choice_index(i); alternatives->at(i).node()->Accept(this); } node->set_being_calculated(false); } class AddDispatchRange { public: explicit AddDispatchRange(DispatchTableConstructor* constructor) : constructor_(constructor) { } void Call(uc32 from, DispatchTable::Entry entry); private: DispatchTableConstructor* constructor_; }; void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { CharacterRange range(from, entry.to()); constructor_->AddRange(range); } void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { if (node->being_calculated()) return; DispatchTable* table = node->GetTable(ignore_case_); AddDispatchRange adder(this); table->ForEach(&adder); } void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { // TODO(160): Find the node that we refer back to and propagate its start // set back to here. For now we just accept anything. 
AddRange(CharacterRange::Everything()); } static int CompareRangeByFrom(const CharacterRange* a, const CharacterRange* b) { return Compare(a->from(), b->from()); } void DispatchTableConstructor::AddInverse(ZoneList* ranges) { ranges->Sort(CompareRangeByFrom); uc16 last = 0; for (int i = 0; i < ranges->length(); i++) { CharacterRange range = ranges->at(i); if (last < range.from()) AddRange(CharacterRange(last, range.from() - 1)); if (range.to() >= last) { if (range.to() == 0xFFFF) { return; } else { last = range.to() + 1; } } } AddRange(CharacterRange(last, 0xFFFF)); } void DispatchTableConstructor::VisitText(TextNode* that) { TextElement elm = that->elements()->at(0); switch (elm.type) { case TextElement::ATOM: { uc16 c = elm.data.u_atom->data()[0]; AddRange(CharacterRange(c, c)); break; } case TextElement::CHAR_CLASS: { RegExpCharacterClass* tree = elm.data.u_char_class; ZoneList* ranges = tree->ranges(); if (tree->is_negated()) { AddInverse(ranges); } else { for (int i = 0; i < ranges->length(); i++) AddRange(ranges->at(i)); } break; } default: { UNIMPLEMENTED(); } } } void DispatchTableConstructor::VisitAction(ActionNode* that) { that->on_success()->Accept(this); } Handle RegExpEngine::Compile(RegExpParseResult* input, RegExpNode** node_return, bool ignore_case, bool is_multiline) { RegExpCompiler compiler(input->capture_count, ignore_case); // Wrap the body of the regexp in capture #0. RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, 0, &compiler, compiler.accept(), compiler.backtrack()); // Add a .*? at the beginning, outside the body capture. // Note: We could choose to not add this if the regexp is anchored at // the start of the input but I'm not sure how best to do that and // since we don't even handle ^ yet I'm saving that optimization for // later. 
RegExpNode* node = RegExpQuantifier::ToNode(0, RegExpQuantifier::kInfinity, false, new RegExpCharacterClass('*'), &compiler, captured_body, compiler.backtrack()); if (node_return != NULL) *node_return = node; Analysis analysis(ignore_case); analysis.EnsureAnalyzed(node); NodeInfo info = *node->info(); node = node->EnsureExpanded(&info); if (!FLAG_irregexp) { return Handle::null(); } if (is_multiline && !FLAG_attempt_multiline_irregexp) { return Handle::null(); } if (FLAG_irregexp_native) { #ifdef ARM // Unimplemented, fall-through to bytecode implementation. #else // IA32 RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16, (input->capture_count + 1) * 2); return compiler.Assemble(¯o_assembler, node, input->capture_count); #endif } EmbeddedVector codes; RegExpMacroAssemblerIrregexp macro_assembler(codes); return compiler.Assemble(¯o_assembler, node, input->capture_count); } }} // namespace v8::internal