From 48756fcf74e3a65456fa676492d6aaeec07f034d Mon Sep 17 00:00:00 2001 From: Jakob Gruber Date: Mon, 21 Oct 2019 13:37:40 +0200 Subject: [PATCH] [regexp] Add a backtracking limit in the interpreter V8 uses a backtracking regexp engine, which has the caveat that some regexp patterns can have exponential runtime behavior when excessive backtracking is involved. Especially when regexp patterns are user-controlled, it would be useful to be able to set an upper limit for a single regexp execution. This CL takes an initial step in that direction by adding a backtracking limit (intended to approximate execution time): - The limit is stored in the JSRegExp's data array. - A limit can currently only be set through the %NewRegExpWithLimit runtime function. - The limit is applied during interpreter execution. When exceeded, the interpreter stops execution and returns FAILURE (even if continued execution would at some later point have resulted in SUCCESS). In follow-up CLs, this mechanism will be extended to work in jitted regexp code, and exposed through the V8 API. Bug: v8:9695 Change-Id: Iadb5c100052f4a63b26f1ec49cf97c6713a66b9b Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1864934 Commit-Queue: Ulan Degenbaev Auto-Submit: Jakob Gruber Reviewed-by: Ulan Degenbaev Reviewed-by: Peter Marshall Cr-Commit-Position: refs/heads/master@{#64417} --- src/diagnostics/objects-debug.cc | 1 + src/heap/factory.cc | 5 ++- src/heap/factory.h | 2 +- src/objects/js-regexp.cc | 5 +++ src/objects/js-regexp.h | 26 +++++++++++----- src/objects/objects.cc | 23 +++++++------- src/regexp/regexp-interpreter.cc | 22 +++++++++++--- src/regexp/regexp-interpreter.h | 3 +- src/regexp/regexp.cc | 42 +++++++++++++++++--------- src/regexp/regexp.h | 2 +- src/runtime/runtime-test.cc | 17 +++++++++++ src/runtime/runtime.h | 1 + test/cctest/test-regexp.cc | 13 ++++---- test/mjsunit/regexp-backtrack-limit.js | 31 +++++++++++++++++++ 14 files changed, 145 insertions(+), 48 deletions(-) create mode 100644 test/mjsunit/regexp-backtrack-limit.js diff --git a/src/diagnostics/objects-debug.cc b/src/diagnostics/objects-debug.cc index c047ad2e37..553ad81d40 100644 --- a/src/diagnostics/objects-debug.cc +++ b/src/diagnostics/objects-debug.cc @@ -1453,6 +1453,7 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) { CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi()); CHECK(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex).IsSmi()); CHECK(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex).IsSmi()); + CHECK(arr.get(JSRegExp::kIrregexpBacktrackLimit).IsSmi()); break; } default: diff --git a/src/heap/factory.cc b/src/heap/factory.cc index 6fec666744..30745be38d 100644 --- a/src/heap/factory.cc +++ b/src/heap/factory.cc @@ -3900,7 +3900,9 @@ void Factory::SetRegExpAtomData(Handle regexp, JSRegExp::Type type, void Factory::SetRegExpIrregexpData(Handle regexp, JSRegExp::Type type, Handle source, - JSRegExp::Flags flags, int capture_count) { + JSRegExp::Flags flags, int capture_count, + uint32_t backtrack_limit) { + DCHECK(Smi::IsValid(backtrack_limit)); Handle store = NewFixedArray(JSRegExp::kIrregexpDataSize); Smi uninitialized = Smi::FromInt(JSRegExp::kUninitializedValue); Smi ticks_until_tier_up = FLAG_regexp_tier_up @@ -3917,6 +3919,7 @@ void Factory::SetRegExpIrregexpData(Handle regexp, store->set(JSRegExp::kIrregexpCaptureCountIndex, Smi::FromInt(capture_count)); store->set(JSRegExp::kIrregexpCaptureNameMapIndex, uninitialized); store->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, ticks_until_tier_up); + store->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit)); regexp->set_data(*store); } diff --git a/src/heap/factory.h b/src/heap/factory.h index 3289793584..06b765ec31 100644 --- a/src/heap/factory.h +++ b/src/heap/factory.h @@ -889,7 +889,7 @@ class V8_EXPORT_PRIVATE Factory { // irregexp regexp and stores it in the regexp. void SetRegExpIrregexpData(Handle regexp, JSRegExp::Type type, Handle source, JSRegExp::Flags flags, - int capture_count); + int capture_count, uint32_t backtrack_limit); // Returns the value for a known global constant (a property of the global // object which is neither configurable nor writable) like 'undefined'. diff --git a/src/objects/js-regexp.cc b/src/objects/js-regexp.cc index c7f96fe278..e8240d97e6 100644 --- a/src/objects/js-regexp.cc +++ b/src/objects/js-regexp.cc @@ -114,5 +114,10 @@ Handle JSRegExpResultIndices::BuildIndices( return indices; } +uint32_t JSRegExp::BacktrackLimit() const { + CHECK_EQ(TypeTag(), IRREGEXP); + return static_cast(Smi::ToInt(DataAt(kIrregexpBacktrackLimit))); +} + } // namespace internal } // namespace v8 diff --git a/src/objects/js-regexp.h b/src/objects/js-regexp.h index 03efd4913c..dd440a4bfd 100644 --- a/src/objects/js-regexp.h +++ b/src/objects/js-regexp.h @@ -84,17 +84,24 @@ class JSRegExp : public TorqueGeneratedJSRegExp { DECL_ACCESSORS(last_index, Object) - V8_EXPORT_PRIVATE static MaybeHandle New(Isolate* isolate, - Handle source, - Flags flags); + // If the backtrack limit is set to this marker value, no limit is applied. + static constexpr uint32_t kNoBacktrackLimit = 0; + + V8_EXPORT_PRIVATE static MaybeHandle New( + Isolate* isolate, Handle source, Flags flags, + uint32_t backtrack_limit = kNoBacktrackLimit); static Handle Copy(Handle regexp); - static MaybeHandle Initialize(Handle regexp, - Handle source, Flags flags); + static MaybeHandle Initialize( + Handle regexp, Handle source, Flags flags, + uint32_t backtrack_limit = kNoBacktrackLimit); static MaybeHandle Initialize(Handle regexp, Handle source, Handle flags_string); + static Flags FlagsFromString(Isolate* isolate, Handle flags, + bool* success); + bool MarkedForTierUp(); void ResetLastTierUpTick(); void TierUpTick(); @@ -131,6 +138,8 @@ class JSRegExp : public TorqueGeneratedJSRegExp { inline bool HasCompiledCode() const; inline void DiscardCompiledCodeForSerialization(); + uint32_t BacktrackLimit() const; + // Dispatched behavior. DECL_PRINTER(JSRegExp) DECL_VERIFIER(JSRegExp) @@ -182,8 +191,11 @@ class JSRegExp : public TorqueGeneratedJSRegExp { // happens once the ticks reach zero. // This value is ignored if the regexp-tier-up flag isn't turned on. static const int kIrregexpTicksUntilTierUpIndex = kDataIndex + 7; - - static const int kIrregexpDataSize = kIrregexpTicksUntilTierUpIndex + 1; + // A smi containing either the backtracking limit or kNoBacktrackLimit. + // TODO(jgruber): If needed, this limit could be packed into other fields + // above to save space. + static const int kIrregexpBacktrackLimit = kDataIndex + 8; + static const int kIrregexpDataSize = kDataIndex + 9; // In-object fields. static const int kLastIndexFieldIndex = 0; diff --git a/src/objects/objects.cc b/src/objects/objects.cc index 22c70b9a14..4b793c4d2e 100644 --- a/src/objects/objects.cc +++ b/src/objects/objects.cc @@ -6090,10 +6090,9 @@ Handle JSPromise::TriggerPromiseReactions(Isolate* isolate, return isolate->factory()->undefined_value(); } -namespace { - -JSRegExp::Flags RegExpFlagsFromString(Isolate* isolate, Handle flags, - bool* success) { +// static +JSRegExp::Flags JSRegExp::FlagsFromString(Isolate* isolate, + Handle flags, bool* success) { STATIC_ASSERT(JSRegExp::FlagFromChar('g') == JSRegExp::kGlobal); STATIC_ASSERT(JSRegExp::FlagFromChar('i') == JSRegExp::kIgnoreCase); STATIC_ASSERT(JSRegExp::FlagFromChar('m') == JSRegExp::kMultiline); @@ -6136,16 +6135,14 @@ JSRegExp::Flags RegExpFlagsFromString(Isolate* isolate, Handle flags, return value; } -} // namespace - // static MaybeHandle JSRegExp::New(Isolate* isolate, Handle pattern, - Flags flags) { + Flags flags, uint32_t backtrack_limit) { Handle constructor = isolate->regexp_function(); Handle regexp = Handle::cast(isolate->factory()->NewJSObject(constructor)); - return JSRegExp::Initialize(regexp, pattern, flags); + return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit); } // static @@ -6323,7 +6320,7 @@ MaybeHandle JSRegExp::Initialize(Handle regexp, Handle flags_string) { Isolate* isolate = regexp->GetIsolate(); bool success = false; - Flags flags = RegExpFlagsFromString(isolate, flags_string, &success); + Flags flags = JSRegExp::FlagsFromString(isolate, flags_string, &success); if (!success) { THROW_NEW_ERROR( isolate, @@ -6335,7 +6332,8 @@ MaybeHandle JSRegExp::Initialize(Handle regexp, // static MaybeHandle JSRegExp::Initialize(Handle regexp, - Handle source, Flags flags) { + Handle source, Flags flags, + uint32_t backtrack_limit) { Isolate* isolate = regexp->GetIsolate(); Factory* factory = isolate->factory(); // If source is the empty string we set it to "(?:)" instead as @@ -6344,8 +6342,9 @@ MaybeHandle JSRegExp::Initialize(Handle regexp, source = String::Flatten(isolate, source); - RETURN_ON_EXCEPTION(isolate, RegExp::Compile(isolate, regexp, source, flags), - JSRegExp); + RETURN_ON_EXCEPTION( + isolate, RegExp::Compile(isolate, regexp, source, flags, backtrack_limit), + JSRegExp); Handle escaped_source; ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source, diff --git a/src/regexp/regexp-interpreter.cc b/src/regexp/regexp-interpreter.cc index df72951afb..3e02d368b8 100644 --- a/src/regexp/regexp-interpreter.cc +++ b/src/regexp/regexp-interpreter.cc @@ -302,7 +302,8 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, String subject_string, Vector subject, int* registers, int current, uint32_t current_char, - RegExp::CallOrigin call_origin) { + RegExp::CallOrigin call_origin, + const uint32_t backtrack_limit) { DisallowHeapAllocation no_gc; #if V8_USE_COMPUTED_GOTO @@ -358,6 +359,8 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, BacktrackStack backtrack_stack; + uint32_t backtrack_count = 0; + #ifdef DEBUG if (FLAG_trace_regexp_bytecodes) { PrintF("\n\nStart bytecode interpreter\n\n"); @@ -434,6 +437,12 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, DISPATCH(); } BYTECODE(POP_BT) { + STATIC_ASSERT(JSRegExp::kNoBacktrackLimit == 0); + if (++backtrack_count == backtrack_limit) { + // Exceeded limits are treated as a failed match. + return IrregexpInterpreter::FAILURE; + } + IrregexpInterpreter::Result return_code = HandleInterrupts(isolate, call_origin, &code_array, &subject_string, &code_base, &subject, &pc); @@ -963,13 +972,14 @@ IrregexpInterpreter::Result IrregexpInterpreter::Match( ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte)); return MatchInternal(isolate, code_array, subject_string, registers, - registers_length, start_position, call_origin); + registers_length, start_position, call_origin, + regexp.BacktrackLimit()); } IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( Isolate* isolate, ByteArray code_array, String subject_string, int* registers, int registers_length, int start_position, - RegExp::CallOrigin call_origin) { + RegExp::CallOrigin call_origin, uint32_t backtrack_limit) { DCHECK(subject_string.IsFlat()); // Note: Heap allocation *is* allowed in two situations if calling from @@ -992,13 +1002,15 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( Vector subject_vector = subject_content.ToOneByteVector(); if (start_position != 0) previous_char = subject_vector[start_position - 1]; return RawMatch(isolate, code_array, subject_string, subject_vector, - registers, start_position, previous_char, call_origin); + registers, start_position, previous_char, call_origin, + backtrack_limit); } else { DCHECK(subject_content.IsTwoByte()); Vector subject_vector = subject_content.ToUC16Vector(); if (start_position != 0) previous_char = subject_vector[start_position - 1]; return RawMatch(isolate, code_array, subject_string, subject_vector, - registers, start_position, previous_char, call_origin); + registers, start_position, previous_char, call_origin, + backtrack_limit); } } diff --git a/src/regexp/regexp-interpreter.h b/src/regexp/regexp-interpreter.h index 2d0b74f136..d77b5db896 100644 --- a/src/regexp/regexp-interpreter.h +++ b/src/regexp/regexp-interpreter.h @@ -46,7 +46,8 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic { static Result MatchInternal(Isolate* isolate, ByteArray code_array, String subject_string, int* registers, int registers_length, int start_position, - RegExp::CallOrigin call_origin); + RegExp::CallOrigin call_origin, + uint32_t backtrack_limit); private: static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string, diff --git a/src/regexp/regexp.cc b/src/regexp/regexp.cc index a4ab48ed0e..cd105deca3 100644 --- a/src/regexp/regexp.cc +++ b/src/regexp/regexp.cc @@ -33,7 +33,8 @@ class RegExpImpl final : public AllStatic { // Prepares a JSRegExp object with Irregexp-specific data. static void IrregexpInitialize(Isolate* isolate, Handle re, Handle pattern, JSRegExp::Flags flags, - int capture_register_count); + int capture_register_count, + uint32_t backtrack_limit); static void AtomCompile(Isolate* isolate, Handle re, Handle pattern, JSRegExp::Flags flags, @@ -131,17 +132,27 @@ static bool HasFewDifferentCharacters(Handle pattern) { // static MaybeHandle RegExp::Compile(Isolate* isolate, Handle re, Handle pattern, - JSRegExp::Flags flags) { + JSRegExp::Flags flags, + uint32_t backtrack_limit) { DCHECK(pattern->IsFlat()); + // Caching is based only on the pattern and flags, but code also differs when + // a backtrack limit is set. A present backtrack limit is very much *not* the + // common case, so just skip the cache for these. + const bool is_compilation_cache_enabled = + (backtrack_limit == JSRegExp::kNoBacktrackLimit); + Zone zone(isolate->allocator(), ZONE_NAME); - CompilationCache* compilation_cache = isolate->compilation_cache(); - MaybeHandle maybe_cached = - compilation_cache->LookupRegExp(pattern, flags); - Handle cached; - if (maybe_cached.ToHandle(&cached)) { - re->set_data(*cached); - return re; + CompilationCache* compilation_cache = nullptr; + if (is_compilation_cache_enabled) { + compilation_cache = isolate->compilation_cache(); + MaybeHandle maybe_cached = + compilation_cache->LookupRegExp(pattern, flags); + Handle cached; + if (maybe_cached.ToHandle(&cached)) { + re->set_data(*cached); + return re; + } } PostponeInterruptsScope postpone(isolate); @@ -176,13 +187,15 @@ MaybeHandle RegExp::Compile(Isolate* isolate, Handle re, } if (!has_been_compiled) { RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags, - parse_result.capture_count); + parse_result.capture_count, backtrack_limit); } DCHECK(re->data().IsFixedArray()); // Compilation succeeded so the data is set on the regexp // and we can store it in the cache. Handle data(FixedArray::cast(re->data()), isolate); - compilation_cache->PutRegExp(pattern, flags, data); + if (is_compilation_cache_enabled) { + compilation_cache->PutRegExp(pattern, flags, data); + } return re; } @@ -469,10 +482,11 @@ Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) { void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle re, Handle pattern, - JSRegExp::Flags flags, int capture_count) { + JSRegExp::Flags flags, int capture_count, + uint32_t backtrack_limit) { // Initialize compiled code entries to null. - isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern, - flags, capture_count); + isolate->factory()->SetRegExpIrregexpData( + re, JSRegExp::IRREGEXP, pattern, flags, capture_count, backtrack_limit); } // static diff --git a/src/regexp/regexp.h b/src/regexp/regexp.h index 6625b063bc..9b64cd31c7 100644 --- a/src/regexp/regexp.h +++ b/src/regexp/regexp.h @@ -66,7 +66,7 @@ class RegExp final : public AllStatic { // Returns false if compilation fails. V8_WARN_UNUSED_RESULT static MaybeHandle Compile( Isolate* isolate, Handle re, Handle pattern, - JSRegExp::Flags flags); + JSRegExp::Flags flags, uint32_t backtrack_limit); enum CallOrigin : int { kFromRuntime = 0, diff --git a/src/runtime/runtime-test.cc b/src/runtime/runtime-test.cc index 2364e7bd48..4c771b0eb5 100644 --- a/src/runtime/runtime-test.cc +++ b/src/runtime/runtime-test.cc @@ -1424,5 +1424,22 @@ RUNTIME_FUNCTION(Runtime_EnableCodeLoggingForTesting) { return ReadOnlyRoots(isolate).undefined_value(); } +RUNTIME_FUNCTION(Runtime_NewRegExpWithBacktrackLimit) { + HandleScope scope(isolate); + DCHECK_EQ(3, args.length()); + + CONVERT_ARG_HANDLE_CHECKED(String, pattern, 0); + CONVERT_ARG_HANDLE_CHECKED(String, flags_string, 1); + CONVERT_UINT32_ARG_CHECKED(backtrack_limit, 2); + + bool success = false; + JSRegExp::Flags flags = + JSRegExp::FlagsFromString(isolate, flags_string, &success); + CHECK(success); + + RETURN_RESULT_OR_FAILURE( + isolate, JSRegExp::New(isolate, pattern, flags, backtrack_limit)); +} + } // namespace internal } // namespace v8 diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 8319aabe2c..c11a087654 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -503,6 +503,7 @@ namespace internal { F(NotifyContextDisposed, 0, 1) \ F(OptimizeFunctionOnNextCall, -1, 1) \ F(OptimizeOsr, -1, 1) \ + F(NewRegExpWithBacktrackLimit, 3, 1) \ F(PrepareFunctionForOptimization, -1, 1) \ F(PrintWithNameForAssert, 2, 1) \ F(RedirectToWasmInterpreter, 2, 1) \ diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 95e752bece..907c89fa0c 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -643,7 +643,8 @@ static Handle CreateJSRegExp(Handle source, Handle code, Handle::cast(factory->NewJSObject(constructor)); factory->SetRegExpIrregexpData(regexp, JSRegExp::IRREGEXP, source, - JSRegExp::kNone, 0); + JSRegExp::kNone, 0, + JSRegExp::kNoBacktrackLimit); regexp->SetDataAt(is_unicode ? JSRegExp::kIrregexpUC16CodeIndex : JSRegExp::kIrregexpLatin1CodeIndex, *code); @@ -1284,8 +1285,8 @@ TEST(MacroAssembler) { Vector(str1, 6)).ToHandleChecked(); CHECK(IrregexpInterpreter::MatchInternal(isolate, *array, *f1_16, captures, 5, - 0, - RegExp::CallOrigin::kFromRuntime)); + 0, RegExp::CallOrigin::kFromRuntime, + JSRegExp::kNoBacktrackLimit)); CHECK_EQ(0, captures[0]); CHECK_EQ(3, captures[1]); CHECK_EQ(1, captures[2]); @@ -1296,9 +1297,9 @@ TEST(MacroAssembler) { Handle f2_16 = factory->NewStringFromTwoByte( Vector(str2, 6)).ToHandleChecked(); - CHECK(!IrregexpInterpreter::MatchInternal(isolate, *array, *f2_16, captures, - 5, 0, - RegExp::CallOrigin::kFromRuntime)); + CHECK(!IrregexpInterpreter::MatchInternal( + isolate, *array, *f2_16, captures, 5, 0, RegExp::CallOrigin::kFromRuntime, + JSRegExp::kNoBacktrackLimit)); CHECK_EQ(42, captures[0]); } diff --git a/test/mjsunit/regexp-backtrack-limit.js b/test/mjsunit/regexp-backtrack-limit.js new file mode 100644 index 0000000000..16ee425b04 --- /dev/null +++ b/test/mjsunit/regexp-backtrack-limit.js @@ -0,0 +1,31 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Flags: --allow-natives-syntax --regexp-interpret-all + +const kNoBacktrackLimit = 0; // To match JSRegExp::kNoBacktrackLimit. +const re0 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", kNoBacktrackLimit); +const re1 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", 50); + +// Backtracks remain below the limit on this subject string. +{ + let s = "3333ax3333x"; + assertArrayEquals(["3333x", "3333"], re0.exec(s)); + assertEquals(["3333x", "3333"], re1.exec(s)); +} + +// A longer subject exceeds the limit. +{ + let s = "333333333ax3333x"; + assertArrayEquals(["3333x", "3333"], re0.exec(s)); + assertEquals(null, re1.exec(s)); +} + +// ATOM regexp construction with a limit; in this case the limit should just be +// ignored, ATOMs never backtrack. +{ + const re = %NewRegExpWithBacktrackLimit("ax", "", 50); + let s = "3333ax3333x"; + assertArrayEquals(["ax"], re.exec(s)); +}