[regexp] Add a backtracking limit in the interpreter

V8 uses a backtracking regexp engine, which has the caveat that some
regexp patterns can have exponential runtime behavior when excessive
backtracking is involved.

Especially when regexp patterns are user-controlled, it would be useful
to be able to set an upper limit for a single regexp execution. This CL
takes an initial step in that direction by adding a backtracking limit
(intended to approximate execution time):

- The limit is stored in the JSRegExp's data array.
- A limit can currently only be set through the %NewRegExpWithLimit
runtime function.
- The limit is applied during interpreter execution. When exceeded, the
interpreter stops execution and returns FAILURE (even if continued
execution would at some later point have resulted in SUCCESS).

In follow-up CLs, this mechanism will be extended to work in jitted
regexp code, and exposed through the V8 API.

Bug: v8:9695
Change-Id: Iadb5c100052f4a63b26f1ec49cf97c6713a66b9b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1864934
Commit-Queue: Ulan Degenbaev <ulan@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64417}
This commit is contained in:
Jakob Gruber 2019-10-21 13:37:40 +02:00 committed by Commit Bot
parent 89cdae9890
commit 48756fcf74
14 changed files with 145 additions and 48 deletions

View File

@ -1453,6 +1453,7 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi());
CHECK(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex).IsSmi());
CHECK(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex).IsSmi());
CHECK(arr.get(JSRegExp::kIrregexpBacktrackLimit).IsSmi());
break;
}
default:

View File

@ -3900,7 +3900,9 @@ void Factory::SetRegExpAtomData(Handle<JSRegExp> regexp, JSRegExp::Type type,
void Factory::SetRegExpIrregexpData(Handle<JSRegExp> regexp,
JSRegExp::Type type, Handle<String> source,
JSRegExp::Flags flags, int capture_count) {
JSRegExp::Flags flags, int capture_count,
uint32_t backtrack_limit) {
DCHECK(Smi::IsValid(backtrack_limit));
Handle<FixedArray> store = NewFixedArray(JSRegExp::kIrregexpDataSize);
Smi uninitialized = Smi::FromInt(JSRegExp::kUninitializedValue);
Smi ticks_until_tier_up = FLAG_regexp_tier_up
@ -3917,6 +3919,7 @@ void Factory::SetRegExpIrregexpData(Handle<JSRegExp> regexp,
store->set(JSRegExp::kIrregexpCaptureCountIndex, Smi::FromInt(capture_count));
store->set(JSRegExp::kIrregexpCaptureNameMapIndex, uninitialized);
store->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, ticks_until_tier_up);
store->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
regexp->set_data(*store);
}

View File

@ -889,7 +889,7 @@ class V8_EXPORT_PRIVATE Factory {
// irregexp regexp and stores it in the regexp.
void SetRegExpIrregexpData(Handle<JSRegExp> regexp, JSRegExp::Type type,
Handle<String> source, JSRegExp::Flags flags,
int capture_count);
int capture_count, uint32_t backtrack_limit);
// Returns the value for a known global constant (a property of the global
// object which is neither configurable nor writable) like 'undefined'.

View File

@ -114,5 +114,10 @@ Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices(
return indices;
}
uint32_t JSRegExp::BacktrackLimit() const {
CHECK_EQ(TypeTag(), IRREGEXP);
return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit)));
}
} // namespace internal
} // namespace v8

View File

@ -84,17 +84,24 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
DECL_ACCESSORS(last_index, Object)
V8_EXPORT_PRIVATE static MaybeHandle<JSRegExp> New(Isolate* isolate,
Handle<String> source,
Flags flags);
// If the backtrack limit is set to this marker value, no limit is applied.
static constexpr uint32_t kNoBacktrackLimit = 0;
V8_EXPORT_PRIVATE static MaybeHandle<JSRegExp> New(
Isolate* isolate, Handle<String> source, Flags flags,
uint32_t backtrack_limit = kNoBacktrackLimit);
static Handle<JSRegExp> Copy(Handle<JSRegExp> regexp);
static MaybeHandle<JSRegExp> Initialize(Handle<JSRegExp> regexp,
Handle<String> source, Flags flags);
static MaybeHandle<JSRegExp> Initialize(
Handle<JSRegExp> regexp, Handle<String> source, Flags flags,
uint32_t backtrack_limit = kNoBacktrackLimit);
static MaybeHandle<JSRegExp> Initialize(Handle<JSRegExp> regexp,
Handle<String> source,
Handle<String> flags_string);
static Flags FlagsFromString(Isolate* isolate, Handle<String> flags,
bool* success);
bool MarkedForTierUp();
void ResetLastTierUpTick();
void TierUpTick();
@ -131,6 +138,8 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
inline bool HasCompiledCode() const;
inline void DiscardCompiledCodeForSerialization();
uint32_t BacktrackLimit() const;
// Dispatched behavior.
DECL_PRINTER(JSRegExp)
DECL_VERIFIER(JSRegExp)
@ -182,8 +191,11 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
// happens once the ticks reach zero.
// This value is ignored if the regexp-tier-up flag isn't turned on.
static const int kIrregexpTicksUntilTierUpIndex = kDataIndex + 7;
static const int kIrregexpDataSize = kIrregexpTicksUntilTierUpIndex + 1;
// A smi containing either the backtracking limit or kNoBacktrackLimit.
// TODO(jgruber): If needed, this limit could be packed into other fields
// above to save space.
static const int kIrregexpBacktrackLimit = kDataIndex + 8;
static const int kIrregexpDataSize = kDataIndex + 9;
// In-object fields.
static const int kLastIndexFieldIndex = 0;

View File

@ -6090,10 +6090,9 @@ Handle<Object> JSPromise::TriggerPromiseReactions(Isolate* isolate,
return isolate->factory()->undefined_value();
}
namespace {
JSRegExp::Flags RegExpFlagsFromString(Isolate* isolate, Handle<String> flags,
bool* success) {
// static
JSRegExp::Flags JSRegExp::FlagsFromString(Isolate* isolate,
Handle<String> flags, bool* success) {
STATIC_ASSERT(JSRegExp::FlagFromChar('g') == JSRegExp::kGlobal);
STATIC_ASSERT(JSRegExp::FlagFromChar('i') == JSRegExp::kIgnoreCase);
STATIC_ASSERT(JSRegExp::FlagFromChar('m') == JSRegExp::kMultiline);
@ -6136,16 +6135,14 @@ JSRegExp::Flags RegExpFlagsFromString(Isolate* isolate, Handle<String> flags,
return value;
}
} // namespace
// static
MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern,
Flags flags) {
Flags flags, uint32_t backtrack_limit) {
Handle<JSFunction> constructor = isolate->regexp_function();
Handle<JSRegExp> regexp =
Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor));
return JSRegExp::Initialize(regexp, pattern, flags);
return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit);
}
// static
@ -6323,7 +6320,7 @@ MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
Handle<String> flags_string) {
Isolate* isolate = regexp->GetIsolate();
bool success = false;
Flags flags = RegExpFlagsFromString(isolate, flags_string, &success);
Flags flags = JSRegExp::FlagsFromString(isolate, flags_string, &success);
if (!success) {
THROW_NEW_ERROR(
isolate,
@ -6335,7 +6332,8 @@ MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
// static
MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
Handle<String> source, Flags flags) {
Handle<String> source, Flags flags,
uint32_t backtrack_limit) {
Isolate* isolate = regexp->GetIsolate();
Factory* factory = isolate->factory();
// If source is the empty string we set it to "(?:)" instead as
@ -6344,8 +6342,9 @@ MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
source = String::Flatten(isolate, source);
RETURN_ON_EXCEPTION(isolate, RegExp::Compile(isolate, regexp, source, flags),
JSRegExp);
RETURN_ON_EXCEPTION(
isolate, RegExp::Compile(isolate, regexp, source, flags, backtrack_limit),
JSRegExp);
Handle<String> escaped_source;
ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source,

View File

@ -302,7 +302,8 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
String subject_string,
Vector<const Char> subject, int* registers,
int current, uint32_t current_char,
RegExp::CallOrigin call_origin) {
RegExp::CallOrigin call_origin,
const uint32_t backtrack_limit) {
DisallowHeapAllocation no_gc;
#if V8_USE_COMPUTED_GOTO
@ -358,6 +359,8 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
BacktrackStack backtrack_stack;
uint32_t backtrack_count = 0;
#ifdef DEBUG
if (FLAG_trace_regexp_bytecodes) {
PrintF("\n\nStart bytecode interpreter\n\n");
@ -434,6 +437,12 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
DISPATCH();
}
BYTECODE(POP_BT) {
STATIC_ASSERT(JSRegExp::kNoBacktrackLimit == 0);
if (++backtrack_count == backtrack_limit) {
// Exceeded limits are treated as a failed match.
return IrregexpInterpreter::FAILURE;
}
IrregexpInterpreter::Result return_code =
HandleInterrupts(isolate, call_origin, &code_array, &subject_string,
&code_base, &subject, &pc);
@ -963,13 +972,14 @@ IrregexpInterpreter::Result IrregexpInterpreter::Match(
ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte));
return MatchInternal(isolate, code_array, subject_string, registers,
registers_length, start_position, call_origin);
registers_length, start_position, call_origin,
regexp.BacktrackLimit());
}
IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
Isolate* isolate, ByteArray code_array, String subject_string,
int* registers, int registers_length, int start_position,
RegExp::CallOrigin call_origin) {
RegExp::CallOrigin call_origin, uint32_t backtrack_limit) {
DCHECK(subject_string.IsFlat());
// Note: Heap allocation *is* allowed in two situations if calling from
@ -992,13 +1002,15 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
if (start_position != 0) previous_char = subject_vector[start_position - 1];
return RawMatch(isolate, code_array, subject_string, subject_vector,
registers, start_position, previous_char, call_origin);
registers, start_position, previous_char, call_origin,
backtrack_limit);
} else {
DCHECK(subject_content.IsTwoByte());
Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
if (start_position != 0) previous_char = subject_vector[start_position - 1];
return RawMatch(isolate, code_array, subject_string, subject_vector,
registers, start_position, previous_char, call_origin);
registers, start_position, previous_char, call_origin,
backtrack_limit);
}
}

View File

@ -46,7 +46,8 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
static Result MatchInternal(Isolate* isolate, ByteArray code_array,
String subject_string, int* registers,
int registers_length, int start_position,
RegExp::CallOrigin call_origin);
RegExp::CallOrigin call_origin,
uint32_t backtrack_limit);
private:
static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,

View File

@ -33,7 +33,8 @@ class RegExpImpl final : public AllStatic {
// Prepares a JSRegExp object with Irregexp-specific data.
static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
int capture_register_count);
int capture_register_count,
uint32_t backtrack_limit);
static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
@ -131,17 +132,27 @@ static bool HasFewDifferentCharacters(Handle<String> pattern) {
// static
MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags) {
JSRegExp::Flags flags,
uint32_t backtrack_limit) {
DCHECK(pattern->IsFlat());
// Caching is based only on the pattern and flags, but code also differs when
// a backtrack limit is set. A present backtrack limit is very much *not* the
// common case, so just skip the cache for these.
const bool is_compilation_cache_enabled =
(backtrack_limit == JSRegExp::kNoBacktrackLimit);
Zone zone(isolate->allocator(), ZONE_NAME);
CompilationCache* compilation_cache = isolate->compilation_cache();
MaybeHandle<FixedArray> maybe_cached =
compilation_cache->LookupRegExp(pattern, flags);
Handle<FixedArray> cached;
if (maybe_cached.ToHandle(&cached)) {
re->set_data(*cached);
return re;
CompilationCache* compilation_cache = nullptr;
if (is_compilation_cache_enabled) {
compilation_cache = isolate->compilation_cache();
MaybeHandle<FixedArray> maybe_cached =
compilation_cache->LookupRegExp(pattern, flags);
Handle<FixedArray> cached;
if (maybe_cached.ToHandle(&cached)) {
re->set_data(*cached);
return re;
}
}
PostponeInterruptsScope postpone(isolate);
@ -176,13 +187,15 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
}
if (!has_been_compiled) {
RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
parse_result.capture_count);
parse_result.capture_count, backtrack_limit);
}
DCHECK(re->data().IsFixedArray());
// Compilation succeeded so the data is set on the regexp
// and we can store it in the cache.
Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
compilation_cache->PutRegExp(pattern, flags, data);
if (is_compilation_cache_enabled) {
compilation_cache->PutRegExp(pattern, flags, data);
}
return re;
}
@ -469,10 +482,11 @@ Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags, int capture_count) {
JSRegExp::Flags flags, int capture_count,
uint32_t backtrack_limit) {
// Initialize compiled code entries to null.
isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
flags, capture_count);
isolate->factory()->SetRegExpIrregexpData(
re, JSRegExp::IRREGEXP, pattern, flags, capture_count, backtrack_limit);
}
// static

View File

@ -66,7 +66,7 @@ class RegExp final : public AllStatic {
// Returns false if compilation fails.
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
JSRegExp::Flags flags);
JSRegExp::Flags flags, uint32_t backtrack_limit);
enum CallOrigin : int {
kFromRuntime = 0,

View File

@ -1424,5 +1424,22 @@ RUNTIME_FUNCTION(Runtime_EnableCodeLoggingForTesting) {
return ReadOnlyRoots(isolate).undefined_value();
}
RUNTIME_FUNCTION(Runtime_NewRegExpWithBacktrackLimit) {
HandleScope scope(isolate);
DCHECK_EQ(3, args.length());
CONVERT_ARG_HANDLE_CHECKED(String, pattern, 0);
CONVERT_ARG_HANDLE_CHECKED(String, flags_string, 1);
CONVERT_UINT32_ARG_CHECKED(backtrack_limit, 2);
bool success = false;
JSRegExp::Flags flags =
JSRegExp::FlagsFromString(isolate, flags_string, &success);
CHECK(success);
RETURN_RESULT_OR_FAILURE(
isolate, JSRegExp::New(isolate, pattern, flags, backtrack_limit));
}
} // namespace internal
} // namespace v8

View File

@ -503,6 +503,7 @@ namespace internal {
F(NotifyContextDisposed, 0, 1) \
F(OptimizeFunctionOnNextCall, -1, 1) \
F(OptimizeOsr, -1, 1) \
F(NewRegExpWithBacktrackLimit, 3, 1) \
F(PrepareFunctionForOptimization, -1, 1) \
F(PrintWithNameForAssert, 2, 1) \
F(RedirectToWasmInterpreter, 2, 1) \

View File

@ -643,7 +643,8 @@ static Handle<JSRegExp> CreateJSRegExp(Handle<String> source, Handle<Code> code,
Handle<JSRegExp>::cast(factory->NewJSObject(constructor));
factory->SetRegExpIrregexpData(regexp, JSRegExp::IRREGEXP, source,
JSRegExp::kNone, 0);
JSRegExp::kNone, 0,
JSRegExp::kNoBacktrackLimit);
regexp->SetDataAt(is_unicode ? JSRegExp::kIrregexpUC16CodeIndex
: JSRegExp::kIrregexpLatin1CodeIndex,
*code);
@ -1284,8 +1285,8 @@ TEST(MacroAssembler) {
Vector<const uc16>(str1, 6)).ToHandleChecked();
CHECK(IrregexpInterpreter::MatchInternal(isolate, *array, *f1_16, captures, 5,
0,
RegExp::CallOrigin::kFromRuntime));
0, RegExp::CallOrigin::kFromRuntime,
JSRegExp::kNoBacktrackLimit));
CHECK_EQ(0, captures[0]);
CHECK_EQ(3, captures[1]);
CHECK_EQ(1, captures[2]);
@ -1296,9 +1297,9 @@ TEST(MacroAssembler) {
Handle<String> f2_16 = factory->NewStringFromTwoByte(
Vector<const uc16>(str2, 6)).ToHandleChecked();
CHECK(!IrregexpInterpreter::MatchInternal(isolate, *array, *f2_16, captures,
5, 0,
RegExp::CallOrigin::kFromRuntime));
CHECK(!IrregexpInterpreter::MatchInternal(
isolate, *array, *f2_16, captures, 5, 0, RegExp::CallOrigin::kFromRuntime,
JSRegExp::kNoBacktrackLimit));
CHECK_EQ(42, captures[0]);
}

View File

@ -0,0 +1,31 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --allow-natives-syntax --regexp-interpret-all
const kNoBacktrackLimit = 0; // To match JSRegExp::kNoBacktrackLimit.
const re0 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", kNoBacktrackLimit);
const re1 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", 50);
// Backtracks remain below the limit on this subject string.
{
let s = "3333ax3333x";
assertArrayEquals(["3333x", "3333"], re0.exec(s));
assertEquals(["3333x", "3333"], re1.exec(s));
}
// A longer subject exceeds the limit.
{
let s = "333333333ax3333x";
assertArrayEquals(["3333x", "3333"], re0.exec(s));
assertEquals(null, re1.exec(s));
}
// ATOM regexp construction with a limit; in this case the limit should just be
// ignored, ATOMs never backtrack.
{
const re = %NewRegExpWithBacktrackLimit("ax", "", 50);
let s = "3333ax3333x";
assertArrayEquals(["ax"], re.exec(s));
}