From 51fcfd585c5f5deebd5da0c145020d8e58fbad0d Mon Sep 17 00:00:00 2001 From: Jakob Gruber Date: Thu, 11 Feb 2021 07:01:12 +0100 Subject: [PATCH] [regexp] Don't update last match info in @@split special case V8 implements a fast-path for RegExp.prototype.split which diverges from the spec: instead of creating a new sticky regexp instance `splitter` and running it in a loop, we reuse the existing non-sticky regexp without looping through each character. This works fine in most cases, but we run into issues when matching at the very end of the string. According to the spec, matches at the end of the string are impossible in @@split, but in our fast-path implementation they can happen. The obvious fix would be to remove our fast-path but this comes with high performance costs. The fix implemented in this CL adds a special flag to `exec` so that matches at the end of the string can be treated as failures. This is only relevant for @@split. Bug: chromium:1075514 Change-Id: Ifb790ed116793998d7aeb37e307f3f3f764023d3 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2681950 Commit-Queue: Jakob Gruber Auto-Submit: Jakob Gruber Reviewed-by: Shu-yu Guo Cr-Commit-Position: refs/heads/master@{#72644} --- src/builtins/builtins-regexp-gen.cc | 50 +++++++++++------ src/builtins/builtins-regexp-gen.h | 10 ++-- src/regexp/experimental/experimental.cc | 16 +++++- src/regexp/experimental/experimental.h | 10 ++-- src/regexp/regexp.cc | 23 +++++--- src/regexp/regexp.h | 16 +++++- src/runtime/runtime-regexp.cc | 74 ++++++++++++++++++++----- src/runtime/runtime.h | 22 ++++---- test/mjsunit/regress/regress-1075514.js | 21 +++++++ 9 files changed, 182 insertions(+), 60 deletions(-) create mode 100644 test/mjsunit/regress/regress-1075514.js diff --git a/src/builtins/builtins-regexp-gen.cc b/src/builtins/builtins-regexp-gen.cc index 97212afc6c..479c0a7ab4 100644 --- a/src/builtins/builtins-regexp-gen.cc +++ b/src/builtins/builtins-regexp-gen.cc @@ -18,7 +18,6 @@ #include 
"src/objects/js-regexp-string-iterator.h" #include "src/objects/js-regexp.h" #include "src/objects/regexp-match-info.h" -#include "src/regexp/regexp.h" namespace v8 { namespace internal { @@ -436,7 +435,8 @@ void RegExpBuiltinsAssembler::GetStringPointers( TNode RegExpBuiltinsAssembler::RegExpExecInternal( TNode context, TNode regexp, TNode string, - TNode last_index, TNode match_info) { + TNode last_index, TNode match_info, + RegExp::ExecQuirks exec_quirks) { ToDirectStringAssembler to_direct(state(), string); TVARIABLE(HeapObject, var_result); @@ -676,6 +676,14 @@ TNode RegExpBuiltinsAssembler::RegExpExecInternal( BIND(&if_success); { + if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) { + static constexpr int kMatchStartOffset = 0; + TNode value = ChangeInt32ToIntPtr(UncheckedCast( + Load(MachineType::Int32(), static_offsets_vector_address, + IntPtrConstant(kMatchStartOffset)))); + GotoIf(UintPtrGreaterThanOrEqual(value, int_string_length), &if_failure); + } + // Check that the last match info has space for the capture registers and // the additional information. Ensure no overflow in add. STATIC_ASSERT(FixedArray::kMaxLength < kMaxInt - FixedArray::kLengthOffset); @@ -747,15 +755,22 @@ TNode RegExpBuiltinsAssembler::RegExpExecInternal( BIND(&retry_experimental); { - var_result = - CAST(CallRuntime(Runtime::kRegExpExperimentalOneshotExec, context, - regexp, string, last_index, match_info)); + auto target_fn = + exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure + ? Runtime::kRegExpExperimentalOneshotExecTreatMatchAtEndAsFailure + : Runtime::kRegExpExperimentalOneshotExec; + var_result = CAST(CallRuntime(target_fn, context, regexp, string, + last_index, match_info)); Goto(&out); } BIND(&runtime); { - var_result = CAST(CallRuntime(Runtime::kRegExpExec, context, regexp, string, + auto target_fn = + exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure + ? 
Runtime::kRegExpExecTreatMatchAtEndAsFailure + : Runtime::kRegExpExec; + var_result = CAST(CallRuntime(target_fn, context, regexp, string, last_index, match_info)); Goto(&out); } @@ -951,6 +966,14 @@ TF_BUILTIN(RegExpExecAtom, RegExpBuiltinsAssembler) { const TNode needle_string = CAST(UnsafeLoadFixedArrayElement(data, JSRegExp::kAtomPatternIndex)); + // ATOM patterns are guaranteed to not be the empty string (these are + // intercepted and replaced in JSRegExp::Initialize). + // + // This is especially relevant for crbug.com/1075514: atom patterns are + // non-empty and thus guaranteed not to match at the end of the string. + CSA_ASSERT(this, IntPtrGreaterThan(LoadStringLengthAsWord(needle_string), + IntPtrConstant(0))); + const TNode match_from = CAST(CallBuiltin(Builtins::kStringIndexOf, context, subject_string, needle_string, last_index)); @@ -1609,9 +1632,9 @@ TNode RegExpBuiltinsAssembler::RegExpPrototypeSplitBody( const TNode last_match_info = LoadContextElement( native_context, Context::REGEXP_LAST_MATCH_INFO_INDEX); - const TNode match_indices_ho = - CAST(CallBuiltin(Builtins::kRegExpExecInternal, context, regexp, string, - next_search_from, last_match_info)); + const TNode match_indices_ho = RegExpExecInternal( + context, regexp, string, next_search_from, CAST(last_match_info), + RegExp::ExecQuirks::kTreatMatchAtEndAsFailure); // We're done if no match was found. { @@ -1623,16 +1646,9 @@ TNode RegExpBuiltinsAssembler::RegExpPrototypeSplitBody( TNode match_indices = CAST(match_indices_ho); const TNode match_from = CAST(UnsafeLoadFixedArrayElement( match_indices, RegExpMatchInfo::kFirstCaptureIndex)); - - // We're done if the match starts beyond the string. 
- { - Label next(this); - Branch(SmiEqual(match_from, string_length), &push_suffix_and_out, &next); - BIND(&next); - } - const TNode match_to = CAST(UnsafeLoadFixedArrayElement( match_indices, RegExpMatchInfo::kFirstCaptureIndex + 1)); + CSA_ASSERT(this, SmiNotEqual(match_from, string_length)); // Advance index and continue if the match is empty. { diff --git a/src/builtins/builtins-regexp-gen.h b/src/builtins/builtins-regexp-gen.h index 0538b77165..e55af65f81 100644 --- a/src/builtins/builtins-regexp-gen.h +++ b/src/builtins/builtins-regexp-gen.h @@ -8,6 +8,7 @@ #include "src/base/optional.h" #include "src/codegen/code-stub-assembler.h" #include "src/common/message-template.h" +#include "src/regexp/regexp.h" namespace v8 { namespace internal { @@ -51,11 +52,10 @@ class RegExpBuiltinsAssembler : public CodeStubAssembler { TVariable* var_string_end); // Low level logic around the actual call into pattern matching code. - TNode RegExpExecInternal(TNode context, - TNode regexp, - TNode string, - TNode last_index, - TNode match_info); + TNode RegExpExecInternal( + TNode context, TNode regexp, TNode string, + TNode last_index, TNode match_info, + RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone); TNode ConstructNewResultFromMatchInfo( TNode context, TNode regexp, diff --git a/src/regexp/experimental/experimental.cc b/src/regexp/experimental/experimental.cc index d23c34c573..500269c40e 100644 --- a/src/regexp/experimental/experimental.cc +++ b/src/regexp/experimental/experimental.cc @@ -214,7 +214,8 @@ int32_t ExperimentalRegExp::MatchForCallFromJs( MaybeHandle ExperimentalRegExp::Exec( Isolate* isolate, Handle regexp, Handle subject, - int subject_index, Handle last_match_info) { + int subject_index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks) { DCHECK(FLAG_enable_experimental_regexp_engine); DCHECK_EQ(regexp->TypeTag(), JSRegExp::EXPERIMENTAL); #ifdef VERIFY_HEAP @@ -248,6 +249,11 @@ MaybeHandle ExperimentalRegExp::Exec( if (num_matches > 0) { 
DCHECK_EQ(num_matches, 1); + if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) { + if (output_registers[0] >= subject->length()) { + return isolate->factory()->null_value(); + } + } return RegExp::SetLastMatchInfo(isolate, last_match_info, subject, capture_count, output_registers); } else if (num_matches == 0) { @@ -285,7 +291,8 @@ int32_t ExperimentalRegExp::OneshotExecRaw(Isolate* isolate, MaybeHandle ExperimentalRegExp::OneshotExec( Isolate* isolate, Handle regexp, Handle subject, - int subject_index, Handle last_match_info) { + int subject_index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks) { DCHECK(FLAG_enable_experimental_regexp_engine_on_excessive_backtracks); DCHECK_NE(regexp->TypeTag(), JSRegExp::NOT_COMPILED); @@ -306,6 +313,11 @@ MaybeHandle ExperimentalRegExp::OneshotExec( if (num_matches > 0) { DCHECK_EQ(num_matches, 1); + if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) { + if (output_registers[0] >= subject->length()) { + return isolate->factory()->null_value(); + } + } return RegExp::SetLastMatchInfo(isolate, last_match_info, subject, capture_count, output_registers); } else if (num_matches == 0) { diff --git a/src/regexp/experimental/experimental.h b/src/regexp/experimental/experimental.h index a0ee8d1081..1b44100cc8 100644 --- a/src/regexp/experimental/experimental.h +++ b/src/regexp/experimental/experimental.h @@ -36,9 +36,10 @@ class ExperimentalRegExp final : public AllStatic { Address backtrack_stack, RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp); - static MaybeHandle Exec(Isolate* isolate, Handle regexp, - Handle subject, int index, - Handle last_match_info); + static MaybeHandle Exec( + Isolate* isolate, Handle regexp, Handle subject, + int index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone); static int32_t ExecRaw(Isolate* isolate, RegExp::CallOrigin call_origin, JSRegExp regexp, String subject, int32_t* output_registers, @@ -48,7 
+49,8 @@ class ExperimentalRegExp final : public AllStatic { // its type tag. The regexp itself is not changed (apart from lastIndex). static MaybeHandle OneshotExec( Isolate* isolate, Handle regexp, Handle subject, - int index, Handle last_match_info); + int index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone); static int32_t OneshotExecRaw(Isolate* isolate, Handle regexp, Handle subject, int32_t* output_registers, diff --git a/src/regexp/regexp.cc b/src/regexp/regexp.cc index f88e950eb1..5f83269a8f 100644 --- a/src/regexp/regexp.cc +++ b/src/regexp/regexp.cc @@ -76,7 +76,8 @@ class RegExpImpl final : public AllStatic { // Returns an empty handle in case of an exception. V8_WARN_UNUSED_RESULT static MaybeHandle IrregexpExec( Isolate* isolate, Handle regexp, Handle subject, - int index, Handle last_match_info); + int index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone); static bool CompileIrregexp(Isolate* isolate, Handle re, Handle sample_subject, bool is_one_byte); @@ -268,15 +269,17 @@ bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle re, // static MaybeHandle RegExp::ExperimentalOneshotExec( Isolate* isolate, Handle regexp, Handle subject, - int index, Handle last_match_info) { + int index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks) { return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index, - last_match_info); + last_match_info, exec_quirks); } // static MaybeHandle RegExp::Exec(Isolate* isolate, Handle regexp, Handle subject, int index, - Handle last_match_info) { + Handle last_match_info, + ExecQuirks exec_quirks) { switch (regexp->TypeTag()) { case JSRegExp::NOT_COMPILED: UNREACHABLE(); @@ -285,10 +288,10 @@ MaybeHandle RegExp::Exec(Isolate* isolate, Handle regexp, last_match_info); case JSRegExp::IRREGEXP: return RegExpImpl::IrregexpExec(isolate, regexp, subject, index, - last_match_info); + last_match_info, exec_quirks); case 
JSRegExp::EXPERIMENTAL: return ExperimentalRegExp::Exec(isolate, regexp, subject, index, - last_match_info); + last_match_info, exec_quirks); } } @@ -641,7 +644,8 @@ int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle regexp, MaybeHandle RegExpImpl::IrregexpExec( Isolate* isolate, Handle regexp, Handle subject, - int previous_index, Handle last_match_info) { + int previous_index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks) { DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); subject = String::Flatten(isolate, subject); @@ -691,6 +695,11 @@ MaybeHandle RegExpImpl::IrregexpExec( output_registers, required_registers); if (res == RegExp::RE_SUCCESS) { + if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) { + if (output_registers[0] >= subject->length()) { + return isolate->factory()->null_value(); + } + } int capture_count = regexp->CaptureCount(); return RegExp::SetLastMatchInfo(isolate, last_match_info, subject, capture_count, output_registers); diff --git a/src/regexp/regexp.h b/src/regexp/regexp.h index 3e20b5f80c..40fe832fd7 100644 --- a/src/regexp/regexp.h +++ b/src/regexp/regexp.h @@ -86,16 +86,28 @@ class RegExp final : public AllStatic { kFromJs = 1, }; + enum class ExecQuirks { + kNone, + // Used to work around an issue in the RegExpPrototypeSplit fast path, + // which diverges from the spec by not creating a sticky copy of the RegExp + // instance and calling `exec` in a loop. If called in this context, we + // must not update the last_match_info on a successful match at the subject + // string end. See crbug.com/1075514 for more information. + kTreatMatchAtEndAsFailure, + }; + // See ECMA-262 section 15.10.6.2. // This function calls the garbage collector if necessary. 
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle Exec( Isolate* isolate, Handle regexp, Handle subject, - int index, Handle last_match_info); + int index, Handle last_match_info, + ExecQuirks exec_quirks = ExecQuirks::kNone); V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle ExperimentalOneshotExec(Isolate* isolate, Handle regexp, Handle subject, int index, - Handle last_match_info); + Handle last_match_info, + ExecQuirks exec_quirks = ExecQuirks::kNone); // Integral return values used throughout regexp code layers. static constexpr int kInternalRegExpFailure = 0; diff --git a/src/runtime/runtime-regexp.cc b/src/runtime/runtime-regexp.cc index b5f262081d..403d83bef9 100644 --- a/src/runtime/runtime-regexp.cc +++ b/src/runtime/runtime-regexp.cc @@ -861,6 +861,36 @@ RUNTIME_FUNCTION(Runtime_StringSplit) { return *result; } +namespace { + +MaybeHandle RegExpExec(Isolate* isolate, Handle regexp, + Handle subject, int32_t index, + Handle last_match_info, + RegExp::ExecQuirks exec_quirks) { + // Due to the way the JS calls are constructed this must be less than the + // length of a string, i.e. it is always a Smi. We check anyway for security. + CHECK_LE(0, index); + CHECK_GE(subject->length(), index); + isolate->counters()->regexp_entry_runtime()->Increment(); + return RegExp::Exec(isolate, regexp, subject, index, last_match_info, + exec_quirks); +} + +MaybeHandle ExperimentalOneshotExec( + Isolate* isolate, Handle regexp, Handle subject, + int32_t index, Handle last_match_info, + RegExp::ExecQuirks exec_quirks) { + // Due to the way the JS calls are constructed this must be less than the + // length of a string, i.e. it is always a Smi. We check anyway for security. 
+ CHECK_LE(0, index); + CHECK_GE(subject->length(), index); + isolate->counters()->regexp_entry_runtime()->Increment(); + return RegExp::ExperimentalOneshotExec(isolate, regexp, subject, index, + last_match_info, exec_quirks); +} + +} // namespace + RUNTIME_FUNCTION(Runtime_RegExpExec) { HandleScope scope(isolate); DCHECK_EQ(4, args.length()); @@ -868,13 +898,21 @@ RUNTIME_FUNCTION(Runtime_RegExpExec) { CONVERT_ARG_HANDLE_CHECKED(String, subject, 1); CONVERT_INT32_ARG_CHECKED(index, 2); CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3); - // Due to the way the JS calls are constructed this must be less than the - // length of a string, i.e. it is always a Smi. We check anyway for security. - CHECK_LE(0, index); - CHECK_GE(subject->length(), index); - isolate->counters()->regexp_entry_runtime()->Increment(); RETURN_RESULT_OR_FAILURE( - isolate, RegExp::Exec(isolate, regexp, subject, index, last_match_info)); + isolate, RegExpExec(isolate, regexp, subject, index, last_match_info, + RegExp::ExecQuirks::kNone)); +} + +RUNTIME_FUNCTION(Runtime_RegExpExecTreatMatchAtEndAsFailure) { + HandleScope scope(isolate); + DCHECK_EQ(4, args.length()); + CONVERT_ARG_HANDLE_CHECKED(JSRegExp, regexp, 0); + CONVERT_ARG_HANDLE_CHECKED(String, subject, 1); + CONVERT_INT32_ARG_CHECKED(index, 2); + CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3); + RETURN_RESULT_OR_FAILURE( + isolate, RegExpExec(isolate, regexp, subject, index, last_match_info, + RegExp::ExecQuirks::kTreatMatchAtEndAsFailure)); } RUNTIME_FUNCTION(Runtime_RegExpExperimentalOneshotExec) { @@ -884,14 +922,24 @@ RUNTIME_FUNCTION(Runtime_RegExpExperimentalOneshotExec) { CONVERT_ARG_HANDLE_CHECKED(String, subject, 1); CONVERT_INT32_ARG_CHECKED(index, 2); CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3); - // Due to the way the JS calls are constructed this must be less than the - // length of a string, i.e. it is always a Smi. We check anyway for security. 
- CHECK_LE(0, index); - CHECK_GE(subject->length(), index); - isolate->counters()->regexp_entry_runtime()->Increment(); RETURN_RESULT_OR_FAILURE( - isolate, RegExp::ExperimentalOneshotExec(isolate, regexp, subject, index, - last_match_info)); + isolate, + ExperimentalOneshotExec(isolate, regexp, subject, index, last_match_info, + RegExp::ExecQuirks::kNone)); +} + +RUNTIME_FUNCTION( + Runtime_RegExpExperimentalOneshotExecTreatMatchAtEndAsFailure) { + HandleScope scope(isolate); + DCHECK_EQ(4, args.length()); + CONVERT_ARG_HANDLE_CHECKED(JSRegExp, regexp, 0); + CONVERT_ARG_HANDLE_CHECKED(String, subject, 1); + CONVERT_INT32_ARG_CHECKED(index, 2); + CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3); + RETURN_RESULT_OR_FAILURE( + isolate, + ExperimentalOneshotExec(isolate, regexp, subject, index, last_match_info, + RegExp::ExecQuirks::kTreatMatchAtEndAsFailure)); } RUNTIME_FUNCTION(Runtime_RegExpBuildIndices) { diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 7fa0d32aee..4f626c0870 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -384,16 +384,18 @@ namespace internal { F(JSProxyGetTarget, 1, 1) \ F(SetPropertyWithReceiver, 4, 1) -#define FOR_EACH_INTRINSIC_REGEXP(F, I) \ - I(IsRegExp, 1, 1) \ - F(RegExpBuildIndices, 3, 1) \ - F(RegExpExec, 4, 1) \ - F(RegExpExperimentalOneshotExec, 4, 1) \ - F(RegExpExecMultiple, 4, 1) \ - F(RegExpInitializeAndCompile, 3, 1) \ - F(RegExpReplaceRT, 3, 1) \ - F(RegExpSplit, 3, 1) \ - F(StringReplaceNonGlobalRegExpWithFunction, 3, 1) \ +#define FOR_EACH_INTRINSIC_REGEXP(F, I) \ + I(IsRegExp, 1, 1) \ + F(RegExpBuildIndices, 3, 1) \ + F(RegExpExec, 4, 1) \ + F(RegExpExecTreatMatchAtEndAsFailure, 4, 1) \ + F(RegExpExperimentalOneshotExec, 4, 1) \ + F(RegExpExperimentalOneshotExecTreatMatchAtEndAsFailure, 4, 1) \ + F(RegExpExecMultiple, 4, 1) \ + F(RegExpInitializeAndCompile, 3, 1) \ + F(RegExpReplaceRT, 3, 1) \ + F(RegExpSplit, 3, 1) \ + F(StringReplaceNonGlobalRegExpWithFunction, 3, 1) \ 
F(StringSplit, 3, 1) #define FOR_EACH_INTRINSIC_SCOPES(F, I) \ diff --git a/test/mjsunit/regress/regress-1075514.js b/test/mjsunit/regress/regress-1075514.js new file mode 100644 index 0000000000..ff0510c36a --- /dev/null +++ b/test/mjsunit/regress/regress-1075514.js @@ -0,0 +1,21 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +const re = /$/; + +// The runtime path (Runtime::kRegExpExec). +assertEquals(["a"], "a".split(re)); +assertEquals("", RegExp.input); + +// Runtime / compilation to generated code. +assertEquals(["a"], "a".split(re)); +assertEquals("", RegExp.input); + +// Generated code. +assertEquals(["a"], "a".split(re)); +assertEquals("", RegExp.input); + +// Once again just because we can. +assertEquals(["a"], "a".split(re)); +assertEquals("", RegExp.input);