[regexp] Don't update last match info in @@split special case

V8 implements a fast-path for RegExp.prototype.split which diverges
from the spec: instead of creating a new sticky regexp instance
`splitter` and running it in a loop, we reuse the existing non-sticky
regexp without looping through each character.

This works fine in most cases, but we run into issues when matching at
the very end of the string. According to the spec, matches at the end
of the string are impossible in @@split, but in our fast-path
implementation they can happen.

The obvious fix would be to remove our fast-path, but this comes with
high performance costs. The fix implemented in this CL adds a special
flag to `exec` such that matches at the end of the string can be treated
as failures. This is only relevant for @@split.

Bug: chromium:1075514
Change-Id: Ifb790ed116793998d7aeb37e307f3f3f764023d3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2681950
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Shu-yu Guo <syg@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72644}
This commit is contained in:
Jakob Gruber 2021-02-11 07:01:12 +01:00 committed by Commit Bot
parent fc8743da42
commit 51fcfd585c
9 changed files with 182 additions and 60 deletions

View File

@ -18,7 +18,6 @@
#include "src/objects/js-regexp-string-iterator.h"
#include "src/objects/js-regexp.h"
#include "src/objects/regexp-match-info.h"
#include "src/regexp/regexp.h"
namespace v8 {
namespace internal {
@ -436,7 +435,8 @@ void RegExpBuiltinsAssembler::GetStringPointers(
TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
TNode<Context> context, TNode<JSRegExp> regexp, TNode<String> string,
TNode<Number> last_index, TNode<RegExpMatchInfo> match_info) {
TNode<Number> last_index, TNode<RegExpMatchInfo> match_info,
RegExp::ExecQuirks exec_quirks) {
ToDirectStringAssembler to_direct(state(), string);
TVARIABLE(HeapObject, var_result);
@ -676,6 +676,14 @@ TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
BIND(&if_success);
{
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
static constexpr int kMatchStartOffset = 0;
TNode<IntPtrT> value = ChangeInt32ToIntPtr(UncheckedCast<Int32T>(
Load(MachineType::Int32(), static_offsets_vector_address,
IntPtrConstant(kMatchStartOffset))));
GotoIf(UintPtrGreaterThanOrEqual(value, int_string_length), &if_failure);
}
// Check that the last match info has space for the capture registers and
// the additional information. Ensure no overflow in add.
STATIC_ASSERT(FixedArray::kMaxLength < kMaxInt - FixedArray::kLengthOffset);
@ -747,15 +755,22 @@ TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
BIND(&retry_experimental);
{
var_result =
CAST(CallRuntime(Runtime::kRegExpExperimentalOneshotExec, context,
regexp, string, last_index, match_info));
auto target_fn =
exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure
? Runtime::kRegExpExperimentalOneshotExecTreatMatchAtEndAsFailure
: Runtime::kRegExpExperimentalOneshotExec;
var_result = CAST(CallRuntime(target_fn, context, regexp, string,
last_index, match_info));
Goto(&out);
}
BIND(&runtime);
{
var_result = CAST(CallRuntime(Runtime::kRegExpExec, context, regexp, string,
auto target_fn =
exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure
? Runtime::kRegExpExecTreatMatchAtEndAsFailure
: Runtime::kRegExpExec;
var_result = CAST(CallRuntime(target_fn, context, regexp, string,
last_index, match_info));
Goto(&out);
}
@ -951,6 +966,14 @@ TF_BUILTIN(RegExpExecAtom, RegExpBuiltinsAssembler) {
const TNode<String> needle_string =
CAST(UnsafeLoadFixedArrayElement(data, JSRegExp::kAtomPatternIndex));
// ATOM patterns are guaranteed to not be the empty string (these are
// intercepted and replaced in JSRegExp::Initialize).
//
// This is especially relevant for crbug.com/1075514: atom patterns are
// non-empty and thus guaranteed not to match at the end of the string.
CSA_ASSERT(this, IntPtrGreaterThan(LoadStringLengthAsWord(needle_string),
IntPtrConstant(0)));
const TNode<Smi> match_from =
CAST(CallBuiltin(Builtins::kStringIndexOf, context, subject_string,
needle_string, last_index));
@ -1609,9 +1632,9 @@ TNode<JSArray> RegExpBuiltinsAssembler::RegExpPrototypeSplitBody(
const TNode<Object> last_match_info = LoadContextElement(
native_context, Context::REGEXP_LAST_MATCH_INFO_INDEX);
const TNode<HeapObject> match_indices_ho =
CAST(CallBuiltin(Builtins::kRegExpExecInternal, context, regexp, string,
next_search_from, last_match_info));
const TNode<HeapObject> match_indices_ho = RegExpExecInternal(
context, regexp, string, next_search_from, CAST(last_match_info),
RegExp::ExecQuirks::kTreatMatchAtEndAsFailure);
// We're done if no match was found.
{
@ -1623,16 +1646,9 @@ TNode<JSArray> RegExpBuiltinsAssembler::RegExpPrototypeSplitBody(
TNode<FixedArray> match_indices = CAST(match_indices_ho);
const TNode<Smi> match_from = CAST(UnsafeLoadFixedArrayElement(
match_indices, RegExpMatchInfo::kFirstCaptureIndex));
// We're done if the match starts beyond the string.
{
Label next(this);
Branch(SmiEqual(match_from, string_length), &push_suffix_and_out, &next);
BIND(&next);
}
const TNode<Smi> match_to = CAST(UnsafeLoadFixedArrayElement(
match_indices, RegExpMatchInfo::kFirstCaptureIndex + 1));
CSA_ASSERT(this, SmiNotEqual(match_from, string_length));
// Advance index and continue if the match is empty.
{

View File

@ -8,6 +8,7 @@
#include "src/base/optional.h"
#include "src/codegen/code-stub-assembler.h"
#include "src/common/message-template.h"
#include "src/regexp/regexp.h"
namespace v8 {
namespace internal {
@ -51,11 +52,10 @@ class RegExpBuiltinsAssembler : public CodeStubAssembler {
TVariable<RawPtrT>* var_string_end);
// Low level logic around the actual call into pattern matching code.
TNode<HeapObject> RegExpExecInternal(TNode<Context> context,
TNode<JSRegExp> regexp,
TNode<String> string,
TNode<Number> last_index,
TNode<RegExpMatchInfo> match_info);
TNode<HeapObject> RegExpExecInternal(
TNode<Context> context, TNode<JSRegExp> regexp, TNode<String> string,
TNode<Number> last_index, TNode<RegExpMatchInfo> match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
TNode<JSRegExpResult> ConstructNewResultFromMatchInfo(
TNode<Context> context, TNode<JSRegExp> regexp,

View File

@ -214,7 +214,8 @@ int32_t ExperimentalRegExp::MatchForCallFromJs(
MaybeHandle<Object> ExperimentalRegExp::Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int subject_index, Handle<RegExpMatchInfo> last_match_info) {
int subject_index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
DCHECK(FLAG_enable_experimental_regexp_engine);
DCHECK_EQ(regexp->TypeTag(), JSRegExp::EXPERIMENTAL);
#ifdef VERIFY_HEAP
@ -248,6 +249,11 @@ MaybeHandle<Object> ExperimentalRegExp::Exec(
if (num_matches > 0) {
DCHECK_EQ(num_matches, 1);
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
if (output_registers[0] >= subject->length()) {
return isolate->factory()->null_value();
}
}
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);
} else if (num_matches == 0) {
@ -285,7 +291,8 @@ int32_t ExperimentalRegExp::OneshotExecRaw(Isolate* isolate,
MaybeHandle<Object> ExperimentalRegExp::OneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int subject_index, Handle<RegExpMatchInfo> last_match_info) {
int subject_index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
DCHECK(FLAG_enable_experimental_regexp_engine_on_excessive_backtracks);
DCHECK_NE(regexp->TypeTag(), JSRegExp::NOT_COMPILED);
@ -306,6 +313,11 @@ MaybeHandle<Object> ExperimentalRegExp::OneshotExec(
if (num_matches > 0) {
DCHECK_EQ(num_matches, 1);
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
if (output_registers[0] >= subject->length()) {
return isolate->factory()->null_value();
}
}
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);
} else if (num_matches == 0) {

View File

@ -36,9 +36,10 @@ class ExperimentalRegExp final : public AllStatic {
Address backtrack_stack,
RegExp::CallOrigin call_origin,
Isolate* isolate, Address regexp);
static MaybeHandle<Object> Exec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info);
static MaybeHandle<Object> Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
static int32_t ExecRaw(Isolate* isolate, RegExp::CallOrigin call_origin,
JSRegExp regexp, String subject,
int32_t* output_registers,
@ -48,7 +49,8 @@ class ExperimentalRegExp final : public AllStatic {
// its type tag. The regexp itself is not changed (apart from lastIndex).
static MaybeHandle<Object> OneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
static int32_t OneshotExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject,
int32_t* output_registers,

View File

@ -76,7 +76,8 @@ class RegExpImpl final : public AllStatic {
// Returns an empty handle in case of an exception.
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject, bool is_one_byte);
@ -268,15 +269,17 @@ bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
// static
MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info) {
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
last_match_info);
last_match_info, exec_quirks);
}
// static
MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info) {
Handle<RegExpMatchInfo> last_match_info,
ExecQuirks exec_quirks) {
switch (regexp->TypeTag()) {
case JSRegExp::NOT_COMPILED:
UNREACHABLE();
@ -285,10 +288,10 @@ MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
last_match_info);
case JSRegExp::IRREGEXP:
return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
last_match_info);
last_match_info, exec_quirks);
case JSRegExp::EXPERIMENTAL:
return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
last_match_info);
last_match_info, exec_quirks);
}
}
@ -641,7 +644,8 @@ int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
MaybeHandle<Object> RegExpImpl::IrregexpExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int previous_index, Handle<RegExpMatchInfo> last_match_info) {
int previous_index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
subject = String::Flatten(isolate, subject);
@ -691,6 +695,11 @@ MaybeHandle<Object> RegExpImpl::IrregexpExec(
output_registers, required_registers);
if (res == RegExp::RE_SUCCESS) {
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
if (output_registers[0] >= subject->length()) {
return isolate->factory()->null_value();
}
}
int capture_count = regexp->CaptureCount();
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);

View File

@ -86,16 +86,28 @@ class RegExp final : public AllStatic {
kFromJs = 1,
};
// Optional behaviour tweaks that callers can request from Exec and
// ExperimentalOneshotExec through their `exec_quirks` parameter.
enum class ExecQuirks {
// Default: no special handling.
kNone,
// Used to work around an issue in the RegExpPrototypeSplit fast path,
// which diverges from the spec by not creating a sticky copy of the RegExp
// instance and calling `exec` in a loop. If called in this context, we
// must not update the last_match_info on a successful match at the subject
// string end. See crbug.com/1075514 for more information.
//
// With this quirk set, a successful match whose start offset is at (or
// beyond) the subject length is reported as a failure (null result) and
// last_match_info is left untouched.
kTreatMatchAtEndAsFailure,
};
// See ECMA-262 section 15.10.6.2.
// This function calls the garbage collector if necessary.
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
int index, Handle<RegExpMatchInfo> last_match_info,
ExecQuirks exec_quirks = ExecQuirks::kNone);
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object>
ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info);
Handle<RegExpMatchInfo> last_match_info,
ExecQuirks exec_quirks = ExecQuirks::kNone);
// Integral return values used throughout regexp code layers.
static constexpr int kInternalRegExpFailure = 0;

View File

@ -861,6 +861,36 @@ RUNTIME_FUNCTION(Runtime_StringSplit) {
return *result;
}
namespace {
// Shared implementation behind the RegExpExec runtime entry points: validates
// `index`, bumps the runtime-entry counter and forwards to RegExp::Exec with
// the requested `exec_quirks`.
MaybeHandle<Object> RegExpExec(Isolate* isolate, Handle<JSRegExp> regexp,
                               Handle<String> subject, int32_t index,
                               Handle<RegExpMatchInfo> last_match_info,
                               RegExp::ExecQuirks exec_quirks) {
  // The JS call sites are constructed such that `index` never exceeds the
  // subject length (so it is always a Smi). Verify it anyway as a security
  // measure.
  CHECK_LE(0, index);
  CHECK_GE(subject->length(), index);

  auto* const runtime_entries = isolate->counters()->regexp_entry_runtime();
  runtime_entries->Increment();

  MaybeHandle<Object> exec_result = RegExp::Exec(
      isolate, regexp, subject, index, last_match_info, exec_quirks);
  return exec_result;
}
// Shared implementation behind the RegExpExperimentalOneshotExec runtime entry
// points: validates `index`, bumps the runtime-entry counter and forwards to
// RegExp::ExperimentalOneshotExec with the requested `exec_quirks`.
MaybeHandle<Object> ExperimentalOneshotExec(
    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
    int32_t index, Handle<RegExpMatchInfo> last_match_info,
    RegExp::ExecQuirks exec_quirks) {
  // The JS call sites are constructed such that `index` never exceeds the
  // subject length (so it is always a Smi). Verify it anyway as a security
  // measure.
  CHECK_LE(0, index);
  CHECK_GE(subject->length(), index);

  auto* const runtime_entries = isolate->counters()->regexp_entry_runtime();
  runtime_entries->Increment();

  MaybeHandle<Object> exec_result = RegExp::ExperimentalOneshotExec(
      isolate, regexp, subject, index, last_match_info, exec_quirks);
  return exec_result;
}
} // namespace
RUNTIME_FUNCTION(Runtime_RegExpExec) {
HandleScope scope(isolate);
DCHECK_EQ(4, args.length());
@ -868,13 +898,21 @@ RUNTIME_FUNCTION(Runtime_RegExpExec) {
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Due to the way the JS calls are constructed this must be less than the
// length of a string, i.e. it is always a Smi. We check anyway for security.
CHECK_LE(0, index);
CHECK_GE(subject->length(), index);
isolate->counters()->regexp_entry_runtime()->Increment();
RETURN_RESULT_OR_FAILURE(
isolate, RegExp::Exec(isolate, regexp, subject, index, last_match_info));
isolate, RegExpExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kNone));
}
// Runtime entry point that executes a regexp with the
// RegExp::ExecQuirks::kTreatMatchAtEndAsFailure quirk enabled. It takes the
// same four arguments as Runtime_RegExpExec and differs only in the quirk it
// passes on to the file-local RegExpExec helper.
RUNTIME_FUNCTION(Runtime_RegExpExecTreatMatchAtEndAsFailure) {
HandleScope scope(isolate);
DCHECK_EQ(4, args.length());
// Arguments: 0 = regexp, 1 = subject string, 2 = start index,
// 3 = last match info.
CONVERT_ARG_HANDLE_CHECKED(JSRegExp, regexp, 0);
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Index validation and counter bookkeeping happen inside RegExpExec.
RETURN_RESULT_OR_FAILURE(
isolate, RegExpExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kTreatMatchAtEndAsFailure));
}
RUNTIME_FUNCTION(Runtime_RegExpExperimentalOneshotExec) {
@ -884,14 +922,24 @@ RUNTIME_FUNCTION(Runtime_RegExpExperimentalOneshotExec) {
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Due to the way the JS calls are constructed this must be less than the
// length of a string, i.e. it is always a Smi. We check anyway for security.
CHECK_LE(0, index);
CHECK_GE(subject->length(), index);
isolate->counters()->regexp_entry_runtime()->Increment();
RETURN_RESULT_OR_FAILURE(
isolate, RegExp::ExperimentalOneshotExec(isolate, regexp, subject, index,
last_match_info));
isolate,
ExperimentalOneshotExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kNone));
}
// Runtime entry point for a one-shot execution on the experimental regexp
// engine with the RegExp::ExecQuirks::kTreatMatchAtEndAsFailure quirk enabled.
// It takes the same four arguments as Runtime_RegExpExperimentalOneshotExec
// and differs only in the quirk it passes on to the file-local
// ExperimentalOneshotExec helper.
RUNTIME_FUNCTION(
Runtime_RegExpExperimentalOneshotExecTreatMatchAtEndAsFailure) {
HandleScope scope(isolate);
DCHECK_EQ(4, args.length());
// Arguments: 0 = regexp, 1 = subject string, 2 = start index,
// 3 = last match info.
CONVERT_ARG_HANDLE_CHECKED(JSRegExp, regexp, 0);
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Index validation and counter bookkeeping happen inside
// ExperimentalOneshotExec.
RETURN_RESULT_OR_FAILURE(
isolate,
ExperimentalOneshotExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kTreatMatchAtEndAsFailure));
}
RUNTIME_FUNCTION(Runtime_RegExpBuildIndices) {

View File

@ -384,16 +384,18 @@ namespace internal {
F(JSProxyGetTarget, 1, 1) \
F(SetPropertyWithReceiver, 4, 1)
#define FOR_EACH_INTRINSIC_REGEXP(F, I) \
I(IsRegExp, 1, 1) \
F(RegExpBuildIndices, 3, 1) \
F(RegExpExec, 4, 1) \
F(RegExpExperimentalOneshotExec, 4, 1) \
F(RegExpExecMultiple, 4, 1) \
F(RegExpInitializeAndCompile, 3, 1) \
F(RegExpReplaceRT, 3, 1) \
F(RegExpSplit, 3, 1) \
F(StringReplaceNonGlobalRegExpWithFunction, 3, 1) \
#define FOR_EACH_INTRINSIC_REGEXP(F, I) \
I(IsRegExp, 1, 1) \
F(RegExpBuildIndices, 3, 1) \
F(RegExpExec, 4, 1) \
F(RegExpExecTreatMatchAtEndAsFailure, 4, 1) \
F(RegExpExperimentalOneshotExec, 4, 1) \
F(RegExpExperimentalOneshotExecTreatMatchAtEndAsFailure, 4, 1) \
F(RegExpExecMultiple, 4, 1) \
F(RegExpInitializeAndCompile, 3, 1) \
F(RegExpReplaceRT, 3, 1) \
F(RegExpSplit, 3, 1) \
F(StringReplaceNonGlobalRegExpWithFunction, 3, 1) \
F(StringSplit, 3, 1)
#define FOR_EACH_INTRINSIC_SCOPES(F, I) \

View File

@ -0,0 +1,21 @@
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Regression test for crbug.com/1075514: splitting on a regexp that can only
// match at the very end of the subject must not record that match in the last
// match info, so RegExp.input has to stay empty after every split() call.
//
// The identical call is repeated several times; per the stage comments below
// this is presumably meant to exercise each execution tier of the regexp.
const re = /$/;
// The runtime path (Runtime::kRegExpExec).
assertEquals(["a"], "a".split(re));
// The end-of-string match must not have been recorded.
assertEquals("", RegExp.input);
// Runtime / compilation to generated code.
assertEquals(["a"], "a".split(re));
assertEquals("", RegExp.input);
// Generated code.
assertEquals(["a"], "a".split(re));
assertEquals("", RegExp.input);
// Once again just because we can.
assertEquals(["a"], "a".split(re));
assertEquals("", RegExp.input);