[regexp] Store named captures on the regexp result

This stores named captures on the regexp result object, in a new object held
by the result's 'group' property.
For instance, /(?<a>.)/u.exec("b") will return a result such that:

result.group.a  // "b"

The spec proposal is not yet final, so this may still change in the future.

BUG=v8:5437

Review-Url: https://codereview.chromium.org/2630233003
Cr-Original-Commit-Position: refs/heads/master@{#42532}
Committed: 70000946eb
Review-Url: https://codereview.chromium.org/2630233003
Cr-Commit-Position: refs/heads/master@{#42570}
This commit is contained in:
jgruber 2017-01-20 08:11:13 -08:00 committed by Commit bot
parent 24b9fc3a9a
commit ee94fa11ed
5 changed files with 139 additions and 20 deletions

View File

@ -33,8 +33,9 @@ class RegExpBuiltinsAssembler : public CodeStubAssembler {
void StoreLastIndex(Node* context, Node* regexp, Node* value,
bool is_fastpath);
Node* ConstructNewResultFromMatchInfo(Node* context, Node* match_info,
Node* string);
Node* ConstructNewResultFromMatchInfo(Node* const context, Node* const regexp,
Node* const match_info,
Node* const string);
Node* RegExpPrototypeExecBodyWithoutResult(Node* const context,
Node* const regexp,
@ -141,10 +142,10 @@ void RegExpBuiltinsAssembler::StoreLastIndex(Node* context, Node* regexp,
}
}
Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
Node* match_info,
Node* string) {
Label out(this);
Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(
Node* const context, Node* const regexp, Node* const match_info,
Node* const string) {
Label named_captures(this), out(this);
Node* const num_indices = SmiUntag(LoadFixedArrayElement(
match_info, RegExpMatchInfo::kNumberOfCapturesIndex));
@ -164,7 +165,8 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
StoreFixedArrayElement(result_elements, 0, first, SKIP_WRITE_BARRIER);
GotoIf(SmiEqual(num_results, SmiConstant(Smi::FromInt(1))), &out);
// If no captures exist we can skip named capture handling as well.
GotoIf(SmiEqual(num_results, SmiConstant(1)), &out);
// Store all remaining captures.
Node* const limit = IntPtrAdd(
@ -187,7 +189,7 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
Node* const start = LoadFixedArrayElement(match_info, from_cursor);
Label next_iter(this);
GotoIf(SmiEqual(start, SmiConstant(Smi::FromInt(-1))), &next_iter);
GotoIf(SmiEqual(start, SmiConstant(-1)), &next_iter);
Node* const from_cursor_plus1 = IntPtrAdd(from_cursor, IntPtrConstant(1));
Node* const end = LoadFixedArrayElement(match_info, from_cursor_plus1);
@ -199,7 +201,85 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
Bind(&next_iter);
var_from_cursor.Bind(IntPtrAdd(from_cursor, IntPtrConstant(2)));
var_to_cursor.Bind(IntPtrAdd(to_cursor, IntPtrConstant(1)));
Branch(UintPtrLessThan(var_from_cursor.value(), limit), &loop, &out);
Branch(UintPtrLessThan(var_from_cursor.value(), limit), &loop,
&named_captures);
}
Bind(&named_captures);
{
// We reach this point only if captures exist, implying that this is an
// IRREGEXP JSRegExp.
CSA_ASSERT(this, HasInstanceType(regexp, JS_REGEXP_TYPE));
CSA_ASSERT(this, SmiGreaterThan(num_results, SmiConstant(1)));
// Preparations for named capture properties. Exit early if the result does
// not have any named captures to minimize performance impact.
Node* const data = LoadObjectField(regexp, JSRegExp::kDataOffset);
CSA_ASSERT(this, SmiEqual(LoadFixedArrayElement(data, JSRegExp::kTagIndex),
SmiConstant(JSRegExp::IRREGEXP)));
// The names fixed array associates names at even indices with a capture
// index at odd indices.
Node* const names =
LoadFixedArrayElement(data, JSRegExp::kIrregexpCaptureNameMapIndex);
GotoIf(SmiEqual(names, SmiConstant(0)), &out);
// Allocate a new object to store the named capture properties.
// TODO(jgruber): Could be optimized by adding the object map to the heap
// root list.
Node* const native_context = LoadNativeContext(context);
Node* const object_function =
LoadContextElement(native_context, Context::OBJECT_FUNCTION_INDEX);
Node* const object_function_map = LoadObjectField(
object_function, JSFunction::kPrototypeOrInitialMapOffset);
Node* const group_object = AllocateJSObjectFromMap(object_function_map);
// Store it on the result as a 'group' property.
{
Node* const name = HeapConstant(isolate()->factory()->group_string());
Node* const language_mode = SmiConstant(Smi::FromInt(STRICT));
CallRuntime(Runtime::kSetProperty, context, result, name, group_object,
language_mode);
}
// One or more named captures exist, add a property for each one.
CSA_ASSERT(this, HasInstanceType(names, FIXED_ARRAY_TYPE));
Node* const names_length = LoadAndUntagFixedArrayBaseLength(names);
CSA_ASSERT(this, IntPtrGreaterThan(names_length, IntPtrConstant(0)));
Variable var_i(this, MachineType::PointerRepresentation());
var_i.Bind(IntPtrConstant(0));
Variable* vars[] = {&var_i};
const int vars_count = sizeof(vars) / sizeof(vars[0]);
Label loop(this, vars_count, vars);
Goto(&loop);
Bind(&loop);
{
Node* const i = var_i.value();
Node* const i_plus_1 = IntPtrAdd(i, IntPtrConstant(1));
Node* const i_plus_2 = IntPtrAdd(i_plus_1, IntPtrConstant(1));
Node* const name = LoadFixedArrayElement(names, i);
Node* const index = LoadFixedArrayElement(names, i_plus_1);
Node* const capture =
LoadFixedArrayElement(result_elements, SmiUntag(index));
Node* const language_mode = SmiConstant(Smi::FromInt(STRICT));
CallRuntime(Runtime::kSetProperty, context, group_object, name, capture,
language_mode);
var_i.Bind(i_plus_2);
Branch(IntPtrGreaterThanOrEqual(var_i.value(), names_length), &out,
&loop);
}
}
Bind(&out);
@ -352,7 +432,7 @@ Node* RegExpBuiltinsAssembler::RegExpPrototypeExecBody(Node* const context,
{
Node* const match_indices = indices_or_null;
Node* const result =
ConstructNewResultFromMatchInfo(context, match_indices, string);
ConstructNewResultFromMatchInfo(context, regexp, match_indices, string);
var_result.Bind(result);
Goto(&out);
}
@ -2522,7 +2602,7 @@ TF_BUILTIN(RegExpInternalMatch, RegExpBuiltinsAssembler) {
Bind(&if_matched);
{
Node* result =
ConstructNewResultFromMatchInfo(context, match_indices, string);
ConstructNewResultFromMatchInfo(context, regexp, match_indices, string);
Return(result);
}
}

View File

@ -88,6 +88,7 @@
V(get_string, "get") \
V(get_space_string, "get ") \
V(global_string, "global") \
V(group_string, "group") \
V(has_string, "has") \
V(hour_string, "hour") \
V(ignoreCase_string, "ignoreCase") \

View File

@ -770,6 +770,15 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name);
// Disallow captures named '__proto__'.
static const char16_t proto_string[] = u"__proto__";
if (name->size() == arraysize(proto_string) - 1) {
if (std::equal(name->begin(), name->end(), &proto_string[0])) {
ReportError(CStrVector("Illegal capture group name"));
return false;
}
}
if (named_captures_ == nullptr) {
named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
} else {

View File

@ -18,15 +18,15 @@ assertThrows("/(?<ab>a)\\k<a>/u"); // Invalid reference.
assertThrows("/\\k<a>(?<ab>a)/u"); // Invalid reference.
// Fallback behavior in non-unicode mode.
assertThrows("/(?<>a)/");
assertThrows("/(?<aa)/");
assertThrows("/(?<42a>a)/");
assertThrows("/(?<:a>a)/");
assertThrows("/(?<a:>a)/");
assertThrows("/(?<a>a)(?<a>a)/");
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/");
assertThrows("/(?<a>a)\\k<ab>/");
assertThrows("/(?<ab>a)\\k<a>/");
assertThrows("/(?<>a)/", SyntaxError);
assertThrows("/(?<aa)/", SyntaxError);
assertThrows("/(?<42a>a)/", SyntaxError);
assertThrows("/(?<:a>a)/", SyntaxError);
assertThrows("/(?<a:>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<a>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/", SyntaxError);
assertThrows("/(?<a>a)\\k<ab>/", SyntaxError);
assertThrows("/(?<ab>a)\\k<a>/", SyntaxError);
assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/));
assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/));
@ -74,3 +74,29 @@ assertEquals(["bab", "b"], "bab".match(/(?<a>\k<a>\w)../u));
// Reference before group.
assertEquals(["bab", "b"], "bab".match(/\k<a>(?<a>b)\w\k<a>/u));
assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u));
// Reference properties.
assertEquals("a", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.a);
assertEquals("b", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.b);
assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.c);
assertFalse(/(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.hasOwnProperty("c"));
assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>|(?<c>c)/u.exec("aba").group.c);
assertTrue(/(?<a>a)(?<b>b)\k<a>|(?<c>c)/u
.exec("aba").group.hasOwnProperty("c"));
// Unicode names.
assertEquals("a", /(?<π>a)/u.exec("bab").group.π);
assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").group.\u03C0);
assertEquals("a", /(?<$>a)/u.exec("bab").group.$);
assertEquals("a", /(?<_>a)/u.exec("bab").group._);
assertEquals("a", /(?<$𐒤>a)/u.exec("bab").group.$𐒤);
assertEquals("a", /(?<_\u200C>a)/u.exec("bab").group._\u200C);
assertEquals("a", /(?<_\u200D>a)/u.exec("bab").group._\u200D);
// The looked-up property must spell the whole capture name (ಠ_ಠ); a partial
// name such as _ಠ is never defined on the group object and reads as undefined.
assertEquals("a", /(?<ಠ_ಠ>a)/u.exec("bab").group.ಠ_ಠ);
assertThrows('/(?<❤>a)/u', SyntaxError);
assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
// Capture name conflicts.
assertThrows(() => /(?<__proto__>a)/u, SyntaxError);
assertEquals("a", /(?<__proto_>a)/u.exec("a").group.__proto_);
assertEquals("a", /(?<__proto___>a)/u.exec("a").group.__proto___);

View File

@ -156,6 +156,9 @@
# desugaring regexp property class relies on ICU.
'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]],
# The noi18n build cannot parse characters in the supplementary planes.
'harmony/regexp-named-captures': [PASS, ['no_i18n == True', FAIL]],
# Allocates a large array buffer, which TSAN sometimes cannot handle.
'regress/regress-599717': [PASS, ['tsan', SKIP]],