[regexp] Add UseCounter for invalid regexp with /v, but valid with /u

Some patterns that were valid with /u are invalid with /v.
This CL adds a UseCounter for such usages in /u to get an idea how often
they are used in the wild.
This is important information w.r.t the proposal to use /v instead of /u
for the pattern attribute (http://go/gh/whatwg/html/pull/7908).

Chromium CL: https://crrev.com/c/4221395

Bug: v8:11935
Change-Id: Idc023ceba9ce03eee578d6c387ce8a8f37db292f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4212393
Reviewed-by: Camillo Bruni <cbruni@chromium.org>
Reviewed-by: Jakob Linke <jgruber@chromium.org>
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Commit-Queue: Patrick Thier <pthier@chromium.org>
Cr-Commit-Position: refs/heads/main@{#85639}
This commit is contained in:
pthier 2023-02-01 13:43:36 +01:00 committed by V8 LUCI CQ
parent d3a3d73384
commit 5080c7727a
3 changed files with 81 additions and 9 deletions

View File

@ -538,6 +538,7 @@ class V8_EXPORT Isolate {
kAsyncStackTaggingCreateTaskCall = 116,
kDurationFormat = 117,
kInvalidatedNumberStringPrototypeNoReplaceProtector = 118,
kRegExpUnicodeSetIncompatibilitiesWithUnicodeMode = 119,
// If you add new values here, you'll also need to update Chromium's:
// web_feature.mojom, use_counter_callback.cc, and enums.xml. V8 changes to

View File

@ -429,8 +429,8 @@ class RegExpParserState : public ZoneObject {
template <class CharT>
class RegExpParserImpl final {
private:
RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
uintptr_t stack_limit, Zone* zone,
RegExpParserImpl(Isolate* isolate, const CharT* input, int input_length,
RegExpFlags flags, uintptr_t stack_limit, Zone* zone,
const DisallowGarbageCollection& no_gc);
bool Parse(RegExpCompileData* result);
@ -563,6 +563,7 @@ class RegExpParserImpl final {
bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
Zone* zone() const { return zone_; }
Isolate* isolate() const { return isolate_; }
base::uc32 current() const { return current_; }
bool has_more() const { return has_more_; }
@ -603,6 +604,10 @@ class RegExpParserImpl final {
const DisallowGarbageCollection no_gc_;
Zone* const zone_;
// TODO(pthier, v8:11935): Isolate is only used to increment the UseCounter
// for unicode set incompabilities in unicode mode. Remove when the counter
// is removed.
Isolate* const isolate_;
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
@ -629,9 +634,10 @@ class RegExpParserImpl final {
template <class CharT>
RegExpParserImpl<CharT>::RegExpParserImpl(
const CharT* input, int input_length, RegExpFlags flags,
Isolate* isolate, const CharT* input, int input_length, RegExpFlags flags,
uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
: zone_(zone),
isolate_(isolate),
captures_(nullptr),
named_captures_(nullptr),
named_back_references_(nullptr),
@ -2417,6 +2423,21 @@ void RegExpParserImpl<CharT>::ParseClassEscape(
if (current() != '\\') {
// Not a ClassEscape.
*char_out = current();
// Count usages of patterns that would break when replacing /u with /v.
// This is only temporarily enabled and should give us an idea if it is
// feasible to enable unicode sets for usage in the pattern attribute.
// TODO(pthier, v8:11935): Remove for M113.
// IsUnicodeMode() is true for both /u and /v, but this method is only
// called for /u.
if (IsUnicodeMode() && isolate() != nullptr) {
const bool unicode_sets_invalid =
IsClassSetSyntaxCharacter(*char_out) ||
IsClassSetReservedDoublePunctuator(*char_out);
if (unicode_sets_invalid) {
isolate()->CountUsage(
v8::Isolate::kRegExpUnicodeSetIncompatibilitiesWithUnicodeMode);
}
}
Advance();
return;
}
@ -3113,13 +3134,13 @@ bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
String::FlatContent content = input->GetFlatContent(no_gc);
if (content.IsOneByte()) {
base::Vector<const uint8_t> v = content.ToOneByteVector();
return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags,
stack_limit, zone, no_gc}
return RegExpParserImpl<uint8_t>{isolate, v.begin(), v.length(), flags,
stack_limit, zone, no_gc}
.Parse(result);
} else {
base::Vector<const base::uc16> v = content.ToUC16Vector();
return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags,
stack_limit, zone, no_gc}
return RegExpParserImpl<base::uc16>{
isolate, v.begin(), v.length(), flags, stack_limit, zone, no_gc}
.Parse(result);
}
}
@ -3131,8 +3152,14 @@ bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
RegExpFlags flags,
RegExpCompileData* result,
const DisallowGarbageCollection& no_gc) {
return RegExpParserImpl<CharT>{input, input_length, flags,
stack_limit, zone, no_gc}
// TODO(pthier, v8:11935): Isolate is only temporarily used to increment the
// UseCounter for unicode set incompabilities in unicode mode.
// This method is only used in the parser for early-errors. To avoid passing
// the isolate through we simply pass a nullptr. This also has the positive
// side-effect of not incrementing the UseCounter multiple times.
Isolate* isolate = nullptr;
return RegExpParserImpl<CharT>{isolate, input, input_length, flags,
stack_limit, zone, no_gc}
.Parse(result);
}

View File

@ -1654,6 +1654,42 @@ void MockUseCounterCallback(v8::Isolate* isolate,
v8::Isolate::UseCounterFeature feature) {
++global_use_counts[feature];
}
void CheckRegExpUnicodeSetIncompatibilitiesUseCounter(
v8::Isolate* isolate, const char* check_pattern) {
int* use_counts = global_use_counts;
int old_count = use_counts
[v8::Isolate::kRegExpUnicodeSetIncompatibilitiesWithUnicodeMode];
Local<v8::Context> context = isolate->GetCurrentContext();
{
v8_flags.harmony_regexp_unicode_sets = true;
std::ostringstream os;
os << "/[" << check_pattern << "]/v";
Local<v8::String> v8_source =
v8::String::NewFromUtf8(isolate, os.str().c_str()).ToLocalChecked();
MaybeLocal<v8::Script> script = v8::Script::Compile(context, v8_source);
CHECK(script.IsEmpty());
CHECK_EQ(
old_count,
use_counts
[v8::Isolate::kRegExpUnicodeSetIncompatibilitiesWithUnicodeMode]);
}
{
std::ostringstream os;
os << "/[" << check_pattern << "]/u";
Local<v8::String> v8_source =
v8::String::NewFromUtf8(isolate, os.str().c_str()).ToLocalChecked();
MaybeLocal<v8::Script> script = v8::Script::Compile(context, v8_source);
Local<v8::Value> result =
script.ToLocalChecked()->Run(context).ToLocalChecked();
CHECK(result->IsRegExp());
CHECK_EQ(
old_count + 1,
use_counts
[v8::Isolate::kRegExpUnicodeSetIncompatibilitiesWithUnicodeMode]);
}
}
} // namespace
using RegExpTestWithContext = TestWithContext;
@ -1720,6 +1756,14 @@ TEST_F(RegExpTestWithContext, UseCountRegExp) {
CHECK_EQ(2, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
CHECK(resultToStringError->IsObject());
const char* incompatible_patterns[] = {
"(", ")", "[", "{", "}", "/", "-", "|", "&&",
"!!", "##", "$$", "%%", "**", "++", ",,", "..", "::",
";;", "<<", "==", ">>", "??", "@@", "^^^", "``", "~~"};
for (auto pattern : incompatible_patterns) {
CheckRegExpUnicodeSetIncompatibilitiesUseCounter(v8_isolate(), pattern);
}
}
class UncachedExternalString