v8/src/regexp/regexp-parser.cc
Jakob Linke 440a0829f7 [regexp] Properly consider negated character classes for desugaring
.. instead of their non-negated form.

Fixed: v8:13097
Change-Id: I6426f5bbce2dfec2bbc64346d04f3b833d17c2b9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3802690
Commit-Queue: Jakob Linke <jgruber@chromium.org>
Reviewed-by: Patrick Thier <pthier@chromium.org>
Auto-Submit: Jakob Linke <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#82100}
2022-08-01 13:47:35 +00:00

2382 lines
76 KiB
C++

// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/regexp-parser.h"
#include "src/base/small-vector.h"
#include "src/execution/isolate.h"
#include "src/objects/string-inl.h"
#include "src/regexp/property-sequences.h"
#include "src/regexp/regexp-ast.h"
#include "src/regexp/regexp-macro-assembler.h"
#include "src/regexp/regexp.h"
#include "src/strings/char-predicates-inl.h"
#include "src/utils/ostreams.h"
#include "src/utils/utils.h"
#include "src/zone/zone-allocator.h"
#include "src/zone/zone-list-inl.h"
#ifdef V8_INTL_SUPPORT
#include "unicode/uniset.h"
#endif // V8_INTL_SUPPORT
namespace v8 {
namespace internal {
namespace {
// Whether we're currently inside the ClassEscape production
// (tc39.es/ecma262/#prod-annexB-CharacterEscape).
enum class InClassEscapeState {
kInClass,
kNotInClass,
};
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder {
public:
RegExpBuilder(Zone* zone, RegExpFlags flags)
: zone_(zone),
flags_(flags),
terms_(ZoneAllocator<RegExpTree*>{zone}),
text_(ZoneAllocator<RegExpTree*>{zone}),
alternatives_(ZoneAllocator<RegExpTree*>{zone}) {}
void AddCharacter(base::uc16 character);
void AddUnicodeCharacter(base::uc32 character);
void AddEscapedUnicodeCharacter(base::uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc);
void AddCharacterClassForDesugaring(base::uc32 c);
void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
void FlushText();
RegExpTree* ToRegExp();
RegExpFlags flags() const { return flags_; }
bool ignore_case() const { return IsIgnoreCase(flags_); }
bool multiline() const { return IsMultiline(flags_); }
bool dotall() const { return IsDotAll(flags_); }
private:
static const base::uc16 kNoPendingSurrogate = 0;
void AddLeadSurrogate(base::uc16 lead_surrogate);
void AddTrailSurrogate(base::uc16 trail_surrogate);
void FlushPendingSurrogate();
void FlushCharacters();
void FlushTerms();
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
bool NeedsDesugaringForIgnoreCase(base::uc32 c);
Zone* zone() const { return zone_; }
bool unicode() const { return IsUnicode(flags_); }
Zone* const zone_;
bool pending_empty_ = false;
const RegExpFlags flags_;
ZoneList<base::uc16>* characters_ = nullptr;
base::uc16 pending_surrogate_ = kNoPendingSurrogate;
using SmallRegExpTreeVector =
base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
SmallRegExpTreeVector terms_;
SmallRegExpTreeVector text_;
SmallRegExpTreeVector alternatives_;
#ifdef DEBUG
enum {
ADD_NONE,
ADD_CHAR,
ADD_TERM,
ADD_ASSERT,
ADD_ATOM
} last_added_ = ADD_NONE;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAROUND,
NEGATIVE_LOOKAROUND,
GROUPING
};
class RegExpParserState : public ZoneObject {
public:
// Push a state on the stack.
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index,
const ZoneVector<base::uc16>* capture_name,
RegExpFlags flags, Zone* zone)
: previous_state_(previous_state),
builder_(zone, flags),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index),
capture_name_(capture_name) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() const { return previous_state_; }
bool IsSubexpression() { return previous_state_ != nullptr; }
// RegExpBuilder building this regexp's AST.
RegExpBuilder* builder() { return &builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() const { return group_type_; }
// Lookahead or Lookbehind.
RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() const { return disjunction_capture_index_; }
// The name of the current sub-expression, if group_type is CAPTURE. Only
// used for named captures.
const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
bool IsNamedCapture() const { return capture_name_ != nullptr; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(int index) const {
for (const RegExpParserState* s = this; s != nullptr;
s = s->previous_state()) {
if (s->group_type() != CAPTURE) continue;
// Return true if we found the matching capture index.
if (index == s->capture_index()) return true;
// Abort if index is larger than what has been parsed up till this state.
if (index > s->capture_index()) return false;
}
return false;
}
// Check whether the parser is inside a capture group with the given name.
bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
DCHECK_NOT_NULL(name);
for (const RegExpParserState* s = this; s != nullptr;
s = s->previous_state()) {
if (s->capture_name() == nullptr) continue;
if (*s->capture_name() == *name) return true;
}
return false;
}
private:
// Linked list implementation of stack of states.
RegExpParserState* const previous_state_;
// Builder for the stored disjunction.
RegExpBuilder builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
const SubexpressionType group_type_;
// Stored read direction.
const RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
const int disjunction_capture_index_;
// Stored capture name (if any).
const ZoneVector<base::uc16>* const capture_name_;
};
template <class CharT>
class RegExpParserImpl final {
private:
RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
uintptr_t stack_limit, Zone* zone,
const DisallowGarbageCollection& no_gc);
bool Parse(RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is.
bool ParseHexEscape(int length, base::uc32* value);
bool ParseUnicodeEscape(base::uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
bool ParsePropertyClassName(ZoneVector<char>* name_1,
ZoneVector<char>* name_2);
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
const ZoneVector<char>& name_1,
const ZoneVector<char>& name_2);
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
base::uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);
// Parse inside a class. Either add escaped class to the range, or return
// false and pass parsed single character through |char_out|.
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents, base::uc32* char_out,
bool* is_class_escape);
// Returns true iff parsing was successful.
bool TryParseCharacterClassEscape(base::uc32 next,
InClassEscapeState in_class_escape_state,
ZoneList<CharacterRange>* ranges,
Zone* zone,
bool add_unicode_case_equivalents);
// Parses and returns a single escaped character.
base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
bool* is_escaped_unicode_character);
RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void RewindByOneCodepoint(); // Rewinds to before the previous Advance().
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple() const { return simple_; }
bool contains_anchor() const { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() const { return captures_started_; }
int position() const { return next_pos_ - 1; }
bool failed() const { return failed_; }
bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; }
static bool IsSyntaxCharacterOrSlash(base::uc32 c);
static const base::uc32 kEndMarker = (1 << 21);
private:
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
// Creates a new named capture at the specified index. Must be called exactly
// once for each named capture. Fails if a capture with the same name is
// encountered.
bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
// Parses the name of a capture group (?<name>pattern). The name must adhere
// to IdentifierName in the ECMAScript standard.
const ZoneVector<base::uc16>* ParseCaptureGroupName();
bool ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state);
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
// After the initial parsing pass, patch corresponding RegExpCapture objects
// into all RegExpBackReferences. This is done after initial parsing in order
// to avoid complicating cases in which references comes before the capture.
void PatchNamedBackReferences();
ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
Zone* zone() const { return zone_; }
base::uc32 current() const { return current_; }
bool has_more() const { return has_more_; }
bool has_next() const { return next_pos_ < input_length(); }
base::uc32 Next();
template <bool update_position>
base::uc32 ReadNext();
CharT InputAt(int index) const {
DCHECK(0 <= index && index < input_length());
return input_[index];
}
int input_length() const { return input_length_; }
void ScanForCaptures(InClassEscapeState in_class_escape_state);
struct RegExpCaptureNameLess {
bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
DCHECK_NOT_NULL(lhs);
DCHECK_NOT_NULL(rhs);
return *lhs->name() < *rhs->name();
}
};
class ForceUnicodeScope final {
public:
explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
: parser_(parser) {
DCHECK(!parser_->force_unicode_);
parser_->force_unicode_ = true;
}
~ForceUnicodeScope() {
DCHECK(parser_->force_unicode_);
parser_->force_unicode_ = false;
}
private:
RegExpParserImpl<CharT>* const parser_;
};
const DisallowGarbageCollection no_gc_;
Zone* const zone_;
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
const CharT* const input_;
const int input_length_;
base::uc32 current_;
const RegExpFlags top_level_flags_;
bool force_unicode_ = false; // Force parser to act as if unicode were set.
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.
bool has_more_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool has_named_captures_; // Only valid after we have scanned for captures.
bool failed_;
const uintptr_t stack_limit_;
friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*,
Handle<String>,
RegExpFlags,
RegExpCompileData*);
friend bool RegExpParser::VerifyRegExpSyntax<CharT>(
Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*,
const DisallowGarbageCollection&);
};
template <class CharT>
RegExpParserImpl<CharT>::RegExpParserImpl(
const CharT* input, int input_length, RegExpFlags flags,
uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
: zone_(zone),
captures_(nullptr),
named_captures_(nullptr),
named_back_references_(nullptr),
input_(input),
input_length_(input_length),
current_(kEndMarker),
top_level_flags_(flags),
next_pos_(0),
captures_started_(0),
capture_count_(0),
has_more_(true),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
has_named_captures_(false),
failed_(false),
stack_limit_(stack_limit) {
Advance();
}
template <>
template <bool update_position>
inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
int position = next_pos_;
base::uc16 c0 = InputAt(position);
position++;
DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
if (update_position) next_pos_ = position;
return c0;
}
template <>
template <bool update_position>
inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
int position = next_pos_;
base::uc16 c0 = InputAt(position);
base::uc32 result = c0;
position++;
// Read the whole surrogate pair in case of unicode flag, if possible.
if (unicode() && position < input_length() &&
unibrow::Utf16::IsLeadSurrogate(c0)) {
base::uc16 c1 = InputAt(position);
if (unibrow::Utf16::IsTrailSurrogate(c1)) {
result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
position++;
}
}
if (update_position) next_pos_ = position;
return result;
}
template <class CharT>
base::uc32 RegExpParserImpl<CharT>::Next() {
if (has_next()) {
return ReadNext<false>();
} else {
return kEndMarker;
}
}
template <class CharT>
void RegExpParserImpl<CharT>::Advance() {
if (has_next()) {
if (GetCurrentStackPosition() < stack_limit_) {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Aborting on stack overflow");
}
ReportError(RegExpError::kStackOverflow);
} else {
current_ = ReadNext<true>();
}
} else {
current_ = kEndMarker;
// Advance so that position() points to 1-after-the-last-character. This is
// important so that Reset() to this position works correctly.
next_pos_ = input_length() + 1;
has_more_ = false;
}
}
template <class CharT>
void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
if (current() == kEndMarker) return;
// Rewinds by one code point, i.e.: two code units if `current` is outside
// the basic multilingual plane (= composed of a lead and trail surrogate),
// or one code unit otherwise.
const int rewind_by =
current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;
Advance(rewind_by); // Undo the last Advance.
}
template <class CharT>
void RegExpParserImpl<CharT>::Reset(int pos) {
next_pos_ = pos;
has_more_ = (pos < input_length());
Advance();
}
template <class CharT>
void RegExpParserImpl<CharT>::Advance(int dist) {
next_pos_ += dist - 1;
Advance();
}
template <class CharT>
bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
switch (c) {
case '^':
case '$':
case '\\':
case '.':
case '*':
case '+':
case '?':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case '|':
case '/':
return true;
default:
break;
}
return false;
}
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
if (failed_) return nullptr; // Do not overwrite any existing error.
failed_ = true;
error_ = error;
error_pos_ = position();
// Zip to the end to make sure no more input is read.
current_ = kEndMarker;
next_pos_ = input_length();
return nullptr;
}
#define CHECK_FAILED /**/); \
if (failed_) return nullptr; \
((void)0
// Pattern ::
// Disjunction
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
RegExpTree* result = ParseDisjunction(CHECK_FAILED);
PatchNamedBackReferences(CHECK_FAILED);
DCHECK(!has_more());
// If the result of parsing is a literal string atom, and it has the
// same length as the input, then the atom is identical to the input.
if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
simple_ = true;
}
return result;
}
// Disjunction ::
// Alternative
// Alternative | Disjunction
// Alternative ::
// [empty]
// Term Alternative
// Term ::
// Assertion
// Atom
// Atom Quantifier
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
0, nullptr, top_level_flags_, zone());
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
while (true) {
switch (current()) {
case kEndMarker:
if (failed()) return nullptr; // E.g. the initial Advance failed.
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
return ReportError(RegExpError::kUnterminatedGroup);
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
return ReportError(RegExpError::kUnmatchedParen);
}
DCHECK_NE(INITIAL, state->group_type());
Advance();
// End disjunction parsing and convert builder content to new single
// regexp atom.
RegExpTree* body = builder->ToRegExp();
int end_capture_index = captures_started();
int capture_index = state->capture_index();
SubexpressionType group_type = state->group_type();
// Build result of subexpression.
if (group_type == CAPTURE) {
if (state->IsNamedCapture()) {
CreateNamedCaptureAtIndex(state->capture_name(),
capture_index CHECK_FAILED);
}
RegExpCapture* capture = GetCapture(capture_index);
capture->set_body(body);
body = capture;
} else if (group_type == GROUPING) {
body = zone()->template New<RegExpGroup>(body);
} else {
DCHECK(group_type == POSITIVE_LOOKAROUND ||
group_type == NEGATIVE_LOOKAROUND);
bool is_positive = (group_type == POSITIVE_LOOKAROUND);
body = zone()->template New<RegExpLookaround>(
body, is_positive, end_capture_index - capture_index,
capture_index, state->lookaround_type());
}
// Restore previous state.
state = state->previous_state();
builder = state->builder();
builder->AddAtom(body);
// For compatibility with JSC and ES3, we allow quantifiers after
// lookaheads, and break in all cases.
break;
}
case '|': {
Advance();
builder->NewAlternative();
continue;
}
case '*':
case '+':
case '?':
return ReportError(RegExpError::kNothingToRepeat);
case '^': {
Advance();
builder->AddAssertion(zone()->template New<RegExpAssertion>(
builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
: RegExpAssertion::Type::START_OF_INPUT));
set_contains_anchor();
continue;
}
case '$': {
Advance();
RegExpAssertion::Type assertion_type =
builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
: RegExpAssertion::Type::END_OF_INPUT;
builder->AddAssertion(
zone()->template New<RegExpAssertion>(assertion_type));
continue;
}
case '.': {
Advance();
ZoneList<CharacterRange>* ranges =
zone()->template New<ZoneList<CharacterRange>>(2, zone());
if (builder->dotall()) {
// Everything.
CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
ranges, false, zone());
} else {
// Everything except \x0A, \x0D, \u2028 and \u2029.
CharacterRange::AddClassEscape(
StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
}
RegExpCharacterClass* cc =
zone()->template New<RegExpCharacterClass>(zone(), ranges);
builder->AddCharacterClass(cc);
break;
}
case '(': {
state = ParseOpenParenthesis(state CHECK_FAILED);
builder = state->builder();
continue;
}
case '[': {
RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
builder->AddCharacterClass(cc->AsCharacterClass());
break;
}
// Atom ::
// \ AtomEscape
case '\\':
switch (Next()) {
case kEndMarker:
return ReportError(RegExpError::kEscapeAtEndOfPattern);
// AtomEscape ::
// [+UnicodeMode] DecimalEscape
// [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
// of DecimalEscape is ≤ NcapturingParens
// CharacterEscape (some cases of this mixed in too)
//
// TODO(jgruber): It may make sense to disentangle all the different
// cases and make the structure mirror the spec, e.g. for AtomEscape:
//
// if (TryParseDecimalEscape(...)) return;
// if (TryParseCharacterClassEscape(...)) return;
// if (TryParseCharacterEscape(...)) return;
// if (TryParseGroupName(...)) return;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
int index = 0;
const bool is_backref =
ParseBackReferenceIndex(&index CHECK_FAILED);
if (is_backref) {
if (state->IsInsideCaptureGroup(index)) {
// The back reference is inside the capture group it refers to.
// Nothing can possibly have been captured yet, so we use empty
// instead. This ensures that, when checking a back reference,
// the capture registers of the referenced capture are either
// both set or both cleared.
builder->AddEmpty();
} else {
RegExpCapture* capture = GetCapture(index);
RegExpTree* atom = zone()->template New<RegExpBackReference>(
capture, builder->flags());
builder->AddAtom(atom);
}
break;
}
// With /u, no identity escapes except for syntax characters
// are allowed. Otherwise, all identity escapes are allowed.
if (unicode()) {
return ReportError(RegExpError::kInvalidEscape);
}
base::uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
builder->AddCharacter(first_digit);
Advance(2);
break;
}
V8_FALLTHROUGH;
}
case '0': {
Advance();
if (unicode() && Next() >= '0' && Next() <= '9') {
// With /u, decimal escape with leading 0 are not parsed as octal.
return ReportError(RegExpError::kInvalidDecimalEscape);
}
base::uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
break;
}
case 'b':
Advance(2);
builder->AddAssertion(zone()->template New<RegExpAssertion>(
RegExpAssertion::Type::BOUNDARY));
continue;
case 'B':
Advance(2);
builder->AddAssertion(zone()->template New<RegExpAssertion>(
RegExpAssertion::Type::NON_BOUNDARY));
continue;
// AtomEscape ::
// CharacterClassEscape
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
case 'p':
case 'P': {
base::uc32 next = Next();
ZoneList<CharacterRange>* ranges =
zone()->template New<ZoneList<CharacterRange>>(2, zone());
bool add_unicode_case_equivalents =
unicode() && builder->ignore_case();
bool parsed_character_class_escape = TryParseCharacterClassEscape(
next, InClassEscapeState::kNotInClass, ranges, zone(),
add_unicode_case_equivalents CHECK_FAILED);
if (parsed_character_class_escape) {
RegExpCharacterClass* cc =
zone()->template New<RegExpCharacterClass>(zone(), ranges);
builder->AddCharacterClass(cc);
} else {
CHECK(!unicode());
Advance(2);
builder->AddCharacter(next); // IdentityEscape.
}
break;
}
// AtomEscape ::
// k GroupName
case 'k': {
// Either an identity escape or a named back-reference. The two
// interpretations are mutually exclusive: '\k' is interpreted as
// an identity escape for non-Unicode patterns without named
// capture groups, and as the beginning of a named back-reference
// in all other cases.
const bool has_named_captures =
HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
if (unicode() || has_named_captures) {
Advance(2);
ParseNamedBackReference(builder, state CHECK_FAILED);
break;
}
}
V8_FALLTHROUGH;
// AtomEscape ::
// CharacterEscape
default: {
bool is_escaped_unicode_character = false;
base::uc32 c = ParseCharacterEscape(
InClassEscapeState::kNotInClass,
&is_escaped_unicode_character CHECK_FAILED);
if (is_escaped_unicode_character) {
builder->AddEscapedUnicodeCharacter(c);
} else {
builder->AddCharacter(c);
}
break;
}
}
break;
case '{': {
int dummy;
bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
if (parsed) return ReportError(RegExpError::kNothingToRepeat);
V8_FALLTHROUGH;
}
case '}':
case ']':
if (unicode()) {
return ReportError(RegExpError::kLoneQuantifierBrackets);
}
V8_FALLTHROUGH;
default:
builder->AddUnicodeCharacter(current());
Advance();
break;
} // end switch(current())
int min;
int max;
switch (current()) {
// QuantifierPrefix ::
// *
// +
// ?
// {
case '*':
min = 0;
max = RegExpTree::kInfinity;
Advance();
break;
case '+':
min = 1;
max = RegExpTree::kInfinity;
Advance();
break;
case '?':
min = 0;
max = 1;
Advance();
break;
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
return ReportError(RegExpError::kRangeOutOfOrder);
}
break;
} else if (unicode()) {
// With /u, incomplete quantifiers are not allowed.
return ReportError(RegExpError::kIncompleteQuantifier);
}
continue;
default:
continue;
}
RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
if (current() == '?') {
quantifier_type = RegExpQuantifier::NON_GREEDY;
Advance();
} else if (FLAG_regexp_possessive_quantifier && current() == '+') {
// FLAG_regexp_possessive_quantifier is a debug-only flag.
quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance();
}
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
return ReportError(RegExpError::kInvalidQuantifier);
}
}
}
template <class CharT>
RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
RegExpParserState* state) {
RegExpLookaround::Type lookaround_type = state->lookaround_type();
bool is_named_capture = false;
const ZoneVector<base::uc16>* capture_name = nullptr;
SubexpressionType subexpr_type = CAPTURE;
Advance();
if (current() == '?') {
switch (Next()) {
case ':':
Advance(2);
subexpr_type = GROUPING;
break;
case '=':
Advance(2);
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = POSITIVE_LOOKAROUND;
break;
case '!':
Advance(2);
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = NEGATIVE_LOOKAROUND;
break;
case '<':
Advance();
if (Next() == '=') {
Advance(2);
lookaround_type = RegExpLookaround::LOOKBEHIND;
subexpr_type = POSITIVE_LOOKAROUND;
break;
} else if (Next() == '!') {
Advance(2);
lookaround_type = RegExpLookaround::LOOKBEHIND;
subexpr_type = NEGATIVE_LOOKAROUND;
break;
}
is_named_capture = true;
has_named_captures_ = true;
Advance();
break;
default:
ReportError(RegExpError::kInvalidGroup);
return nullptr;
}
}
if (subexpr_type == CAPTURE) {
if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) {
ReportError(RegExpError::kTooManyCaptures);
return nullptr;
}
captures_started_++;
if (is_named_capture) {
capture_name = ParseCaptureGroupName(CHECK_FAILED);
}
}
// Store current state and begin new disjunction parsing.
return zone()->template New<RegExpParserState>(
state, subexpr_type, lookaround_type, captures_started_, capture_name,
state->builder()->flags(), zone());
}
#ifdef DEBUG
namespace {
bool IsSpecialClassEscape(base::uc32 c) {
switch (c) {
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
return true;
default:
return false;
}
}
} // namespace
#endif
// In order to know whether an escape is a backreference or not we have to scan
// the entire regexp and find the number of capturing parentheses. However we
// don't want to scan the regexp twice unless it is necessary. This mini-parser
// is called when needed. It can see the difference between capturing and
// noncapturing parentheses and can skip character classes and backslash-escaped
// characters.
//
// Important: The scanner has to be in a consistent state when calling
// ScanForCaptures, e.g. not in the middle of an escape sequence '\['.
template <class CharT>
void RegExpParserImpl<CharT>::ScanForCaptures(
InClassEscapeState in_class_escape_state) {
DCHECK(!is_scanned_for_captures_);
const int saved_position = position();
// Start with captures started previous to current position
int capture_count = captures_started();
// When we start inside a character class, skip everything inside the class.
if (in_class_escape_state == InClassEscapeState::kInClass) {
int c;
while ((c = current()) != kEndMarker) {
Advance();
if (c == '\\') {
Advance();
} else {
if (c == ']') break;
}
}
}
// Add count of captures after this position.
int n;
while ((n = current()) != kEndMarker) {
Advance();
switch (n) {
case '\\':
Advance();
break;
case '[': {
int c;
while ((c = current()) != kEndMarker) {
Advance();
if (c == '\\') {
Advance();
} else {
if (c == ']') break;
}
}
break;
}
case '(':
if (current() == '?') {
// At this point we could be in
// * a non-capturing group '(:',
// * a lookbehind assertion '(?<=' '(?<!'
// * or a named capture '(?<'.
//
// Of these, only named captures are capturing groups.
Advance();
if (current() != '<') break;
Advance();
if (current() == '=' || current() == '!') break;
// Found a possible named capture. It could turn out to be a syntax
// error (e.g. an unterminated or invalid name), but that distinction
// does not matter for our purposes.
has_named_captures_ = true;
}
capture_count++;
break;
}
}
capture_count_ = capture_count;
is_scanned_for_captures_ = true;
Reset(saved_position);
}
template <class CharT>
bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
DCHECK_EQ('\\', current());
DCHECK('1' <= Next() && Next() <= '9');
// Try to parse a decimal literal that is no greater than the total number
// of left capturing parentheses in the input.
int start = position();
int value = Next() - '0';
Advance(2);
while (true) {
base::uc32 c = current();
if (IsDecimalDigit(c)) {
value = 10 * value + (c - '0');
if (value > RegExpMacroAssembler::kMaxRegisterCount) {
Reset(start);
return false;
}
Advance();
} else {
break;
}
}
if (value > captures_started()) {
if (!is_scanned_for_captures_)
ScanForCaptures(InClassEscapeState::kNotInClass);
if (value > capture_count_) {
Reset(start);
return false;
}
}
*index_out = value;
return true;
}
namespace {
void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
v->push_back(code_unit);
} else {
v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
}
}
} // namespace
template <class CharT>
const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
// Due to special Advance requirements (see the next comment), rewind by one
// such that names starting with a surrogate pair are parsed correctly for
// patterns where the unicode flag is unset.
//
// Note that we use this odd pattern of rewinding the last advance in order
// to adhere to the common parser behavior of expecting `current` to point at
// the first candidate character for a function (e.g. when entering ParseFoo,
// `current` should point at the first character of Foo).
RewindByOneCodepoint();
ZoneVector<base::uc16>* name =
zone()->template New<ZoneVector<base::uc16>>(zone());
{
// Advance behavior inside this function is tricky since
// RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
// and thus allows surrogate pairs and \u{}-style escapes even in
// non-unicode patterns. Therefore Advance within the capture group name
// has to force-enable unicode, and outside the name revert to default
// behavior.
ForceUnicodeScope force_unicode(this);
bool at_start = true;
while (true) {
Advance();
base::uc32 c = current();
// Convert unicode escapes.
if (c == '\\' && Next() == 'u') {
Advance(2);
if (!ParseUnicodeEscape(&c)) {
ReportError(RegExpError::kInvalidUnicodeEscape);
return nullptr;
}
RewindByOneCodepoint();
}
// The backslash char is misclassified as both ID_Start and ID_Continue.
if (c == '\\') {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
if (at_start) {
if (!IsIdentifierStart(c)) {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
push_code_unit(name, c);
at_start = false;
} else {
if (c == '>') {
break;
} else if (IsIdentifierPart(c)) {
push_code_unit(name, c);
} else {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
}
}
}
// This final advance goes back into the state of pointing at the next
// relevant char, which the rest of the parser expects. See also the previous
// comments in this function.
Advance();
return name;
}
template <class CharT>
bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
const ZoneVector<base::uc16>* name, int index) {
DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name);
RegExpCapture* capture = GetCapture(index);
DCHECK_NULL(capture->name());
capture->set_name(name);
if (named_captures_ == nullptr) {
named_captures_ =
zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
zone());
} else {
// Check for duplicates and bail if we find any.
const auto& named_capture_it = named_captures_->find(capture);
if (named_capture_it != named_captures_->end()) {
ReportError(RegExpError::kDuplicateCaptureGroupName);
return false;
}
}
named_captures_->emplace(capture);
return true;
}
template <class CharT>
bool RegExpParserImpl<CharT>::ParseNamedBackReference(
RegExpBuilder* builder, RegExpParserState* state) {
// The parser is assumed to be on the '<' in \k<name>.
if (current() != '<') {
ReportError(RegExpError::kInvalidNamedReference);
return false;
}
Advance();
const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
if (name == nullptr) {
return false;
}
if (state->IsInsideCaptureGroup(name)) {
builder->AddEmpty();
} else {
RegExpBackReference* atom =
zone()->template New<RegExpBackReference>(builder->flags());
atom->set_name(name);
builder->AddAtom(atom);
if (named_back_references_ == nullptr) {
named_back_references_ =
zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
}
named_back_references_->Add(atom, zone());
}
return true;
}
template <class CharT>
void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
if (named_back_references_ == nullptr) return;
if (named_captures_ == nullptr) {
ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
// Look up and patch the actual capture for each named back reference.
for (int i = 0; i < named_back_references_->length(); i++) {
RegExpBackReference* ref = named_back_references_->at(i);
// Capture used to search the named_captures_ by name, index of the
// capture is never used.
static const int kInvalidIndex = 0;
RegExpCapture* search_capture =
zone()->template New<RegExpCapture>(kInvalidIndex);
DCHECK_NULL(search_capture->name());
search_capture->set_name(ref->name());
int index = -1;
const auto& capture_it = named_captures_->find(search_capture);
if (capture_it != named_captures_->end()) {
index = (*capture_it)->index();
} else {
ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
ref->set_capture(GetCapture(index));
}
}
template <class CharT>
RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
// The index for the capture groups are one-based. Its index in the list is
// zero-based.
const int known_captures =
is_scanned_for_captures_ ? capture_count_ : captures_started_;
DCHECK(index <= known_captures);
if (captures_ == nullptr) {
captures_ =
zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
}
while (captures_->length() < known_captures) {
captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
zone());
}
return captures_->at(index - 1);
}
template <class CharT>
ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
if (named_captures_ == nullptr || named_captures_->empty()) {
return nullptr;
}
return zone()->template New<ZoneVector<RegExpCapture*>>(
named_captures_->begin(), named_captures_->end(), zone());
}
template <class CharT>
bool RegExpParserImpl<CharT>::HasNamedCaptures(
InClassEscapeState in_class_escape_state) {
if (has_named_captures_ || is_scanned_for_captures_) {
return has_named_captures_;
}
ScanForCaptures(in_class_escape_state);
DCHECK(is_scanned_for_captures_);
return has_named_captures_;
}
// QuantifierPrefix ::
// { DecimalDigits }
// { DecimalDigits , }
// { DecimalDigits , DecimalDigits }
//
// Returns true if parsing succeeds, and set the min_out and max_out
// values. Values are truncated to RegExpTree::kInfinity if they overflow.
template <class CharT>
bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
int* max_out) {
DCHECK_EQ(current(), '{');
int start = position();
Advance();
int min = 0;
if (!IsDecimalDigit(current())) {
Reset(start);
return false;
}
while (IsDecimalDigit(current())) {
int next = current() - '0';
if (min > (RegExpTree::kInfinity - next) / 10) {
// Overflow. Skip past remaining decimal digits and return -1.
do {
Advance();
} while (IsDecimalDigit(current()));
min = RegExpTree::kInfinity;
break;
}
min = 10 * min + next;
Advance();
}
int max = 0;
if (current() == '}') {
max = min;
Advance();
} else if (current() == ',') {
Advance();
if (current() == '}') {
max = RegExpTree::kInfinity;
Advance();
} else {
while (IsDecimalDigit(current())) {
int next = current() - '0';
if (max > (RegExpTree::kInfinity - next) / 10) {
do {
Advance();
} while (IsDecimalDigit(current()));
max = RegExpTree::kInfinity;
break;
}
max = 10 * max + next;
Advance();
}
if (current() != '}') {
Reset(start);
return false;
}
Advance();
}
} else {
Reset(start);
return false;
}
*min_out = min;
*max_out = max;
return true;
}
template <class CharT>
base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
// For compatibility with some other browsers (not all), we parse
// up to three octal digits with a value below 256.
// ES#prod-annexB-LegacyOctalEscapeSequence
base::uc32 value = current() - '0';
Advance();
if ('0' <= current() && current() <= '7') {
value = value * 8 + current() - '0';
Advance();
if (value < 32 && '0' <= current() && current() <= '7') {
value = value * 8 + current() - '0';
Advance();
}
}
return value;
}
template <class CharT>
bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
int start = position();
base::uc32 val = 0;
for (int i = 0; i < length; ++i) {
base::uc32 c = current();
int d = base::HexValue(c);
if (d < 0) {
Reset(start);
return false;
}
val = val * 16 + d;
Advance();
}
*value = val;
return true;
}
// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
template <class CharT>
bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
// arbitrary. \ and u have already been read.
if (current() == '{' && unicode()) {
int start = position();
Advance();
if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
if (current() == '}') {
Advance();
return true;
}
}
Reset(start);
return false;
}
// \u but no {, or \u{...} escapes not allowed.
bool result = ParseHexEscape(4, value);
if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
current() == '\\') {
// Attempt to read trail surrogate.
int start = position();
if (Next() == 'u') {
Advance(2);
base::uc32 trail;
if (ParseHexEscape(4, &trail) &&
unibrow::Utf16::IsTrailSurrogate(trail)) {
*value = unibrow::Utf16::CombineSurrogatePair(
static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
return true;
}
}
Reset(start);
}
return result;
}
#ifdef V8_INTL_SUPPORT
namespace {
bool IsExactPropertyAlias(const char* property_name, UProperty property) {
const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
if (short_name != nullptr && strcmp(property_name, short_name) == 0)
return true;
for (int i = 0;; i++) {
const char* long_name = u_getPropertyName(
property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
if (long_name == nullptr) break;
if (strcmp(property_name, long_name) == 0) return true;
}
return false;
}
bool IsExactPropertyValueAlias(const char* property_value_name,
UProperty property, int32_t property_value) {
const char* short_name =
u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
return true;
}
for (int i = 0;; i++) {
const char* long_name = u_getPropertyValueName(
property, property_value,
static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
if (long_name == nullptr) break;
if (strcmp(property_value_name, long_name) == 0) return true;
}
return false;
}
bool LookupPropertyValueName(UProperty property,
const char* property_value_name, bool negate,
ZoneList<CharacterRange>* result, Zone* zone) {
UProperty property_for_lookup = property;
if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
// For the property Script_Extensions, we have to do the property value
// name lookup as if the property is Script.
property_for_lookup = UCHAR_SCRIPT;
}
int32_t property_value =
u_getPropertyValueEnum(property_for_lookup, property_value_name);
if (property_value == UCHAR_INVALID_CODE) return false;
// We require the property name to match exactly to one of the property value
// aliases. However, u_getPropertyValueEnum uses loose matching.
if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
property_value)) {
return false;
}
UErrorCode ec = U_ZERO_ERROR;
icu::UnicodeSet set;
set.applyIntPropertyValue(property, property_value, ec);
bool success = ec == U_ZERO_ERROR && !set.isEmpty();
if (success) {
set.removeAllStrings();
if (negate) set.complement();
for (int i = 0; i < set.getRangeCount(); i++) {
result->Add(
CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
zone);
}
}
return success;
}
template <size_t N>
inline bool NameEquals(const char* name, const char (&literal)[N]) {
return strncmp(name, literal, N + 1) == 0;
}
bool LookupSpecialPropertyValueName(const char* name,
ZoneList<CharacterRange>* result,
bool negate, Zone* zone) {
if (NameEquals(name, "Any")) {
if (negate) {
// Leave the list of character ranges empty, since the negation of 'Any'
// is the empty set.
} else {
result->Add(CharacterRange::Everything(), zone);
}
} else if (NameEquals(name, "ASCII")) {
result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
: CharacterRange::Range(0x0, 0x7F),
zone);
} else if (NameEquals(name, "Assigned")) {
return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
!negate, result, zone);
} else {
return false;
}
return true;
}
// Explicitly allowlist supported binary properties. The spec forbids supporting
// properties outside of this set to ensure interoperability.
bool IsSupportedBinaryProperty(UProperty property) {
switch (property) {
case UCHAR_ALPHABETIC:
// 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
// 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
case UCHAR_ASCII_HEX_DIGIT:
// 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
case UCHAR_BIDI_CONTROL:
case UCHAR_BIDI_MIRRORED:
case UCHAR_CASE_IGNORABLE:
case UCHAR_CASED:
case UCHAR_CHANGES_WHEN_CASEFOLDED:
case UCHAR_CHANGES_WHEN_CASEMAPPED:
case UCHAR_CHANGES_WHEN_LOWERCASED:
case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
case UCHAR_CHANGES_WHEN_TITLECASED:
case UCHAR_CHANGES_WHEN_UPPERCASED:
case UCHAR_DASH:
case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
case UCHAR_DEPRECATED:
case UCHAR_DIACRITIC:
case UCHAR_EMOJI:
case UCHAR_EMOJI_COMPONENT:
case UCHAR_EMOJI_MODIFIER_BASE:
case UCHAR_EMOJI_MODIFIER:
case UCHAR_EMOJI_PRESENTATION:
case UCHAR_EXTENDED_PICTOGRAPHIC:
case UCHAR_EXTENDER:
case UCHAR_GRAPHEME_BASE:
case UCHAR_GRAPHEME_EXTEND:
case UCHAR_HEX_DIGIT:
case UCHAR_ID_CONTINUE:
case UCHAR_ID_START:
case UCHAR_IDEOGRAPHIC:
case UCHAR_IDS_BINARY_OPERATOR:
case UCHAR_IDS_TRINARY_OPERATOR:
case UCHAR_JOIN_CONTROL:
case UCHAR_LOGICAL_ORDER_EXCEPTION:
case UCHAR_LOWERCASE:
case UCHAR_MATH:
case UCHAR_NONCHARACTER_CODE_POINT:
case UCHAR_PATTERN_SYNTAX:
case UCHAR_PATTERN_WHITE_SPACE:
case UCHAR_QUOTATION_MARK:
case UCHAR_RADICAL:
case UCHAR_REGIONAL_INDICATOR:
case UCHAR_S_TERM:
case UCHAR_SOFT_DOTTED:
case UCHAR_TERMINAL_PUNCTUATION:
case UCHAR_UNIFIED_IDEOGRAPH:
case UCHAR_UPPERCASE:
case UCHAR_VARIATION_SELECTOR:
case UCHAR_WHITE_SPACE:
case UCHAR_XID_CONTINUE:
case UCHAR_XID_START:
return true;
default:
break;
}
return false;
}
bool IsUnicodePropertyValueCharacter(char c) {
// https://tc39.github.io/proposal-regexp-unicode-property-escapes/
//
// Note that using this to validate each parsed char is quite conservative.
// A possible alternative solution would be to only ensure the parsed
// property name/value candidate string does not contain '\0' characters and
// let ICU lookups trigger the final failure.
if ('a' <= c && c <= 'z') return true;
if ('A' <= c && c <= 'Z') return true;
if ('0' <= c && c <= '9') return true;
return (c == '_');
}
} // namespace
template <class CharT>
bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
ZoneVector<char>* name_2) {
DCHECK(name_1->empty());
DCHECK(name_2->empty());
// Parse the property class as follows:
// - In \p{name}, 'name' is interpreted
// - either as a general category property value name.
// - or as a binary property name.
// - In \p{name=value}, 'name' is interpreted as an enumerated property name,
// and 'value' is interpreted as one of the available property value names.
// - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
// - Loose matching is not applied.
if (current() == '{') {
// Parse \p{[PropertyName=]PropertyNameValue}
for (Advance(); current() != '}' && current() != '='; Advance()) {
if (!IsUnicodePropertyValueCharacter(current())) return false;
if (!has_next()) return false;
name_1->push_back(static_cast<char>(current()));
}
if (current() == '=') {
for (Advance(); current() != '}'; Advance()) {
if (!IsUnicodePropertyValueCharacter(current())) return false;
if (!has_next()) return false;
name_2->push_back(static_cast<char>(current()));
}
name_2->push_back(0); // null-terminate string.
}
} else {
return false;
}
Advance();
name_1->push_back(0); // null-terminate string.
DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
return true;
}
template <class CharT>
bool RegExpParserImpl<CharT>::AddPropertyClassRange(
ZoneList<CharacterRange>* add_to, bool negate,
const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
if (name_2.empty()) {
// First attempt to interpret as general category property value name.
const char* name = name_1.data();
if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
add_to, zone())) {
return true;
}
// Interpret "Any", "ASCII", and "Assigned".
if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
return true;
}
// Then attempt to interpret as binary property name with value name 'Y'.
UProperty property = u_getPropertyEnum(name);
if (!IsSupportedBinaryProperty(property)) return false;
if (!IsExactPropertyAlias(name, property)) return false;
return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
zone());
} else {
// Both property name and value name are specified. Attempt to interpret
// the property name as enumerated property.
const char* property_name = name_1.data();
const char* value_name = name_2.data();
UProperty property = u_getPropertyEnum(property_name);
if (!IsExactPropertyAlias(property_name, property)) return false;
if (property == UCHAR_GENERAL_CATEGORY) {
// We want to allow aggregate value names such as "Letter".
property = UCHAR_GENERAL_CATEGORY_MASK;
} else if (property != UCHAR_SCRIPT &&
property != UCHAR_SCRIPT_EXTENSIONS) {
return false;
}
return LookupPropertyValueName(property, value_name, negate, add_to,
zone());
}
}
#else // V8_INTL_SUPPORT
template <class CharT>
bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
ZoneVector<char>* name_2) {
return false;
}
template <class CharT>
bool RegExpParserImpl<CharT>::AddPropertyClassRange(
ZoneList<CharacterRange>* add_to, bool negate,
const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
return false;
}
#endif // V8_INTL_SUPPORT
template <class CharT>
bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
base::uc32* value) {
base::uc32 x = 0;
int d = base::HexValue(current());
if (d < 0) {
return false;
}
while (d >= 0) {
x = x * 16 + d;
if (x > static_cast<base::uc32>(max_value)) {
return false;
}
Advance();
d = base::HexValue(current());
}
*value = x;
return true;
}
// https://tc39.es/ecma262/#prod-CharacterEscape
template <class CharT>
base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
InClassEscapeState in_class_escape_state,
bool* is_escaped_unicode_character) {
DCHECK_EQ('\\', current());
DCHECK(has_next() && !IsSpecialClassEscape(Next()));
Advance();
const base::uc32 c = current();
switch (c) {
// CharacterEscape ::
// ControlEscape :: one of
// f n r t v
case 'f':
Advance();
return '\f';
case 'n':
Advance();
return '\n';
case 'r':
Advance();
return '\r';
case 't':
Advance();
return '\t';
case 'v':
Advance();
return '\v';
// CharacterEscape ::
// c ControlLetter
case 'c': {
base::uc32 controlLetter = Next();
base::uc32 letter = controlLetter & ~('A' ^ 'a');
if (letter >= 'A' && letter <= 'Z') {
Advance(2);
// Control letters mapped to ASCII control characters in the range
// 0x00-0x1F.
return controlLetter & 0x1F;
}
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(RegExpError::kInvalidUnicodeEscape);
return 0;
}
if (in_class_escape_state == InClassEscapeState::kInClass) {
// Inside a character class, we also accept digits and underscore as
// control characters, unless with /u. See Annex B:
// ES#prod-annexB-ClassControlLetter
if ((controlLetter >= '0' && controlLetter <= '9') ||
controlLetter == '_') {
Advance(2);
return controlLetter & 0x1F;
}
}
// We match JSC in reading the backslash as a literal
// character instead of as starting an escape.
return '\\';
}
// CharacterEscape ::
// 0 [lookahead ∉ DecimalDigit]
// [~UnicodeMode] LegacyOctalEscapeSequence
case '0':
// \0 is interpreted as NUL if not followed by another digit.
if (Next() < '0' || Next() > '9') {
Advance();
return 0;
}
V8_FALLTHROUGH;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
// For compatibility, we interpret a decimal escape that isn't
// a back reference (and therefore either \0 or not valid according
// to the specification) as a 1..3 digit octal character code.
// ES#prod-annexB-LegacyOctalEscapeSequence
if (unicode()) {
// With /u, decimal escape is not interpreted as octal character code.
ReportError(RegExpError::kInvalidClassEscape);
return 0;
}
return ParseOctalLiteral();
// CharacterEscape ::
// HexEscapeSequence
case 'x': {
Advance();
base::uc32 value;
if (ParseHexEscape(2, &value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(RegExpError::kInvalidEscape);
return 0;
}
// If \x is not followed by a two-digit hexadecimal, treat it
// as an identity escape.
return 'x';
}
// CharacterEscape ::
// RegExpUnicodeEscapeSequence [?UnicodeMode]
case 'u': {
Advance();
base::uc32 value;
if (ParseUnicodeEscape(&value)) {
*is_escaped_unicode_character = true;
return value;
}
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(RegExpError::kInvalidUnicodeEscape);
return 0;
}
// If \u is not followed by a two-digit hexadecimal, treat it
// as an identity escape.
return 'u';
}
default:
break;
}
// CharacterEscape ::
// IdentityEscape[?UnicodeMode, ?N]
//
// * With /u, no identity escapes except for syntax characters are
// allowed.
// * Without /u:
// * '\c' is not an IdentityEscape.
// * '\k' is not an IdentityEscape when named captures exist.
// * Otherwise, all identity escapes are allowed.
if (unicode()) {
if (!IsSyntaxCharacterOrSlash(c)) {
ReportError(RegExpError::kInvalidEscape);
return 0;
}
Advance();
return c;
}
DCHECK(!unicode());
if (c == 'c') {
ReportError(RegExpError::kInvalidEscape);
return 0;
}
Advance();
// Note: It's important to Advance before the HasNamedCaptures call s.t. we
// don't start scanning in the middle of an escape.
if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
ReportError(RegExpError::kInvalidEscape);
return 0;
}
return c;
}
// https://tc39.es/ecma262/#prod-ClassEscape
template <class CharT>
void RegExpParserImpl<CharT>::ParseClassEscape(
ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents, base::uc32* char_out,
bool* is_class_escape) {
*is_class_escape = false;
if (current() != '\\') {
// Not a ClassEscape.
*char_out = current();
Advance();
return;
}
const base::uc32 next = Next();
switch (next) {
case 'b':
*char_out = '\b';
Advance(2);
return;
case '-':
if (unicode()) {
*char_out = next;
Advance(2);
return;
}
break;
case kEndMarker:
ReportError(RegExpError::kEscapeAtEndOfPattern);
return;
default:
break;
}
static constexpr InClassEscapeState kInClassEscape =
InClassEscapeState::kInClass;
*is_class_escape = TryParseCharacterClassEscape(
next, kInClassEscape, ranges, zone, add_unicode_case_equivalents);
if (*is_class_escape) return;
bool dummy = false; // Unused.
*char_out = ParseCharacterEscape(kInClassEscape, &dummy);
}
// https://tc39.es/ecma262/#prod-CharacterClassEscape
template <class CharT>
bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
base::uc32 next, InClassEscapeState in_class_escape_state,
ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents) {
DCHECK_EQ(current(), '\\');
DCHECK_EQ(Next(), next);
switch (next) {
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
ranges, add_unicode_case_equivalents,
zone);
Advance(2);
return true;
case 'p':
case 'P': {
if (!unicode()) return false;
bool negate = next == 'P';
Advance(2);
ZoneVector<char> name_1(zone);
ZoneVector<char> name_2(zone);
if (!ParsePropertyClassName(&name_1, &name_2) ||
!AddPropertyClassRange(ranges, negate, name_1, name_2)) {
ReportError(in_class_escape_state == InClassEscapeState::kInClass
? RegExpError::kInvalidClassPropertyName
: RegExpError::kInvalidPropertyName);
}
return true;
}
default:
return false;
}
}
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
const RegExpBuilder* builder) {
DCHECK_EQ(current(), '[');
Advance();
bool is_negated = false;
if (current() == '^') {
is_negated = true;
Advance();
}
ZoneList<CharacterRange>* ranges =
zone()->template New<ZoneList<CharacterRange>>(2, zone());
bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
while (has_more() && current() != ']') {
base::uc32 char_1, char_2;
bool is_class_1, is_class_2;
ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
&is_class_1 CHECK_FAILED);
if (current() == '-') {
Advance();
if (current() == kEndMarker) {
// If we reach the end we break out of the loop and let the
// following code report an error.
break;
} else if (current() == ']') {
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
break;
}
ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
&is_class_2 CHECK_FAILED);
if (is_class_1 || is_class_2) {
// Either end is an escaped character class. Treat the '-' verbatim.
if (unicode()) {
// ES2015 21.2.2.15.1 step 1.
return ReportError(RegExpError::kInvalidCharacterClass);
}
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
continue;
}
// ES2015 21.2.2.15.1 step 6.
if (char_1 > char_2) {
return ReportError(RegExpError::kOutOfOrderCharacterClass);
}
ranges->Add(CharacterRange::Range(char_1, char_2), zone());
} else {
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
}
}
if (!has_more()) {
return ReportError(RegExpError::kUnterminatedCharacterClass);
}
Advance();
RegExpCharacterClass::CharacterClassFlags character_class_flags;
if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
return zone()->template New<RegExpCharacterClass>(zone(), ranges,
character_class_flags);
}
#undef CHECK_FAILED
template <class CharT>
bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
DCHECK_NOT_NULL(result);
RegExpTree* tree = ParsePattern();
if (failed()) {
DCHECK_NULL(tree);
DCHECK_NE(error_, RegExpError::kNone);
result->error = error_;
result->error_pos = error_pos_;
return false;
}
DCHECK_NOT_NULL(tree);
DCHECK_EQ(error_, RegExpError::kNone);
if (FLAG_trace_regexp_parser) {
StdoutStream os;
tree->Print(os, zone());
os << "\n";
}
result->tree = tree;
const int capture_count = captures_started();
result->simple = tree->IsAtom() && simple() && capture_count == 0;
result->contains_anchor = contains_anchor();
result->capture_count = capture_count;
result->named_captures = GetNamedCaptures();
return true;
}
void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
FlushPendingSurrogate();
// Hold onto the lead surrogate, waiting for a trail surrogate to follow.
pending_surrogate_ = lead_surrogate;
}
void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
if (pending_surrogate_ != kNoPendingSurrogate) {
base::uc16 lead_surrogate = pending_surrogate_;
pending_surrogate_ = kNoPendingSurrogate;
DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
base::uc32 combined =
unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
if (NeedsDesugaringForIgnoreCase(combined)) {
AddCharacterClassForDesugaring(combined);
} else {
ZoneList<base::uc16> surrogate_pair(2, zone());
surrogate_pair.Add(lead_surrogate, zone());
surrogate_pair.Add(trail_surrogate, zone());
RegExpAtom* atom =
zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
AddAtom(atom);
}
} else {
pending_surrogate_ = trail_surrogate;
FlushPendingSurrogate();
}
}
void RegExpBuilder::FlushPendingSurrogate() {
if (pending_surrogate_ != kNoPendingSurrogate) {
DCHECK(unicode());
base::uc32 c = pending_surrogate_;
pending_surrogate_ = kNoPendingSurrogate;
AddCharacterClassForDesugaring(c);
}
}
void RegExpBuilder::FlushCharacters() {
FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ != nullptr) {
RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
characters_ = nullptr;
text_.emplace_back(atom);
LAST(ADD_ATOM);
}
}
void RegExpBuilder::FlushText() {
FlushCharacters();
size_t num_text = text_.size();
if (num_text == 0) {
return;
} else if (num_text == 1) {
terms_.emplace_back(text_.back());
} else {
RegExpText* text = zone()->New<RegExpText>(zone());
for (size_t i = 0; i < num_text; i++) {
text_[i]->AppendToText(text, zone());
}
terms_.emplace_back(text);
}
text_.clear();
}
void RegExpBuilder::AddCharacter(base::uc16 c) {
FlushPendingSurrogate();
pending_empty_ = false;
if (NeedsDesugaringForIgnoreCase(c)) {
AddCharacterClassForDesugaring(c);
} else {
if (characters_ == nullptr) {
characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
}
characters_->Add(c, zone());
LAST(ADD_CHAR);
}
}
void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
DCHECK(unicode());
AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
} else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
AddLeadSurrogate(c);
} else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
AddTrailSurrogate(c);
} else {
AddCharacter(static_cast<base::uc16>(c));
}
}
void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
// A lead or trail surrogate parsed via escape sequence will not
// pair up with any preceding lead or following trail surrogate.
FlushPendingSurrogate();
AddUnicodeCharacter(character);
FlushPendingSurrogate();
}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (NeedsDesugaringForUnicode(cc)) {
// With /u, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText.
AddTerm(cc);
} else {
AddAtom(cc);
}
}
void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) {
AddTerm(zone()->New<RegExpCharacterClass>(
zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
}
void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) {
AddEmpty();
return;
}
if (term->IsTextElement()) {
FlushCharacters();
text_.emplace_back(term);
} else {
FlushText();
terms_.emplace_back(term);
}
LAST(ADD_ATOM);
}
void RegExpBuilder::AddTerm(RegExpTree* term) {
FlushText();
terms_.emplace_back(term);
LAST(ADD_ATOM);
}
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
FlushText();
terms_.emplace_back(assert);
LAST(ADD_ASSERT);
}
void RegExpBuilder::NewAlternative() { FlushTerms(); }
void RegExpBuilder::FlushTerms() {
FlushText();
size_t num_terms = terms_.size();
RegExpTree* alternative;
if (num_terms == 0) {
alternative = zone()->New<RegExpEmpty>();
} else if (num_terms == 1) {
alternative = terms_.back();
} else {
alternative =
zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
base::VectorOf(terms_.begin(), terms_.size()), zone()));
}
alternatives_.emplace_back(alternative);
terms_.clear();
LAST(ADD_NONE);
}
bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
if (!unicode()) return false;
// TODO(yangguo): we could be smarter than this. Case-insensitivity does not
// necessarily mean that we need to desugar. It's probably nicer to have a
// separate pass to figure out unicode desugarings.
if (ignore_case()) return true;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
CharacterRange::Canonicalize(ranges);
if (cc->is_negated()) {
ZoneList<CharacterRange>* negated_ranges =
zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone());
CharacterRange::Negate(ranges, negated_ranges, zone());
ranges = negated_ranges;
}
for (int i = ranges->length() - 1; i >= 0; i--) {
base::uc32 from = ranges->at(i).from();
base::uc32 to = ranges->at(i).to();
// Check for non-BMP characters.
if (to >= kNonBmpStart) return true;
// Check for lone surrogates.
if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
}
return false;
}
bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
#ifdef V8_INTL_SUPPORT
if (unicode() && ignore_case()) {
icu::UnicodeSet set(c, c);
set.closeOver(USET_CASE_INSENSITIVE);
set.removeAllStrings();
return set.size() > 1;
}
// In the case where ICU is not included, we act as if the unicode flag is
// not set, and do not desugar.
#endif // V8_INTL_SUPPORT
return false;
}
RegExpTree* RegExpBuilder::ToRegExp() {
FlushTerms();
size_t num_alternatives = alternatives_.size();
if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
if (num_alternatives == 1) return alternatives_.back();
return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
}
bool RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
return true;
}
RegExpTree* atom;
if (characters_ != nullptr) {
DCHECK(last_added_ == ADD_CHAR);
// Last atom was character.
base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
int num_chars = char_vector.length();
if (num_chars > 1) {
base::Vector<const base::uc16> prefix =
char_vector.SubVector(0, num_chars - 1);
text_.emplace_back(zone()->New<RegExpAtom>(prefix));
char_vector = char_vector.SubVector(num_chars - 1, num_chars);
}
characters_ = nullptr;
atom = zone()->New<RegExpAtom>(char_vector);
FlushText();
} else if (text_.size() > 0) {
DCHECK(last_added_ == ADD_ATOM);
atom = text_.back();
text_.pop_back();
FlushText();
} else if (terms_.size() > 0) {
DCHECK(last_added_ == ADD_ATOM);
atom = terms_.back();
terms_.pop_back();
if (atom->IsLookaround()) {
// With /u, lookarounds are not quantifiable.
if (unicode()) return false;
// Lookbehinds are not quantifiable.
if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
return false;
}
}
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
return true;
}
terms_.emplace_back(atom);
return true;
}
} else {
// Only call immediately after adding an atom or character!
UNREACHABLE();
}
terms_.emplace_back(
zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));
LAST(ADD_TERM);
return true;
}
template class RegExpParserImpl<uint8_t>;
template class RegExpParserImpl<base::uc16>;
} // namespace
// static
bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
Handle<String> input,
RegExpFlags flags,
RegExpCompileData* result) {
DisallowGarbageCollection no_gc;
uintptr_t stack_limit = isolate->stack_guard()->real_climit();
String::FlatContent content = input->GetFlatContent(no_gc);
if (content.IsOneByte()) {
base::Vector<const uint8_t> v = content.ToOneByteVector();
return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags,
stack_limit, zone, no_gc}
.Parse(result);
} else {
base::Vector<const base::uc16> v = content.ToUC16Vector();
return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags,
stack_limit, zone, no_gc}
.Parse(result);
}
}
// static
template <class CharT>
bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
const CharT* input, int input_length,
RegExpFlags flags,
RegExpCompileData* result,
const DisallowGarbageCollection& no_gc) {
return RegExpParserImpl<CharT>{input, input_length, flags,
stack_limit, zone, no_gc}
.Parse(result);
}
template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
const DisallowGarbageCollection&);
template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
const DisallowGarbageCollection&);
#undef LAST
} // namespace internal
} // namespace v8