[regexp] Support unicode capture names in non-unicode patterns

This change ensures that capture names containing surrogate pairs are parsed correctly even in non-unicode RegExp patterns. It does so by introducing a new scanning mode that unconditionally combines surrogate pairs. BUG=v8:5437,v8:6192 Review-Url: https://codereview.chromium.org/2791163003 Cr-Commit-Position: refs/heads/master@{#44466}
2017-04-07 00:34:10 -07:00 · 2017-04-07 00:34:10 -07:00 · a8651c5671
commit a8651c5671
parent 57bef9a1e2
3 changed files with 28 additions and 20 deletions
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -46,13 +46,13 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
  Advance();
 }

-template <bool update_position>
-inline uc32 RegExpParser::ReadNext() {
+inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
  int position = next_pos_;
  uc32 c0 = in()->Get(position);
  position++;
-  // Read the whole surrogate pair in case of unicode flag, if possible.
-  if (unicode() && position < in()->length() &&
+  const bool try_combine_surrogate_pairs =
+      (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
+  if (try_combine_surrogate_pairs && position < in()->length() &&
      unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
    uc16 c1 = in()->Get(position);
    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
@ -67,14 +67,13 @@ inline uc32 RegExpParser::ReadNext() {

 uc32 RegExpParser::Next() {
  if (has_next()) {
-    return ReadNext<false>();
+    return ReadNext(false, ScanMode::DEFAULT);
  } else {
    return kEndMarker;
  }
 }

-
-void RegExpParser::Advance() {
+void RegExpParser::Advance(ScanMode mode) {
  if (has_next()) {
    StackLimitCheck check(isolate());
    if (check.HasOverflowed()) {
@ -84,7 +83,7 @@ void RegExpParser::Advance() {
    } else if (zone()->excess_allocation()) {
      ReportError(CStrVector("Regular expression too large"));
    } else {
-      current_ = ReadNext<true>();
+      current_ = ReadNext(true, mode);
    }
  } else {
    current_ = kEndMarker;
@ -102,10 +101,9 @@ void RegExpParser::Reset(int pos) {
  Advance();
 }

-
-void RegExpParser::Advance(int dist) {
+void RegExpParser::Advance(int dist, ScanMode mode) {
  next_pos_ += dist - 1;
-  Advance();
+  Advance(mode);
 }


@ -329,7 +327,6 @@ RegExpTree* RegExpParser::ParseDisjunction() {
              if (FLAG_harmony_regexp_named_captures) {
                has_named_captures_ = true;
                is_named_capture = true;
-                Advance();
                break;
              }
            // Fall through.
@ -769,20 +766,26 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {

 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
  DCHECK(FLAG_harmony_regexp_named_captures);
+  DCHECK_EQ(current(), '<');

  ZoneVector<uc16>* name =
      new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

+  // Capture names can always contain surrogate pairs, and we need to scan
+  // accordingly.
+  const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
+  Advance(scan_mode);
+
  bool at_start = true;
  while (true) {
    uc32 c = current();
-    Advance();
+    Advance(scan_mode);

    // Convert unicode escapes.
    if (c == '\\' && current() == 'u') {
      // TODO(jgruber): Reconsider this once the spec has settled.
      // https://github.com/tc39/proposal-regexp-named-groups/issues/23
-      Advance();
+      Advance(scan_mode);
      if (!ParseUnicodeEscape(&c)) {
        ReportError(CStrVector("Invalid Unicode escape sequence"));
        return nullptr;
@ -853,7 +856,6 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
    return false;
  }

-  Advance();
  const ZoneVector<uc16>* name = ParseCaptureGroupName();
  if (name == nullptr) {
    return false;
--- a/src/regexp/regexp-parser.h
+++ b/src/regexp/regexp-parser.h
@ -184,11 +184,18 @@ class RegExpParser BASE_EMBEDDED {
  // can be reparsed.
  bool ParseBackReferenceIndex(int* index_out);

+  // The default behavior is to combine surrogate pairs in unicode mode and
+  // to leave them uncombined otherwise (a quantifier after a surrogate pair
+  // would then apply only to the trailing surrogate). Forcing combination is
+  // required when parsing capture names since they can always legally contain
+  // surrogate pairs.
+  enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };
+
  bool ParseClassProperty(ZoneList<CharacterRange>* result);
  CharacterRange ParseClassAtom(uc16* char_class);
  RegExpTree* ReportError(Vector<const char> message);
-  void Advance();
-  void Advance(int dist);
+  void Advance(ScanMode mode = ScanMode::DEFAULT);
+  void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
  void Reset(int pos);

  // Reports whether the pattern might be used as a literal search string.
@ -304,8 +311,7 @@ class RegExpParser BASE_EMBEDDED {
  bool has_more() { return has_more_; }
  bool has_next() { return next_pos_ < in()->length(); }
  uc32 Next();
-  template <bool update_position>
-  uc32 ReadNext();
+  uc32 ReadNext(bool update_position, ScanMode mode);
  FlatStringReader* in() { return in_; }
  void ScanForCaptures();

--- a/test/mjsunit/harmony/regexp-named-captures.js
+++ b/test/mjsunit/harmony/regexp-named-captures.js
@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError);  // ID_Continue but not ID_Start.
 assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
 assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
 assertEquals("a", /(?<_>a)/.exec("bab").groups._);
-assertThrows("/(?<$𐒤>a)/", SyntaxError);
+assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤);
 assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
 assertThrows('/(?<❤>a)/', SyntaxError);
 assertThrows('/(?<𐒤>a)/', SyntaxError);  // ID_Continue but not ID_Start.