Reland "[regexp] Move AST-to-Node code to a dedicated file"

This is a reland of 811bfbbc56 Original change's description: > [regexp] Move AST-to-Node code to a dedicated file > > Prior to this CL, jsregexp contains a bunch of things that are slightly > related but would be cleaner in separate files, including: AST-to-Node > transformations, the compiler implementation, and a debugging printer. > > This CL extracts AST-to-Node transformations. > > Bug: v8:9359 > Change-Id: I030cfca5c40cfd72e3a7abe2188e4654cfe2277c > Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1655303 > Auto-Submit: Jakob Gruber <jgruber@chromium.org> > Reviewed-by: Yang Guo <yangguo@chromium.org> > Commit-Queue: Jakob Gruber <jgruber@chromium.org> > Cr-Commit-Position: refs/heads/master@{#62148} Tbr: yangguo@chromium.org Bug: v8:9359 Change-Id: I68a16086dc56c9a059547033ca8bc1e9de1080db Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1658568 Reviewed-by: Jakob Gruber <jgruber@chromium.org> Commit-Queue: Jakob Gruber <jgruber@chromium.org> Cr-Commit-Position: refs/heads/master@{#62154}
2019-06-13 18:00:50 +02:00 · 2019-06-13 18:00:50 +02:00 · d61a558a23
commit d61a558a23
parent a1462d9f07
6 changed files with 1719 additions and 1727 deletions
--- a/BUILD.gn
+++ b/BUILD.gn
@ -2683,6 +2683,8 @@ v8_source_set("v8_base_without_compiler") {
    "src/regexp/regexp-ast.cc",
    "src/regexp/regexp-ast.h",
    "src/regexp/regexp-bytecodes.h",
+    "src/regexp/regexp-compiler-tonode.cc",
+    "src/regexp/regexp-compiler.h",
    "src/regexp/regexp-interpreter.cc",
    "src/regexp/regexp-interpreter.h",
    "src/regexp/regexp-macro-assembler-arch.h",
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
--- a/src/regexp/jsregexp.h
+++ b/src/regexp/jsregexp.h
@ -290,35 +290,6 @@ class DispatchTable : public ZoneObject {
  ZoneSplayTree<Config> tree_;
 };

-
-// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
-class UnicodeRangeSplitter {
- public:
-  V8_EXPORT_PRIVATE UnicodeRangeSplitter(Zone* zone,
-                                         ZoneList<CharacterRange>* base);
-  void Call(uc32 from, DispatchTable::Entry entry);
-
-  ZoneList<CharacterRange>* bmp() { return bmp_; }
-  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
-  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
-  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
-
- private:
-  static const int kBase = 0;
-  // Separate ranges into
-  static const int kBmpCodePoints = 1;
-  static const int kLeadSurrogates = 2;
-  static const int kTrailSurrogates = 3;
-  static const int kNonBmpCodePoints = 4;
-
-  Zone* zone_;
-  DispatchTable table_;
-  ZoneList<CharacterRange>* bmp_;
-  ZoneList<CharacterRange>* lead_surrogates_;
-  ZoneList<CharacterRange>* trail_surrogates_;
-  ZoneList<CharacterRange>* non_bmp_;
-};
-
 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
  VISIT(End)                                                         \
  VISIT(Action)                                                      \
--- a/src/regexp/regexp-compiler-tonode.cc
+++ b/src/regexp/regexp-compiler-tonode.cc
--- a/src/regexp/regexp-compiler.h
+++ b/src/regexp/regexp-compiler.h
@ -0,0 +1,217 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_COMPILER_H_
+#define V8_REGEXP_REGEXP_COMPILER_H_
+
+#include "src/regexp/jsregexp.h"  // TODO(jgruber): Remove if possible.
+#include "src/regexp/regexp-macro-assembler-arch.h"
+
+namespace v8 {
+namespace internal {
+
+class Isolate;
+
+namespace regexp_compiler_constants {
+
+// The '2' variant is has inclusive from and exclusive to.
+// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
+// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
+constexpr uc32 kRangeEndMarker = 0x110000;
+constexpr int kSpaceRanges[] = {
+    '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
+    0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
+    0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
+constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
+
+constexpr int kWordRanges[] = {'0',     '9' + 1, 'A',     'Z' + 1,        '_',
+                               '_' + 1, 'a',     'z' + 1, kRangeEndMarker};
+constexpr int kWordRangeCount = arraysize(kWordRanges);
+constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
+constexpr int kDigitRangeCount = arraysize(kDigitRanges);
+constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
+                                    kLeadSurrogateStart + 1, kRangeEndMarker};
+constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
+constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D,         0x000E,
+                                         0x2028, 0x202A, kRangeEndMarker};
+constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
+
+}  // namespace regexp_compiler_constants
+
+class FrequencyCollator {
+ public:
+  FrequencyCollator() : total_samples_(0) {
+    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
+      frequencies_[i] = CharacterFrequency(i);
+    }
+  }
+
+  void CountCharacter(int character) {
+    int index = (character & RegExpMacroAssembler::kTableMask);
+    frequencies_[index].Increment();
+    total_samples_++;
+  }
+
+  // Does not measure in percent, but rather per-128 (the table size from the
+  // regexp macro assembler).
+  int Frequency(int in_character) {
+    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
+    if (total_samples_ < 1) return 1;  // Division by zero.
+    int freq_in_per128 =
+        (frequencies_[in_character].counter() * 128) / total_samples_;
+    return freq_in_per128;
+  }
+
+ private:
+  class CharacterFrequency {
+   public:
+    CharacterFrequency() : counter_(0), character_(-1) {}
+    explicit CharacterFrequency(int character)
+        : counter_(0), character_(character) {}
+
+    void Increment() { counter_++; }
+    int counter() { return counter_; }
+    int character() { return character_; }
+
+   private:
+    int counter_;
+    int character_;
+  };
+
+ private:
+  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
+  int total_samples_;
+};
+
+class RegExpCompiler {
+ public:
+  RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
+                 bool is_one_byte);
+
+  int AllocateRegister() {
+    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
+      reg_exp_too_big_ = true;
+      return next_register_;
+    }
+    return next_register_++;
+  }
+
+  // Lookarounds to match lone surrogates for unicode character class matches
+  // are never nested. We can therefore reuse registers.
+  int UnicodeLookaroundStackRegister() {
+    if (unicode_lookaround_stack_register_ == kNoRegister) {
+      unicode_lookaround_stack_register_ = AllocateRegister();
+    }
+    return unicode_lookaround_stack_register_;
+  }
+
+  int UnicodeLookaroundPositionRegister() {
+    if (unicode_lookaround_position_register_ == kNoRegister) {
+      unicode_lookaround_position_register_ = AllocateRegister();
+    }
+    return unicode_lookaround_position_register_;
+  }
+
+  RegExpEngine::CompilationResult Assemble(Isolate* isolate,
+                                           RegExpMacroAssembler* assembler,
+                                           RegExpNode* start, int capture_count,
+                                           Handle<String> pattern);
+
+  inline void AddWork(RegExpNode* node) {
+    if (!node->on_work_list() && !node->label()->is_bound()) {
+      node->set_on_work_list(true);
+      work_list_->push_back(node);
+    }
+  }
+
+  static const int kImplementationOffset = 0;
+  static const int kNumberOfRegistersOffset = 0;
+  static const int kCodeOffset = 1;
+
+  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
+  EndNode* accept() { return accept_; }
+
+  static const int kMaxRecursion = 100;
+  inline int recursion_depth() { return recursion_depth_; }
+  inline void IncrementRecursionDepth() { recursion_depth_++; }
+  inline void DecrementRecursionDepth() { recursion_depth_--; }
+
+  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
+
+  inline bool one_byte() { return one_byte_; }
+  inline bool optimize() { return optimize_; }
+  inline void set_optimize(bool value) { optimize_ = value; }
+  inline bool limiting_recursion() { return limiting_recursion_; }
+  inline void set_limiting_recursion(bool value) {
+    limiting_recursion_ = value;
+  }
+  bool read_backward() { return read_backward_; }
+  void set_read_backward(bool value) { read_backward_ = value; }
+  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
+
+  int current_expansion_factor() { return current_expansion_factor_; }
+  void set_current_expansion_factor(int value) {
+    current_expansion_factor_ = value;
+  }
+
+  Isolate* isolate() const { return isolate_; }
+  Zone* zone() const { return zone_; }
+
+  static const int kNoRegister = -1;
+
+ private:
+  EndNode* accept_;
+  int next_register_;
+  int unicode_lookaround_stack_register_;
+  int unicode_lookaround_position_register_;
+  std::vector<RegExpNode*>* work_list_;
+  int recursion_depth_;
+  RegExpMacroAssembler* macro_assembler_;
+  bool one_byte_;
+  bool reg_exp_too_big_;
+  bool limiting_recursion_;
+  bool optimize_;
+  bool read_backward_;
+  int current_expansion_factor_;
+  FrequencyCollator frequency_collator_;
+  Isolate* isolate_;
+  Zone* zone_;
+};
+
+// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
+class UnicodeRangeSplitter {
+ public:
+  V8_EXPORT_PRIVATE UnicodeRangeSplitter(Zone* zone,
+                                         ZoneList<CharacterRange>* base);
+  void Call(uc32 from, DispatchTable::Entry entry);
+
+  ZoneList<CharacterRange>* bmp() { return bmp_; }
+  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
+  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
+  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
+
+ private:
+  static const int kBase = 0;
+  // Separate ranges into
+  static const int kBmpCodePoints = 1;
+  static const int kLeadSurrogates = 2;
+  static const int kTrailSurrogates = 3;
+  static const int kNonBmpCodePoints = 4;
+
+  Zone* zone_;
+  DispatchTable table_;
+  ZoneList<CharacterRange>* bmp_;
+  ZoneList<CharacterRange>* lead_surrogates_;
+  ZoneList<CharacterRange>* trail_surrogates_;
+  ZoneList<CharacterRange>* non_bmp_;
+};
+
+// We need to check for the following characters: 0x39C 0x3BC 0x178.
+// TODO(jgruber): Move to CharacterRange.
+bool RangeContainsLatin1Equivalents(CharacterRange range);
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_REGEXP_REGEXP_COMPILER_H_
--- a/test/cctest/test-regexp.cc
+++ b/test/cctest/test-regexp.cc
@ -37,6 +37,7 @@
 #include "src/init/v8.h"
 #include "src/objects/objects-inl.h"
 #include "src/regexp/jsregexp.h"
+#include "src/regexp/regexp-compiler.h"
 #include "src/regexp/regexp-interpreter.h"
 #include "src/regexp/regexp-macro-assembler-arch.h"
 #include "src/regexp/regexp-macro-assembler-irregexp.h"