v8/src/torque/earley-parser.h

// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_TORQUE_EARLEY_PARSER_H_
#define V8_TORQUE_EARLEY_PARSER_H_

#include <map>
#include <vector>

#include "src/base/optional.h"
#include "src/torque/contextual.h"
#include "src/torque/source-positions.h"
#include "src/torque/utils.h"

namespace v8 {
namespace internal {
namespace torque {

class Symbol;
class Item;

class ParseResultHolderBase {
 public:
  enum class TypeId;
  virtual ~ParseResultHolderBase() = default;
  template <class T>
  T& Cast();
  template <class T>
  const T& Cast() const;

 protected:
  explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
    // MSVC wrongly complains about type_id_ being an unused private field.
    USE(type_id_);
  }

 private:
  const TypeId type_id_;
};

using ParseResultTypeId = ParseResultHolderBase::TypeId;

template <class T>
class ParseResultHolder : public ParseResultHolderBase {
 public:
  explicit ParseResultHolder(T value)
      : ParseResultHolderBase(id), value_(std::move(value)) {}

 private:
  V8_EXPORT_PRIVATE static const TypeId id;
  friend class ParseResultHolderBase;
  T value_;
};

template <class T>
T& ParseResultHolderBase::Cast() {
  CHECK_EQ(ParseResultHolder<T>::id, type_id_);
  return static_cast<ParseResultHolder<T>*>(this)->value_;
}

template <class T>
const T& ParseResultHolderBase::Cast() const {
  CHECK_EQ(ParseResultHolder<T>::id, type_id_);
  return static_cast<const ParseResultHolder<T>*>(this)->value_;
}

class ParseResult {
 public:
  template <class T>
  explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}

  template <class T>
  const T& Cast() const {
    return value_->Cast<T>();
  }
  template <class T>
  T& Cast() {
    return value_->Cast<T>();
  }

 private:
  std::unique_ptr<ParseResultHolderBase> value_;
};

using InputPosition = const char*;

struct MatchedInput {
  MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
      : begin(begin), end(end), pos(pos) {}
  InputPosition begin;
  InputPosition end;
  SourcePosition pos;
  std::string ToString() const { return {begin, end}; }
};

class ParseResultIterator {
 public:
  explicit ParseResultIterator(std::vector<ParseResult> results,
                               MatchedInput matched_input)
      : results_(std::move(results)), matched_input_(matched_input) {}
  ~ParseResultIterator() {
    // Check that all parse results have been used.
    CHECK_EQ(results_.size(), i_);
  }

  ParseResult Next() {
    CHECK_LT(i_, results_.size());
    return std::move(results_[i_++]);
  }
  template <class T>
  T NextAs() {
    return std::move(Next().Cast<T>());
  }
  bool HasNext() const { return i_ < results_.size(); }

  const MatchedInput& matched_input() const { return matched_input_; }

 private:
  std::vector<ParseResult> results_;
  size_t i_ = 0;
  MatchedInput matched_input_;

  DISALLOW_COPY_AND_MOVE_AND_ASSIGN(ParseResultIterator);
};

struct LexerResult {
  std::vector<Symbol*> token_symbols;
  std::vector<MatchedInput> token_contents;
};

using Action =
    base::Optional<ParseResult> (*)(ParseResultIterator* child_results);

inline base::Optional<ParseResult> DefaultAction(
    ParseResultIterator* child_results) {
  if (!child_results->HasNext()) return base::nullopt;
  return child_results->Next();
}

// A rule of the context-free grammar. Each rule can have an action attached to
// it, which is executed after the parsing is finished.
class Rule final {
 public:
  explicit Rule(std::vector<Symbol*> right_hand_side,
                Action action = DefaultAction)
      : right_hand_side_(std::move(right_hand_side)), action_(action) {}

  Symbol* left() const {
    DCHECK_NOT_NULL(left_hand_side_);
    return left_hand_side_;
  }
  const std::vector<Symbol*>& right() const { return right_hand_side_; }

  void SetLeftHandSide(Symbol* left_hand_side) {
    DCHECK_NULL(left_hand_side_);
    left_hand_side_ = left_hand_side;
  }

  V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
      const Item* completed_item, const LexerResult& tokens) const;

 private:
  Symbol* left_hand_side_ = nullptr;
  std::vector<Symbol*> right_hand_side_;
  Action action_;
};

// A Symbol represents a terminal or a non-terminal of the grammar.
// It stores the list of rules, which have this symbol as the
// left-hand side.
// Terminals have an empty list of rules, they are created by the Lexer
// instead of from rules.
// Symbols need to reside at stable memory addresses, because the addresses are
// used in the parser.
class Symbol {
 public:
  Symbol() : Symbol({}) {}
  Symbol(std::initializer_list<Rule> rules) { *this = rules; }

  V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);

  bool IsTerminal() const { return rules_.empty(); }
  Rule* rule(size_t index) const { return rules_[index].get(); }
  size_t rule_number() const { return rules_.size(); }

  void AddRule(const Rule& rule) {
    rules_.push_back(base::make_unique<Rule>(rule));
    rules_.back()->SetLeftHandSide(this);
  }

  V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
      const Item* item, const LexerResult& tokens);

 private:
  std::vector<std::unique_ptr<Rule>> rules_;

  // Disallow copying and moving to ensure Symbol has a stable address.
  DISALLOW_COPY_AND_MOVE_AND_ASSIGN(Symbol);
};

// Items are the core datastructure of Earley's algorithm.
// They consist of a (partially) matched rule, a marked position inside of the
// right-hand side of the rule (traditionally written as a dot) and an input
// range from {start} to {pos} that matches the symbols of the right-hand side
// that are left of the mark. In addition, they store a child and a left-sibling
// pointer to reconstruct the AST in the end.
class Item {
 public:
  Item(const Rule* rule, size_t mark, size_t start, size_t pos)
      : rule_(rule), mark_(mark), start_(start), pos_(pos) {
    DCHECK_LE(mark_, right().size());
  }

  // A complete item has the mark at the right end, which means the input range
  // matches the complete rule.
  bool IsComplete() const {
    DCHECK_LE(mark_, right().size());
    return mark_ == right().size();
  }

  // The symbol right after the mark is expected at {pos} for this item to
  // advance.
  Symbol* NextSymbol() const {
    DCHECK(!IsComplete());
    DCHECK_LT(mark_, right().size());
    return right()[mark_];
  }

  // We successfully parsed NextSymbol() between {pos} and {new_pos}.
  // If NextSymbol() was a non-terminal, then {child} is a pointer to a
  // completed item for this parse.
  // We create a new item, which moves the mark one forward.
  Item Advance(size_t new_pos, const Item* child = nullptr) const {
    if (child) {
      DCHECK(child->IsComplete());
      DCHECK_EQ(pos(), child->start());
      DCHECK_EQ(new_pos, child->pos());
      DCHECK_EQ(NextSymbol(), child->left());
    }
    Item result(rule_, mark_ + 1, start_, new_pos);
    result.prev_ = this;
    result.child_ = child;
    return result;
  }

  // Collect the items representing the AST children of this completed item.
  std::vector<const Item*> Children() const;
  // The matched input separated according to the next branching AST level.
  std::string SplitByChildren(const LexerResult& tokens) const;
  // Check if {other} results in the same AST as this Item.
  void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;

  MatchedInput GetMatchedInput(const LexerResult& tokens) const {
    return {tokens.token_contents[start_].begin,
            start_ == pos_ ? tokens.token_contents[start_].begin
                           : tokens.token_contents[pos_ - 1].end,
            tokens.token_contents[start_].pos};
  }

  // We exclude {prev_} and {child_} from equality and hash computations,
  // because they are just globally unique data associated with an item.
  bool operator==(const Item& other) const {
    return rule_ == other.rule_ && mark_ == other.mark_ &&
           start_ == other.start_ && pos_ == other.pos_;
  }

  friend size_t hash_value(const Item& i) {
    return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
  }

  const Rule* rule() const { return rule_; }
  Symbol* left() const { return rule_->left(); }
  const std::vector<Symbol*>& right() const { return rule_->right(); }
  size_t pos() const { return pos_; }
  size_t start() const { return start_; }

 private:
  const Rule* rule_;
  size_t mark_;
  size_t start_;
  size_t pos_;

  const Item* prev_ = nullptr;
  const Item* child_ = nullptr;
};

inline base::Optional<ParseResult> Symbol::RunAction(
    const Item* item, const LexerResult& tokens) {
  DCHECK(item->IsComplete());
  DCHECK_EQ(item->left(), this);
  return item->rule()->RunAction(item, tokens);
}

V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
    Symbol* start, const LexerResult& tokens,
    std::unordered_set<Item, base::hash<Item>>* processed);

inline base::Optional<ParseResult> ParseTokens(Symbol* start,
                                               const LexerResult& tokens) {
  std::unordered_set<Item, base::hash<Item>> table;
  const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
  return start->RunAction(final_item, tokens);
}

// The lexical syntax is dynamically defined while building the grammar by
// adding patterns and keywords to the Lexer.
// The term keyword here can stand for any fixed character sequence, including
// operators and parentheses.
// Each pattern or keyword automatically gets a terminal symbol associated with
// it. These symbols form the result of the lexing.
// Patterns and keywords are matched using the longest match principle. If the
// longest matching pattern coincides with a keyword, the keyword symbol is
// chosen instead of the pattern.
// In addition, there is a single whitespace pattern which is consumed but does
// not become part of the token list.
class Lexer {
 public:
  // Functions to define patterns. They try to match starting from {pos}. If
  // successful, they return true and advance {pos}. Otherwise, {pos} stays
  // unchanged.
  using PatternFunction = bool (*)(InputPosition* pos);

  void SetWhitespace(PatternFunction whitespace) {
    match_whitespace_ = whitespace;
  }

  Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
  Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
  V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);

 private:
  PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
  std::map<PatternFunction, Symbol> patterns_;
  std::map<std::string, Symbol> keywords_;
  Symbol* MatchToken(InputPosition* pos, InputPosition end);
};

// A grammar can have a result, which is the results of the start symbol.
// Grammar is intended to be subclassed, with Symbol members forming the
// mutually recursive rules of the grammar.
class Grammar {
 public:
  using PatternFunction = Lexer::PatternFunction;

  explicit Grammar(Symbol* start) : start_(start) {}

  base::Optional<ParseResult> Parse(const std::string& input) {
    LexerResult tokens = lexer().RunLexer(input);
    return ParseTokens(start_, tokens);
  }

 protected:
  Symbol* Token(const std::string& s) { return lexer_.Token(s); }
  Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
  void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }

  // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
  // This is necessary to define helpers that create new symbols.
  Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
    Symbol* result = new Symbol(rules);
    generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
    return result;
  }

  // Helper functions to define lexer patterns. If they match, they return true
  // and advance {pos}. Otherwise, {pos} is unchanged.
  V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
                                          InputPosition* pos);
  V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
                                          InputPosition* pos);
  V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
  V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);

  // The action MatchInput() produces the input matched by the rule as
  // result.
  static base::Optional<ParseResult> YieldMatchedInput(
      ParseResultIterator* child_results) {
    return ParseResult{child_results->matched_input().ToString()};
  }

  // Create a new symbol to parse the given sequence of symbols.
  // At most one of the symbols can return a result.
  Symbol* Sequence(std::vector<Symbol*> symbols) {
    return NewSymbol({Rule(std::move(symbols))});
  }

  template <class T, T value>
  static base::Optional<ParseResult> YieldIntegralConstant(
      ParseResultIterator* child_results) {
    return ParseResult{value};
  }

  template <class T>
  static base::Optional<ParseResult> YieldDefaultValue(
      ParseResultIterator* child_results) {
    return ParseResult{T{}};
  }

  template <class From, class To>
  static base::Optional<ParseResult> CastParseResult(
      ParseResultIterator* child_results) {
    To result = std::move(child_results->NextAs<From>());
    return ParseResult{std::move(result)};
  }

  // Try to parse {s} and return the result of type {Result} casted to {T}.
  // Otherwise, the result is a default-constructed {T}.
  template <class T, class Result = T>
  Symbol* TryOrDefault(Symbol* s) {
    return NewSymbol({Rule({s}, CastParseResult<Result, T>),
                      Rule({}, YieldDefaultValue<T>)});
  }

  template <class T>
  static base::Optional<ParseResult> MakeSingletonVector(
      ParseResultIterator* child_results) {
    T x = child_results->NextAs<T>();
    std::vector<T> result;
    result.push_back(std::move(x));
    return ParseResult{std::move(result)};
  }

  template <class T>
  static base::Optional<ParseResult> MakeExtendedVector(
      ParseResultIterator* child_results) {
    std::vector<T> l = child_results->NextAs<std::vector<T>>();
    T x = child_results->NextAs<T>();
    l.push_back(std::move(x));
    return ParseResult{std::move(l)};
  }

  // For example, NonemptyList(Token("A"), Token(",")) parses any of
  // A or A,A or A,A,A and so on.
  template <class T>
  Symbol* NonemptyList(Symbol* element,
                       base::Optional<Symbol*> separator = {}) {
    Symbol* list = NewSymbol();
    *list = {Rule({element}, MakeSingletonVector<T>),
             separator
                 ? Rule({list, *separator, element}, MakeExtendedVector<T>)
                 : Rule({list, element}, MakeExtendedVector<T>)};
    return list;
  }

  template <class T>
  Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
    return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
  }

  template <class T>
  Symbol* Optional(Symbol* x) {
    return TryOrDefault<base::Optional<T>, T>(x);
  }

  Symbol* CheckIf(Symbol* x) {
    return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
                      Rule({}, YieldIntegralConstant<bool, false>)});
  }

  Lexer& lexer() { return lexer_; }

 private:
  Lexer lexer_;
  std::vector<std::unique_ptr<Symbol>> generated_symbols_;
  Symbol* start_;
};

}  // namespace torque
}  // namespace internal
}  // namespace v8

#endif  // V8_TORQUE_EARLEY_PARSER_H_