f31c6419eb
Change-Id: I02c117ef66480eb73eb9cc1d4f80bbc64e9d3624 Reviewed-on: https://chromium-review.googlesource.com/1146655 Commit-Queue: Tobias Tebbi <tebbi@chromium.org> Reviewed-by: Jakob Gruber <jgruber@chromium.org> Reviewed-by: Jaroslav Sevcik <jarin@chromium.org> Cr-Commit-Position: refs/heads/master@{#54649}
474 lines
15 KiB
C++
474 lines
15 KiB
C++
// Copyright 2018 the V8 project authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#ifndef V8_TORQUE_EARLEY_PARSER_H_
|
|
#define V8_TORQUE_EARLEY_PARSER_H_
|
|
|
|
#include <map>
|
|
#include <vector>
|
|
|
|
#include "src/base/optional.h"
|
|
#include "src/torque/contextual.h"
|
|
#include "src/torque/source-positions.h"
|
|
#include "src/torque/utils.h"
|
|
|
|
namespace v8 {
|
|
namespace internal {
|
|
namespace torque {
|
|
|
|
class Symbol;
|
|
class Item;
|
|
|
|
class ParseResultHolderBase {
|
|
public:
|
|
enum class TypeId;
|
|
virtual ~ParseResultHolderBase() = default;
|
|
template <class T>
|
|
T& Cast();
|
|
template <class T>
|
|
const T& Cast() const;
|
|
|
|
protected:
|
|
explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
|
|
// MSVC wrongly complains about type_id_ being an unused private field.
|
|
USE(type_id_);
|
|
}
|
|
|
|
private:
|
|
const TypeId type_id_;
|
|
};
|
|
|
|
using ParseResultTypeId = ParseResultHolderBase::TypeId;
|
|
|
|
template <class T>
|
|
class ParseResultHolder : public ParseResultHolderBase {
|
|
public:
|
|
explicit ParseResultHolder(T value)
|
|
: ParseResultHolderBase(id), value_(std::move(value)) {}
|
|
|
|
private:
|
|
V8_EXPORT_PRIVATE static const TypeId id;
|
|
friend class ParseResultHolderBase;
|
|
T value_;
|
|
};
|
|
|
|
template <class T>
|
|
T& ParseResultHolderBase::Cast() {
|
|
CHECK_EQ(ParseResultHolder<T>::id, type_id_);
|
|
return static_cast<ParseResultHolder<T>*>(this)->value_;
|
|
}
|
|
|
|
template <class T>
|
|
const T& ParseResultHolderBase::Cast() const {
|
|
CHECK_EQ(ParseResultHolder<T>::id, type_id_);
|
|
return static_cast<const ParseResultHolder<T>*>(this)->value_;
|
|
}
|
|
|
|
class ParseResult {
|
|
public:
|
|
template <class T>
|
|
explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
|
|
|
|
template <class T>
|
|
const T& Cast() const {
|
|
return value_->Cast<T>();
|
|
}
|
|
template <class T>
|
|
T& Cast() {
|
|
return value_->Cast<T>();
|
|
}
|
|
|
|
private:
|
|
std::unique_ptr<ParseResultHolderBase> value_;
|
|
};
|
|
|
|
using InputPosition = const char*;
|
|
|
|
struct MatchedInput {
|
|
MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
|
|
: begin(begin), end(end), pos(pos) {}
|
|
InputPosition begin;
|
|
InputPosition end;
|
|
SourcePosition pos;
|
|
std::string ToString() const { return {begin, end}; }
|
|
};
|
|
|
|
class ParseResultIterator {
|
|
public:
|
|
explicit ParseResultIterator(std::vector<ParseResult> results,
|
|
MatchedInput matched_input)
|
|
: results_(std::move(results)), matched_input_(matched_input) {}
|
|
~ParseResultIterator() {
|
|
// Check that all parse results have been used.
|
|
CHECK_EQ(results_.size(), i_);
|
|
}
|
|
|
|
ParseResult Next() {
|
|
CHECK_LT(i_, results_.size());
|
|
return std::move(results_[i_++]);
|
|
}
|
|
template <class T>
|
|
T NextAs() {
|
|
return std::move(Next().Cast<T>());
|
|
}
|
|
bool HasNext() const { return i_ < results_.size(); }
|
|
|
|
const MatchedInput& matched_input() const { return matched_input_; }
|
|
|
|
private:
|
|
std::vector<ParseResult> results_;
|
|
size_t i_ = 0;
|
|
MatchedInput matched_input_;
|
|
|
|
DISALLOW_COPY_AND_MOVE_AND_ASSIGN(ParseResultIterator);
|
|
};
|
|
|
|
struct LexerResult {
|
|
std::vector<Symbol*> token_symbols;
|
|
std::vector<MatchedInput> token_contents;
|
|
};
|
|
|
|
using Action =
|
|
base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
|
|
|
|
inline base::Optional<ParseResult> DefaultAction(
|
|
ParseResultIterator* child_results) {
|
|
if (!child_results->HasNext()) return base::nullopt;
|
|
return child_results->Next();
|
|
}
|
|
|
|
// A rule of the context-free grammar. Each rule can have an action attached to
|
|
// it, which is executed after the parsing is finished.
|
|
class Rule final {
|
|
public:
|
|
explicit Rule(std::vector<Symbol*> right_hand_side,
|
|
Action action = DefaultAction)
|
|
: right_hand_side_(std::move(right_hand_side)), action_(action) {}
|
|
|
|
Symbol* left() const {
|
|
DCHECK_NOT_NULL(left_hand_side_);
|
|
return left_hand_side_;
|
|
}
|
|
const std::vector<Symbol*>& right() const { return right_hand_side_; }
|
|
|
|
void SetLeftHandSide(Symbol* left_hand_side) {
|
|
DCHECK_NULL(left_hand_side_);
|
|
left_hand_side_ = left_hand_side;
|
|
}
|
|
|
|
V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
|
|
const Item* completed_item, const LexerResult& tokens) const;
|
|
|
|
private:
|
|
Symbol* left_hand_side_ = nullptr;
|
|
std::vector<Symbol*> right_hand_side_;
|
|
Action action_;
|
|
};
|
|
|
|
// A Symbol represents a terminal or a non-terminal of the grammar.
|
|
// It stores the list of rules, which have this symbol as the
|
|
// left-hand side.
|
|
// Terminals have an empty list of rules, they are created by the Lexer
|
|
// instead of from rules.
|
|
// Symbols need to reside at stable memory addresses, because the addresses are
|
|
// used in the parser.
|
|
class Symbol {
|
|
public:
|
|
Symbol() : Symbol({}) {}
|
|
Symbol(std::initializer_list<Rule> rules) { *this = rules; }
|
|
|
|
V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
|
|
|
|
bool IsTerminal() const { return rules_.empty(); }
|
|
Rule* rule(size_t index) const { return rules_[index].get(); }
|
|
size_t rule_number() const { return rules_.size(); }
|
|
|
|
void AddRule(const Rule& rule) {
|
|
rules_.push_back(base::make_unique<Rule>(rule));
|
|
rules_.back()->SetLeftHandSide(this);
|
|
}
|
|
|
|
V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
|
|
const Item* item, const LexerResult& tokens);
|
|
|
|
private:
|
|
std::vector<std::unique_ptr<Rule>> rules_;
|
|
|
|
// Disallow copying and moving to ensure Symbol has a stable address.
|
|
DISALLOW_COPY_AND_MOVE_AND_ASSIGN(Symbol);
|
|
};
|
|
|
|
// Items are the core datastructure of Earley's algorithm.
|
|
// They consist of a (partially) matched rule, a marked position inside of the
|
|
// right-hand side of the rule (traditionally written as a dot) and an input
|
|
// range from {start} to {pos} that matches the symbols of the right-hand side
|
|
// that are left of the mark. In addition, they store a child and a left-sibling
|
|
// pointer to reconstruct the AST in the end.
|
|
class Item {
|
|
public:
|
|
Item(const Rule* rule, size_t mark, size_t start, size_t pos)
|
|
: rule_(rule), mark_(mark), start_(start), pos_(pos) {
|
|
DCHECK_LE(mark_, right().size());
|
|
}
|
|
|
|
// A complete item has the mark at the right end, which means the input range
|
|
// matches the complete rule.
|
|
bool IsComplete() const {
|
|
DCHECK_LE(mark_, right().size());
|
|
return mark_ == right().size();
|
|
}
|
|
|
|
// The symbol right after the mark is expected at {pos} for this item to
|
|
// advance.
|
|
Symbol* NextSymbol() const {
|
|
DCHECK(!IsComplete());
|
|
DCHECK_LT(mark_, right().size());
|
|
return right()[mark_];
|
|
}
|
|
|
|
// We successfully parsed NextSymbol() between {pos} and {new_pos}.
|
|
// If NextSymbol() was a non-terminal, then {child} is a pointer to a
|
|
// completed item for this parse.
|
|
// We create a new item, which moves the mark one forward.
|
|
Item Advance(size_t new_pos, const Item* child = nullptr) const {
|
|
if (child) {
|
|
DCHECK(child->IsComplete());
|
|
DCHECK_EQ(pos(), child->start());
|
|
DCHECK_EQ(new_pos, child->pos());
|
|
DCHECK_EQ(NextSymbol(), child->left());
|
|
}
|
|
Item result(rule_, mark_ + 1, start_, new_pos);
|
|
result.prev_ = this;
|
|
result.child_ = child;
|
|
return result;
|
|
}
|
|
|
|
// Collect the items representing the AST children of this completed item.
|
|
std::vector<const Item*> Children() const;
|
|
// The matched input separated according to the next branching AST level.
|
|
std::string SplitByChildren(const LexerResult& tokens) const;
|
|
// Check if {other} results in the same AST as this Item.
|
|
void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
|
|
|
|
MatchedInput GetMatchedInput(const LexerResult& tokens) const {
|
|
return {tokens.token_contents[start_].begin,
|
|
start_ == pos_ ? tokens.token_contents[start_].begin
|
|
: tokens.token_contents[pos_ - 1].end,
|
|
tokens.token_contents[start_].pos};
|
|
}
|
|
|
|
// We exclude {prev_} and {child_} from equality and hash computations,
|
|
// because they are just globally unique data associated with an item.
|
|
bool operator==(const Item& other) const {
|
|
return rule_ == other.rule_ && mark_ == other.mark_ &&
|
|
start_ == other.start_ && pos_ == other.pos_;
|
|
}
|
|
|
|
friend size_t hash_value(const Item& i) {
|
|
return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
|
|
}
|
|
|
|
const Rule* rule() const { return rule_; }
|
|
Symbol* left() const { return rule_->left(); }
|
|
const std::vector<Symbol*>& right() const { return rule_->right(); }
|
|
size_t pos() const { return pos_; }
|
|
size_t start() const { return start_; }
|
|
|
|
private:
|
|
const Rule* rule_;
|
|
size_t mark_;
|
|
size_t start_;
|
|
size_t pos_;
|
|
|
|
const Item* prev_ = nullptr;
|
|
const Item* child_ = nullptr;
|
|
};
|
|
|
|
inline base::Optional<ParseResult> Symbol::RunAction(
|
|
const Item* item, const LexerResult& tokens) {
|
|
DCHECK(item->IsComplete());
|
|
DCHECK_EQ(item->left(), this);
|
|
return item->rule()->RunAction(item, tokens);
|
|
}
|
|
|
|
V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
|
|
Symbol* start, const LexerResult& tokens,
|
|
std::unordered_set<Item, base::hash<Item>>* processed);
|
|
|
|
inline base::Optional<ParseResult> ParseTokens(Symbol* start,
|
|
const LexerResult& tokens) {
|
|
std::unordered_set<Item, base::hash<Item>> table;
|
|
const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
|
|
return start->RunAction(final_item, tokens);
|
|
}
|
|
|
|
// The lexical syntax is dynamically defined while building the grammar by
|
|
// adding patterns and keywords to the Lexer.
|
|
// The term keyword here can stand for any fixed character sequence, including
|
|
// operators and parentheses.
|
|
// Each pattern or keyword automatically gets a terminal symbol associated with
|
|
// it. These symbols form the result of the lexing.
|
|
// Patterns and keywords are matched using the longest match principle. If the
|
|
// longest matching pattern coincides with a keyword, the keyword symbol is
|
|
// chosen instead of the pattern.
|
|
// In addition, there is a single whitespace pattern which is consumed but does
|
|
// not become part of the token list.
|
|
class Lexer {
|
|
public:
|
|
// Functions to define patterns. They try to match starting from {pos}. If
|
|
// successful, they return true and advance {pos}. Otherwise, {pos} stays
|
|
// unchanged.
|
|
using PatternFunction = bool (*)(InputPosition* pos);
|
|
|
|
void SetWhitespace(PatternFunction whitespace) {
|
|
match_whitespace_ = whitespace;
|
|
}
|
|
|
|
Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
|
|
Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
|
|
V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
|
|
|
|
private:
|
|
PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
|
|
std::map<PatternFunction, Symbol> patterns_;
|
|
std::map<std::string, Symbol> keywords_;
|
|
Symbol* MatchToken(InputPosition* pos, InputPosition end);
|
|
};
|
|
|
|
// A grammar can have a result, which is the results of the start symbol.
|
|
// Grammar is intended to be subclassed, with Symbol members forming the
|
|
// mutually recursive rules of the grammar.
|
|
class Grammar {
|
|
public:
|
|
using PatternFunction = Lexer::PatternFunction;
|
|
|
|
explicit Grammar(Symbol* start) : start_(start) {}
|
|
|
|
base::Optional<ParseResult> Parse(const std::string& input) {
|
|
LexerResult tokens = lexer().RunLexer(input);
|
|
return ParseTokens(start_, tokens);
|
|
}
|
|
|
|
protected:
|
|
Symbol* Token(const std::string& s) { return lexer_.Token(s); }
|
|
Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
|
|
void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
|
|
|
|
// NewSymbol() allocates a fresh symbol and stores it in the current grammar.
|
|
// This is necessary to define helpers that create new symbols.
|
|
Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
|
|
Symbol* result = new Symbol(rules);
|
|
generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
|
|
return result;
|
|
}
|
|
|
|
// Helper functions to define lexer patterns. If they match, they return true
|
|
// and advance {pos}. Otherwise, {pos} is unchanged.
|
|
V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
|
|
InputPosition* pos);
|
|
V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
|
|
InputPosition* pos);
|
|
V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
|
|
V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
|
|
|
|
// The action MatchInput() produces the input matched by the rule as
|
|
// result.
|
|
static base::Optional<ParseResult> YieldMatchedInput(
|
|
ParseResultIterator* child_results) {
|
|
return ParseResult{child_results->matched_input().ToString()};
|
|
}
|
|
|
|
// Create a new symbol to parse the given sequence of symbols.
|
|
// At most one of the symbols can return a result.
|
|
Symbol* Sequence(std::vector<Symbol*> symbols) {
|
|
return NewSymbol({Rule(std::move(symbols))});
|
|
}
|
|
|
|
template <class T, T value>
|
|
static base::Optional<ParseResult> YieldIntegralConstant(
|
|
ParseResultIterator* child_results) {
|
|
return ParseResult{value};
|
|
}
|
|
|
|
template <class T>
|
|
static base::Optional<ParseResult> YieldDefaultValue(
|
|
ParseResultIterator* child_results) {
|
|
return ParseResult{T{}};
|
|
}
|
|
|
|
template <class From, class To>
|
|
static base::Optional<ParseResult> CastParseResult(
|
|
ParseResultIterator* child_results) {
|
|
To result = std::move(child_results->NextAs<From>());
|
|
return ParseResult{std::move(result)};
|
|
}
|
|
|
|
// Try to parse {s} and return the result of type {Result} casted to {T}.
|
|
// Otherwise, the result is a default-constructed {T}.
|
|
template <class T, class Result = T>
|
|
Symbol* TryOrDefault(Symbol* s) {
|
|
return NewSymbol({Rule({s}, CastParseResult<Result, T>),
|
|
Rule({}, YieldDefaultValue<T>)});
|
|
}
|
|
|
|
template <class T>
|
|
static base::Optional<ParseResult> MakeSingletonVector(
|
|
ParseResultIterator* child_results) {
|
|
T x = child_results->NextAs<T>();
|
|
std::vector<T> result;
|
|
result.push_back(std::move(x));
|
|
return ParseResult{std::move(result)};
|
|
}
|
|
|
|
template <class T>
|
|
static base::Optional<ParseResult> MakeExtendedVector(
|
|
ParseResultIterator* child_results) {
|
|
std::vector<T> l = child_results->NextAs<std::vector<T>>();
|
|
T x = child_results->NextAs<T>();
|
|
l.push_back(std::move(x));
|
|
return ParseResult{std::move(l)};
|
|
}
|
|
|
|
// For example, NonemptyList(Token("A"), Token(",")) parses any of
|
|
// A or A,A or A,A,A and so on.
|
|
template <class T>
|
|
Symbol* NonemptyList(Symbol* element,
|
|
base::Optional<Symbol*> separator = {}) {
|
|
Symbol* list = NewSymbol();
|
|
*list = {Rule({element}, MakeSingletonVector<T>),
|
|
separator
|
|
? Rule({list, *separator, element}, MakeExtendedVector<T>)
|
|
: Rule({list, element}, MakeExtendedVector<T>)};
|
|
return list;
|
|
}
|
|
|
|
template <class T>
|
|
Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
|
|
return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
|
|
}
|
|
|
|
template <class T>
|
|
Symbol* Optional(Symbol* x) {
|
|
return TryOrDefault<base::Optional<T>, T>(x);
|
|
}
|
|
|
|
Symbol* CheckIf(Symbol* x) {
|
|
return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
|
|
Rule({}, YieldIntegralConstant<bool, false>)});
|
|
}
|
|
|
|
Lexer& lexer() { return lexer_; }
|
|
|
|
private:
|
|
Lexer lexer_;
|
|
std::vector<std::unique_ptr<Symbol>> generated_symbols_;
|
|
Symbol* start_;
|
|
};
|
|
|
|
} // namespace torque
|
|
} // namespace internal
|
|
} // namespace v8
|
|
|
|
#endif // V8_TORQUE_EARLEY_PARSER_H_
|