qt5base-lts/util/lexgen/re2nfa.cpp
Jason McDonald 5635823e17 Remove "All rights reserved" line from license headers.
As in the past, to avoid rewriting various autotests that contain
line-number information, an extra blank line has been inserted at the
end of the license text to ensure that this commit does not change the
total number of lines in the license header.

Change-Id: I311e001373776812699d6efc045b5f742890c689
Reviewed-by: Rohan McGovern <rohan.mcgovern@nokia.com>
2012-01-30 03:54:59 +01:00

548 lines
15 KiB
C++

/****************************************************************************
**
** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/
**
** This file is part of the utils of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** GNU Lesser General Public License Usage
** This file may be used under the terms of the GNU Lesser General Public
** License version 2.1 as published by the Free Software Foundation and
** appearing in the file LICENSE.LGPL included in the packaging of this
** file. Please review the following information to ensure the GNU Lesser
** General Public License version 2.1 requirements will be met:
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU General
** Public License version 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of this
** file. Please review the following information to ensure the GNU General
** Public License version 3.0 requirements will be met:
** http://www.gnu.org/copyleft/gpl.html.
**
** Other Usage
** Alternatively, this file may be used in accordance with the terms and
** conditions contained in a signed written agreement between you and Nokia.
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include "re2nfa.h"
#include "tokenizer.cpp"
RE2NFA::RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
: macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
{
}
NFA RE2NFA::parse(const QString &expression, int *errCol)
{
tokenize(expression);
if (symbols.isEmpty())
return NFA();
index = 0;
NFA result = parseExpr();
if (result.isEmpty()) {
if (errCol)
*errCol = errorColumn;
}
return result;
}
NFA RE2NFA::parseExpr()
{
NFA value = parseBranch();
while (test(TOK_OR)) {
NFA rhs = parseBranch();
value = NFA::createAlternatingNFA(value, rhs);
}
return value;
}
NFA RE2NFA::parseBranch()
{
NFA value = parsePiece();
if (!hasNext())
return value;
NFA next;
do {
next = parsePiece();
if (!next.isEmpty())
value = NFA::createConcatenatingNFA(value, next);
} while (!next.isEmpty() && hasNext());
return value;
}
NFA RE2NFA::parsePiece()
{
NFA atom = parseAtom();
if (atom.isEmpty() || !hasNext())
return atom;
return parseMaybeQuantifier(atom);
}
NFA RE2NFA::parseAtom()
{
// ####
switch (next()) {
case TOK_STRING:
return createCharNFA();
case TOK_LPAREN: {
NFA subExpr = parseExpr();
next(TOK_RPAREN);
return subExpr;
}
case TOK_LBRACE: {
QString macroName = lexemUntil(TOK_RBRACE);
QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
if (macro == macros.end()) {
qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
return NFA();
}
return *macro;
}
case TOK_LBRACKET: {
NFA set = parseSet();
next(TOK_RBRACKET);
return set;
}
case TOK_SEQUENCE:
return parseSet2();
case TOK_DOT:
return NFA::createSetNFA(maxInputSet);
default:
prev();
return NFA();
}
}
NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
{
// ####
switch (next()) {
case TOK_STAR:
return NFA::createOptionalNFA(nfa);
case TOK_QUESTION:
return NFA::createZeroOrOneNFA(nfa);
case TOK_PLUS:
return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
case TOK_LBRACE: {
const int rewind = index - 1;
QString lexemBeforeComma;
QString lexemAfterComma;
bool seenComma = false;
forever {
if (test(TOK_COMMA)) {
if (seenComma) {
errorColumn = symbol().column;
return NFA();
}
seenComma = true;
} else if (test(TOK_RBRACE)) {
break;
} else {
next(TOK_STRING);
if (seenComma)
lexemAfterComma += symbol().lexem;
else
lexemBeforeComma += symbol().lexem;
}
}
bool isNumber = false;
int min = lexemBeforeComma.toInt(&isNumber);
if (!isNumber) {
index = rewind;
return nfa;
}
int max = min;
if (seenComma) {
max = lexemAfterComma.toInt(&isNumber);
if (!isNumber) {
errorColumn = symbol().column;
return NFA();
}
}
return NFA::applyQuantity(nfa, min, max);
}
default:
prev();
return nfa;
}
}
NFA RE2NFA::parseSet()
{
QSet<InputType> set;
bool negate = false;
next(TOK_STRING);
do {
Q_ASSERT(symbol().lexem.length() == 1);
// ###
QChar ch = symbol().lexem.at(0);
if (set.isEmpty() && ch == QLatin1Char('^')) {
negate = true;
continue;
}
// look ahead for ranges like a-z
bool rangeFound = false;
if (test(TOK_STRING)) {
if (symbol().lexem.length() == 1
&& symbol().lexem.at(0) == QLatin1Char('-')) {
next(TOK_STRING);
Q_ASSERT(symbol().lexem.length() == 1);
QChar last = symbol().lexem.at(0);
if (ch.unicode() > last.unicode())
qSwap(ch, last);
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(QChar(i).toLower().unicode());
} else {
set.insert(i);
}
}
rangeFound = true;
} else {
prev();
}
}
if (!rangeFound) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(ch.toLower().unicode());
} else {
set.insert(ch.unicode());
}
}
} while (test(TOK_STRING));
if (negate) {
QSet<InputType> negatedSet = maxInputSet;
negatedSet.subtract(set);
set = negatedSet;
}
return NFA::createSetNFA(set);
}
NFA RE2NFA::parseSet2()
{
QSet<InputType> set;
bool negate = false;
QString str = symbol().lexem;
// strip off brackets
str.chop(1);
str.remove(0, 1);
int i = 0;
while (i < str.length()) {
// ###
QChar ch = str.at(i++);
if (set.isEmpty() && ch == QLatin1Char('^')) {
negate = true;
continue;
}
// look ahead for ranges like a-z
bool rangeFound = false;
if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
++i;
QChar last = str.at(i++);
if (ch.unicode() > last.unicode())
qSwap(ch, last);
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(QChar(i).toLower().unicode());
} else {
set.insert(i);
}
}
rangeFound = true;
}
if (!rangeFound) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(ch.toLower().unicode());
} else {
set.insert(ch.unicode());
}
}
}
if (negate) {
QSet<InputType> negatedSet = maxInputSet;
negatedSet.subtract(set);
set = negatedSet;
}
return NFA::createSetNFA(set);
}
NFA RE2NFA::createCharNFA()
{
NFA nfa;
// ####
if (caseSensitivity == Qt::CaseInsensitive) {
nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
} else {
nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
}
return nfa;
}
static inline int skipQuote(const QString &str, int pos)
{
while (pos < str.length()
&& str.at(pos) != QLatin1Char('"')) {
if (str.at(pos) == QLatin1Char('\\')) {
++pos;
if (pos >= str.length())
break;
}
++pos;
}
if (pos < str.length())
++pos;
return pos;
}
#if 0
static const char*tokStr(Token t)
{
switch (t) {
case TOK_INVALID: return "TOK_INVALID";
case TOK_STRING: return "TOK_STRING";
case TOK_LBRACE: return "TOK_LBRACE";
case TOK_RBRACE: return "TOK_RBRACE";
case TOK_LBRACKET: return "TOK_LBRACKET";
case TOK_RBRACKET: return "TOK_RBRACKET";
case TOK_LPAREN: return "TOK_LPAREN";
case TOK_RPAREN: return "TOK_RPAREN";
case TOK_COMMA: return "TOK_COMMA";
case TOK_STAR: return "TOK_STAR";
case TOK_OR: return "TOK_OR";
case TOK_QUESTION: return "TOK_QUESTION";
case TOK_DOT: return "TOK_DOT";
case TOK_PLUS: return "TOK_PLUS";
case TOK_SEQUENCE: return "TOK_SEQUENCE";
case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
}
return "";
}
#endif
void RE2NFA::tokenize(const QString &input)
{
symbols.clear();
#if 1
RegExpTokenizer tokenizer(input);
Symbol sym;
int tok = tokenizer.lex();
while (tok != -1) {
Symbol sym;
sym.token = static_cast<Token>(tok);
sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
if (sym.token == TOK_QUOTED_STRING) {
sym.lexem.chop(1);
sym.lexem.remove(0, 1);
sym.token = TOK_STRING;
}
if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
for (int i = 0; i < sym.lexem.length(); ++i) {
if (sym.lexem.at(i) == '\\') {
if (i >= sym.lexem.length() - 1)
break;
QChar ch = sym.lexem.at(i + 1);
if (ch == QLatin1Char('n')) {
ch = '\n';
} else if (ch == QLatin1Char('r')) {
ch = '\r';
} else if (ch == QLatin1Char('t')) {
ch = '\t';
} else if (ch == QLatin1Char('f')) {
ch = '\f';
}
sym.lexem.replace(i, 2, ch);
}
}
}
/*
if (sym.token == TOK_SEQUENCE) {
Symbol s;
s.token = TOK_LBRACKET;
s.lexem = "[";
symbols.append(s);
for (int i = 1; i < sym.lexem.length() - 1; ++i) {
s.token = TOK_STRING;
s.lexem = sym.lexem.at(i);
symbols.append(s);
}
s.token = TOK_RBRACKET;
s.lexem = "]";
symbols.append(s);
tok = tokenizer.lex();
continue;
}
*/
symbols.append(sym);
tok = tokenizer.lex();
}
#else
int pos = 0;
bool insideSet = false;
while (pos < input.length()) {
QChar ch = input.at(pos);
Symbol sym;
sym.column = pos;
sym.token = TOK_INVALID;
sym.lexem = QString(ch);
switch (ch.toLatin1()) {
case '"': {
if (insideSet) {
sym.token = TOK_STRING;
sym.lexem = QString(ch);
symbols += sym;
++pos;
continue;
}
if (pos + 1 >= input.length())
return;
int quoteEnd = skipQuote(input, pos + 1);
sym.token = TOK_STRING;
sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
symbols += sym;
pos = quoteEnd;
continue;
}
case '{':
sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
break;
case '}':
sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
break;
case '[':
insideSet = true;
sym.token = TOK_LBRACKET;
break;
case ']':
insideSet = false;
sym.token = TOK_RBRACKET;
break;
case '(':
sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
break;
case ')':
sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
break;
case ',':
sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
break;
case '*':
sym.token = (insideSet ? TOK_STRING : TOK_STAR);
break;
case '|':
sym.token = (insideSet ? TOK_STRING : TOK_OR);
break;
case '?':
sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
break;
case '.':
sym.token = (insideSet ? TOK_STRING : TOK_DOT);
break;
case '+':
sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
break;
case '\\':
++pos;
if (pos >= input.length())
return;
ch = input.at(pos);
if (ch == QLatin1Char('n')) {
ch = '\n';
} else if (ch == QLatin1Char('r')) {
ch = '\r';
} else if (ch == QLatin1Char('t')) {
ch = '\t';
} else if (ch == QLatin1Char('f')) {
ch = '\f';
}
// fall through
default:
sym.token = TOK_STRING;
sym.lexem = QString(ch);
symbols += sym;
++pos;
continue;
}
symbols += sym;
++pos;
}
#endif
#if 0
foreach (Symbol s, symbols) {
qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
}
#endif
}
bool RE2NFA::next(Token t)
{
if (hasNext() && next() == t)
return true;
errorColumn = symbol().column;
Q_ASSERT(false);
return false;
}
bool RE2NFA::test(Token t)
{
if (index >= symbols.count())
return false;
if (symbols.at(index).token == t) {
++index;
return true;
}
return false;
}
QString RE2NFA::lexemUntil(Token t)
{
QString lexem;
while (hasNext() && next() != t)
lexem += symbol().lexem;
return lexem;
}