faa2f9f9e1
- Merge the look-ahead results slots used when multiple rules share a common accepting state. - Sequentially number the look-ahead result slot. Will eventually allow replacing the runtime map with an array. - Inhibit chaining out of look-ahead rules. This could never actually happen; when a hard break rule matches, the engine is stopped immediately, but the state table was being constructed as if it could happen. Reduces table size for line break rules. - Remove incorrect handling of fAccepting and fLookAhead fields of a state table row when removing duplicate states. Look-ahead slot number was being mis-interpreted as a state number.
168 lines
7.3 KiB
C++
168 lines
7.3 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
//
|
|
// rbbiscan.h
|
|
//
|
|
// Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
|
// All Rights Reserved.
|
|
//
|
|
// This file contains declarations for class RBBIRuleScanner
|
|
//
|
|
|
|
|
|
#ifndef RBBISCAN_H
|
|
#define RBBISCAN_H
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uobject.h"
|
|
#include "unicode/rbbi.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/parseerr.h"
|
|
#include "uhash.h"
|
|
#include "uvector.h"
|
|
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
|
|
// looks up references to $variables within a set.
|
|
#include "rbbinode.h"
|
|
#include "rbbirpt.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
class RBBIRuleBuilder;
|
|
class RBBISymbolTable;
|
|
|
|
|
|
//--------------------------------------------------------------------------------
|
|
//
|
|
// class RBBIRuleScanner does the lowest level, character-at-a-time
|
|
// scanning of break iterator rules.
|
|
//
|
|
// The output of the scanner is parse trees for
|
|
// the rule expressions and a list of all Unicode Sets
|
|
// encountered.
|
|
//
|
|
//--------------------------------------------------------------------------------
|
|
|
|
class RBBIRuleScanner : public UMemory {
|
|
public:
|
|
|
|
enum {
|
|
kStackSize = 100 // The size of the state stack for
|
|
}; // rules parsing. Corresponds roughly
|
|
// to the depth of parentheses nesting
|
|
// that is allowed in the rules.
|
|
|
|
struct RBBIRuleChar {
|
|
UChar32 fChar;
|
|
UBool fEscaped;
|
|
RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}
|
|
};
|
|
|
|
RBBIRuleScanner(RBBIRuleBuilder *rb);
|
|
|
|
|
|
virtual ~RBBIRuleScanner();
|
|
|
|
void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
|
|
// Return false if at end.
|
|
|
|
UBool push(const RBBIRuleChar &c); // Push (unget) one character.
|
|
// Only a single character may be pushed.
|
|
|
|
void parse(); // Parse the rules, generating two parse
|
|
// trees, one each for the forward and
|
|
// reverse rules,
|
|
// and a list of UnicodeSets encountered.
|
|
|
|
int32_t numRules(); // Return the number of rules that have been seen.
|
|
|
|
/**
|
|
* Return a rules string without unnecessary
|
|
* characters.
|
|
*/
|
|
static UnicodeString stripRules(const UnicodeString &rules);
|
|
private:
|
|
|
|
UBool doParseActions(int32_t a);
|
|
void error(UErrorCode e); // error reporting convenience function.
|
|
void fixOpStack(RBBINode::OpPrecedence p);
|
|
// a character.
|
|
void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
|
|
|
|
UChar32 nextCharLL();
|
|
#ifdef RBBI_DEBUG
|
|
void printNodeStack(const char *title);
|
|
#endif
|
|
RBBINode *pushNewNode(RBBINode::NodeType t);
|
|
void scanSet();
|
|
|
|
|
|
RBBIRuleBuilder *fRB; // The rule builder that we are part of.
|
|
|
|
int32_t fScanIndex; // Index of current character being processed
|
|
// in the rule input string.
|
|
int32_t fNextIndex; // Index of the next character, which
|
|
// is the first character not yet scanned.
|
|
UBool fQuoteMode; // Scan is in a 'quoted region'
|
|
int32_t fLineNum; // Line number in input file.
|
|
int32_t fCharNum; // Char position within the line.
|
|
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
|
// as a single line, not two.
|
|
|
|
RBBIRuleChar fC; // Current char for parse state machine
|
|
// processing.
|
|
UnicodeString fVarName; // $variableName, valid when we've just
|
|
// scanned one.
|
|
|
|
RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
|
|
// parsing. index by p[state][char-class]
|
|
|
|
uint16_t fStack[kStackSize]; // State stack, holds state pushes
|
|
int32_t fStackPtr; // and pops as specified in the state
|
|
// transition rules.
|
|
|
|
RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
|
|
// during the parse of a rule
|
|
int32_t fNodeStackPtr;
|
|
|
|
|
|
UBool fReverseRule; // True if the rule currently being scanned
|
|
// is a reverse direction rule (if it
|
|
// starts with a '!')
|
|
|
|
UBool fLookAheadRule; // True if the rule includes a '/'
|
|
// somewhere within it.
|
|
|
|
UBool fNoChainInRule; // True if the current rule starts with a '^'.
|
|
|
|
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
|
|
// $variable symbols.
|
|
|
|
UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
|
|
// the sets created while parsing rules.
|
|
// The key is the string used for creating
|
|
// the set.
|
|
|
|
UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
|
|
// the scanning of RBBI rules. The
|
|
// indicies for these are assigned by the
|
|
// perl script that builds the state tables.
|
|
// See rbbirpt.h.
|
|
|
|
int32_t fRuleNum; // Counts each rule as it is scanned.
|
|
|
|
int32_t fOptionStart; // Input index of start of a !!option
|
|
// keyword, while being scanned.
|
|
|
|
UnicodeSet *gRuleSet_rule_char;
|
|
UnicodeSet *gRuleSet_white_space;
|
|
UnicodeSet *gRuleSet_name_char;
|
|
UnicodeSet *gRuleSet_name_start_char;
|
|
|
|
RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
|
|
RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
|
|
};
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif
|