#*****************************************************************************
#
#   Copyright (C) 2002, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  rbbirpt.txt
#  ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules.
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#
#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#
#

#
#  start state, scan position is at the beginning of the rules file, or in between two rules.
#
start:
    escaped                 term                ^break-rule-end     doExprStart
    white_space          n  start
    '$'                     scan-var-name       ^assign-or-rule     doExprStart
    '!'                  n  reverse-rule                            doReverseDir
    ';'                  n  start                                                  # ignore empty rules.
    eof                     exit
    default                 term                ^break-rule-end     doExprStart

#
#  break-rule-end:  Returned from doing a break-rule expression.
#                   Only white space and the terminating ';' are accepted here.
#
break-rule-end:
    ';'                  n  start                                   doEndOfRule
    white_space          n  break-rule-end
    default                 errorDeath                              doRuleError

#
#   Reverse Rule    We've just scanned a '!', indicating a reverse direction rule.
#                   A rule expression must follow.
#
reverse-rule:
    default                 term                ^break-rule-end     doExprStart

#
#  term.  Eat through a single rule character, or a composite thing, which
#         could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
    escaped              n  expr-mod                                doRuleChar
    white_space          n  term
    rule_char            n  expr-mod                                doRuleChar
    '['                     scan-unicode-set    ^expr-mod
    '('                  n  term                ^expr-mod           doLParen
    '$'                     scan-var-name       ^term-var-ref
    '.'                  n  expr-mod                                doDotAny
    default                 errorDeath                              doRuleError

#
#  term-var-ref   We've just finished scanning a reference to a $variable.
#                 Check that the variable was defined.
#                 The variable name scanning is in common with assignment statements,
#                 so the check can't be done there.
#
term-var-ref:
    default                 expr-mod                                doCheckVarDef

#
#   expr-mod      We've just finished scanning a term, now look for the optional
#                 trailing '*', '?', '+'
#
expr-mod:
    '*'                  n  expr-cont                               doUnaryOpStar
    '+'                  n  expr-cont                               doUnaryOpPlus
    '?'                  n  expr-cont                               doUnaryOpQuestion
    default                 expr-cont

#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.
#
expr-cont:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '{'                  n  tag-open                                doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished

#
#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
#                 remainder of the expression matches.
#
#                 Generate a parse tree as if this was a special kind of input symbol
#                 appearing in an otherwise normal concatenation expression.
#
look-ahead:
    '/'                  n  expr-cont-no-slash                      doSlash
    default                 errorDeath

#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no '/'
#                                            look-ahead symbol is permitted.
#
#                        White space loops back to this same state (as it does in
#                        expr-cont-no-tag) so that the restriction on '/' is not lost
#                        when spaces follow the slash.
#
expr-cont-no-slash:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-slash
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished

#
#   tags             scanning a '{', the opening delimiter for a tag that identifies
#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
#
tag-open:
    white_space          n  tag-open
    digit_char              tag-value                               doStartTagValue
    default                 errorDeath                              doTagExpectedError

tag-value:
    white_space          n  tag-close
    '}'                     tag-close
    digit_char           n  tag-value                               doTagDigit
    default                 errorDeath                              doTagExpectedError

tag-close:
    white_space          n  tag-close
    '}'                  n  expr-cont-no-tag                        doTagValue
    default                 errorDeath                              doTagExpectedError

#
#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no "{ddd}"
#                                            tagging is permitted.
#
expr-cont-no-tag:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-tag
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished

#
#   Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
scan-var-name:
    '$'                  n  scan-var-start                          doStartVariableName
    default                 errorDeath

scan-var-start:
    name_start_char      n  scan-var-body
    default                 errorDeath                              doVariableNameExpectedErr

scan-var-body:
    name_char            n  scan-var-body
    default                 pop                                     doEndVariableName

#
#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
#                     Within the RBBI parser, after finding the first character
#                     of a Unicode Set, we just hand the rule input at that
#                     point off to the Unicode Set constructor, then pick
#                     up parsing after the close of the set.
#
#                     The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
    '['                  n  pop                                     doScanUnicodeSet
    'p'                  n  pop                                     doScanUnicodeSet
    'P'                  n  pop                                     doScanUnicodeSet
    default                 errorDeath

#
#  assign-or-rule.   A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
assign-or-rule:
    white_space          n  assign-or-rule
    '='                  n  term                ^assign-end         doStartAssign   # variable was target of assignment
    default                 term-var-ref        ^break-rule-end                     # variable was a term in a rule

#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
assign-end:
    ';'                  n  start                                   doEndAssign
    default                 errorDeath                              doRuleErrorAssignExpr

#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
#
errorDeath:
    default              n  errorDeath                              doExit