ICU-12081 RBBI extensions & Emoji rules. Import rule data to Java from C++, port code changes.
X-SVN-Rev: 38422
This commit is contained in:
parent
48214e5b5d
commit
b552700cc6
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2001-2010, International Business Machines Corporation and
|
||||
* Copyright (c) 2001-2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
@ -17,7 +17,7 @@ import com.ibm.icu.impl.Assert;
|
||||
*/
|
||||
class RBBINode {
|
||||
|
||||
|
||||
|
||||
// enum NodeType {
|
||||
static final int setRef = 0;
|
||||
static final int uset = 1;
|
||||
@ -36,7 +36,7 @@ class RBBINode {
|
||||
static final int opReverse = 14;
|
||||
static final int opLParen = 15;
|
||||
static final int nodeTypeLimit = 16; // For Assertion checking only.
|
||||
|
||||
|
||||
static final String [] nodeTypeNames = {
|
||||
"setRef",
|
||||
"uset",
|
||||
@ -56,20 +56,20 @@ class RBBINode {
|
||||
"opLParen"
|
||||
};
|
||||
|
||||
// enum OpPrecedence {
|
||||
// enum OpPrecedence {
|
||||
static final int precZero = 0;
|
||||
static final int precStart = 1;
|
||||
static final int precLParen = 2;
|
||||
static final int precOpOr = 3;
|
||||
static final int precOpCat = 4;
|
||||
|
||||
|
||||
int fType; // enum NodeType
|
||||
RBBINode fParent;
|
||||
RBBINode fLeftChild;
|
||||
RBBINode fRightChild;
|
||||
UnicodeSet fInputSet; // For uset nodes only.
|
||||
int fPrecedence = precZero; // enum OpPrecedence, For binary ops only.
|
||||
|
||||
|
||||
String fText; // Text corresponding to this node.
|
||||
// May be lazily evaluated when (if) needed
|
||||
// for some node types.
|
||||
@ -89,12 +89,17 @@ class RBBINode {
|
||||
// state transition table.
|
||||
|
||||
boolean fLookAheadEnd; // For endMark nodes, set TRUE if
|
||||
// marking the end of a look-ahead rule.
|
||||
// marking the end of a look-ahead rule.
|
||||
|
||||
boolean fRuleRoot; // True if this node is the root of a rule.
|
||||
boolean fChainIn; // True if chaining into this rule is allowed
|
||||
// (no '^' present).
|
||||
|
||||
|
||||
Set<RBBINode> fFirstPosSet; // See Aho DFA table generation algorithm
|
||||
Set<RBBINode> fLastPosSet; // See Aho.
|
||||
Set<RBBINode> fLastPosSet; // See Aho.
|
||||
Set<RBBINode> fFollowPos; // See Aho.
|
||||
|
||||
|
||||
int fSerialNum; // Debugging aids. Each node gets a unique serial number.
|
||||
static int gLastSerial;
|
||||
|
||||
@ -129,6 +134,8 @@ class RBBINode {
|
||||
fLastPos = other.fLastPos;
|
||||
fNullable = other.fNullable;
|
||||
fVal = other.fVal;
|
||||
fRuleRoot = false;
|
||||
fChainIn = other.fChainIn;
|
||||
fFirstPosSet = new HashSet<RBBINode>(other.fFirstPosSet);
|
||||
fLastPosSet = new HashSet<RBBINode>(other.fLastPosSet);
|
||||
fFollowPos = new HashSet<RBBINode>(other.fFollowPos);
|
||||
@ -163,6 +170,8 @@ class RBBINode {
|
||||
n.fRightChild.fParent = n;
|
||||
}
|
||||
}
|
||||
n.fRuleRoot = this.fRuleRoot;
|
||||
n.fChainIn = this.fChainIn;
|
||||
return n;
|
||||
}
|
||||
|
||||
@ -259,8 +268,8 @@ class RBBINode {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// print. Print out a single node, for debugging.
|
||||
@ -279,7 +288,7 @@ class RBBINode {
|
||||
RBBINode.printInt(n.fRightChild==null? 0 : n.fRightChild.fSerialNum, 12);
|
||||
RBBINode.printInt(n.fFirstPos, 12);
|
||||
RBBINode.printInt(n.fVal, 7);
|
||||
|
||||
|
||||
if (n.fType == varRef) {
|
||||
System.out.print(" " + n.fText);
|
||||
}
|
||||
@ -287,7 +296,7 @@ class RBBINode {
|
||||
System.out.println("");
|
||||
}
|
||||
///CLOVER:ON
|
||||
|
||||
|
||||
|
||||
// Print a String in a fixed field size.
|
||||
// Debugging function.
|
||||
@ -344,7 +353,7 @@ class RBBINode {
|
||||
if (fLeftChild != null) {
|
||||
fLeftChild.printTree(false);
|
||||
}
|
||||
|
||||
|
||||
if (fRightChild != null) {
|
||||
fRightChild.printTree(false);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
* Copyright (c) 2003-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
@ -13,6 +13,8 @@ package com.ibm.icu.text;
|
||||
* rule parser.
|
||||
* It is generated by the Perl script "rbbicst.pl" from
|
||||
* the rule parser state definitions file "rbbirpt.txt".
|
||||
* @internal
|
||||
*
|
||||
*/
|
||||
class RBBIRuleParseTable
|
||||
{
|
||||
@ -29,24 +31,25 @@ class RBBIRuleParseTable
|
||||
static final short doExprStart = 11;
|
||||
static final short doLParen = 12;
|
||||
static final short doNOP = 13;
|
||||
static final short doOptionEnd = 14;
|
||||
static final short doOptionStart = 15;
|
||||
static final short doReverseDir = 16;
|
||||
static final short doRuleChar = 17;
|
||||
static final short doRuleError = 18;
|
||||
static final short doRuleErrorAssignExpr = 19;
|
||||
static final short doScanUnicodeSet = 20;
|
||||
static final short doSlash = 21;
|
||||
static final short doStartAssign = 22;
|
||||
static final short doStartTagValue = 23;
|
||||
static final short doStartVariableName = 24;
|
||||
static final short doTagDigit = 25;
|
||||
static final short doTagExpectedError = 26;
|
||||
static final short doTagValue = 27;
|
||||
static final short doUnaryOpPlus = 28;
|
||||
static final short doUnaryOpQuestion = 29;
|
||||
static final short doUnaryOpStar = 30;
|
||||
static final short doVariableNameExpectedErr = 31;
|
||||
static final short doNoChain = 14;
|
||||
static final short doOptionEnd = 15;
|
||||
static final short doOptionStart = 16;
|
||||
static final short doReverseDir = 17;
|
||||
static final short doRuleChar = 18;
|
||||
static final short doRuleError = 19;
|
||||
static final short doRuleErrorAssignExpr = 20;
|
||||
static final short doScanUnicodeSet = 21;
|
||||
static final short doSlash = 22;
|
||||
static final short doStartAssign = 23;
|
||||
static final short doStartTagValue = 24;
|
||||
static final short doStartVariableName = 25;
|
||||
static final short doTagDigit = 26;
|
||||
static final short doTagExpectedError = 27;
|
||||
static final short doTagValue = 28;
|
||||
static final short doUnaryOpPlus = 29;
|
||||
static final short doUnaryOpQuestion = 30;
|
||||
static final short doUnaryOpStar = 31;
|
||||
static final short doVariableNameExpectedErr = 32;
|
||||
|
||||
static final short kRuleSet_default = 255;
|
||||
static final short kRuleSet_digit_char = 128;
|
||||
@ -73,104 +76,112 @@ class RBBIRuleParseTable
|
||||
fNextChar = nc;
|
||||
fStateName = sn;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static RBBIRuleTableElement[] gRuleParseStateTable = {
|
||||
new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0
|
||||
, new RBBIRuleTableElement(doExprStart, 254, 21, 8, false, "start") // 1
|
||||
, new RBBIRuleTableElement(doExprStart, 254, 29, 9, false, "start") // 1
|
||||
, new RBBIRuleTableElement(doNOP, 132, 1,0, true, null ) // 2
|
||||
, new RBBIRuleTableElement(doExprStart,'$', 80, 90, false, null ) // 3
|
||||
, new RBBIRuleTableElement(doNOP,'!', 11,0, true, null ) // 4
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 5
|
||||
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 6
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, null ) // 7
|
||||
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 8
|
||||
, new RBBIRuleTableElement(doNOP, 132, 8,0, true, null ) // 9
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 10
|
||||
, new RBBIRuleTableElement(doNOP,'!', 13,0, true, "rev-option") // 11
|
||||
, new RBBIRuleTableElement(doReverseDir, 255, 20, 8, false, null ) // 12
|
||||
, new RBBIRuleTableElement(doOptionStart, 130, 15,0, true, "option-scan1") // 13
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 14
|
||||
, new RBBIRuleTableElement(doNOP, 129, 15,0, true, "option-scan2") // 15
|
||||
, new RBBIRuleTableElement(doOptionEnd, 255, 17,0, false, null ) // 16
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 17
|
||||
, new RBBIRuleTableElement(doNOP, 132, 17,0, true, null ) // 18
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 19
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, "reverse-rule") // 20
|
||||
, new RBBIRuleTableElement(doRuleChar, 254, 30,0, true, "term") // 21
|
||||
, new RBBIRuleTableElement(doNOP, 132, 21,0, true, null ) // 22
|
||||
, new RBBIRuleTableElement(doRuleChar, 131, 30,0, true, null ) // 23
|
||||
, new RBBIRuleTableElement(doNOP,'[', 86, 30, false, null ) // 24
|
||||
, new RBBIRuleTableElement(doLParen,'(', 21, 30, true, null ) // 25
|
||||
, new RBBIRuleTableElement(doNOP,'$', 80, 29, false, null ) // 26
|
||||
, new RBBIRuleTableElement(doDotAny,'.', 30,0, true, null ) // 27
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 28
|
||||
, new RBBIRuleTableElement(doCheckVarDef, 255, 30,0, false, "term-var-ref") // 29
|
||||
, new RBBIRuleTableElement(doNOP, 132, 30,0, true, "expr-mod") // 30
|
||||
, new RBBIRuleTableElement(doUnaryOpStar,'*', 35,0, true, null ) // 31
|
||||
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 35,0, true, null ) // 32
|
||||
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 35,0, true, null ) // 33
|
||||
, new RBBIRuleTableElement(doNOP, 255, 35,0, false, null ) // 34
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont") // 35
|
||||
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 36
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 37
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 38
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 39
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 40
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 41
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 42
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'{', 59,0, true, null ) // 43
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 44
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 45
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 46
|
||||
, new RBBIRuleTableElement(doSlash,'/', 49,0, true, "look-ahead") // 47
|
||||
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 48
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-slash") // 49
|
||||
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 50
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 51
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 52
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 53
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 54
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 55
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 56
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 57
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 58
|
||||
, new RBBIRuleTableElement(doNOP, 132, 59,0, true, "tag-open") // 59
|
||||
, new RBBIRuleTableElement(doStartTagValue, 128, 62,0, false, null ) // 60
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 61
|
||||
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-value") // 62
|
||||
, new RBBIRuleTableElement(doNOP,'}', 66,0, false, null ) // 63
|
||||
, new RBBIRuleTableElement(doTagDigit, 128, 62,0, true, null ) // 64
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 65
|
||||
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-close") // 66
|
||||
, new RBBIRuleTableElement(doTagValue,'}', 69,0, true, null ) // 67
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 68
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-tag") // 69
|
||||
, new RBBIRuleTableElement(doNOP, 132, 69,0, true, null ) // 70
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 71
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 72
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 73
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 74
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 75
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 76
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 77
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 78
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 79
|
||||
, new RBBIRuleTableElement(doStartVariableName,'$', 82,0, true, "scan-var-name") // 80
|
||||
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 81
|
||||
, new RBBIRuleTableElement(doNOP, 130, 84,0, true, "scan-var-start") // 82
|
||||
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 95,0, false, null ) // 83
|
||||
, new RBBIRuleTableElement(doNOP, 129, 84,0, true, "scan-var-body") // 84
|
||||
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 85
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 86
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 87
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 88
|
||||
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 89
|
||||
, new RBBIRuleTableElement(doNOP, 132, 90,0, true, "assign-or-rule") // 90
|
||||
, new RBBIRuleTableElement(doStartAssign,'=', 21, 93, true, null ) // 91
|
||||
, new RBBIRuleTableElement(doNOP, 255, 29, 8, false, null ) // 92
|
||||
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 93
|
||||
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 95,0, false, null ) // 94
|
||||
, new RBBIRuleTableElement(doExit, 255, 95,0, true, "errorDeath") // 95
|
||||
, new RBBIRuleTableElement(doNoChain,'^', 12, 9, true, null ) // 3
|
||||
, new RBBIRuleTableElement(doExprStart,'$', 88, 98, false, null ) // 4
|
||||
, new RBBIRuleTableElement(doNOP,'!', 19,0, true, null ) // 5
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 6
|
||||
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 7
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, null ) // 8
|
||||
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 9
|
||||
, new RBBIRuleTableElement(doNOP, 132, 9,0, true, null ) // 10
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 11
|
||||
, new RBBIRuleTableElement(doExprStart, 254, 29,0, false, "start-after-caret") // 12
|
||||
, new RBBIRuleTableElement(doNOP, 132, 12,0, true, null ) // 13
|
||||
, new RBBIRuleTableElement(doRuleError,'^', 103,0, false, null ) // 14
|
||||
, new RBBIRuleTableElement(doExprStart,'$', 88, 37, false, null ) // 15
|
||||
, new RBBIRuleTableElement(doRuleError,';', 103,0, false, null ) // 16
|
||||
, new RBBIRuleTableElement(doRuleError, 252, 103,0, false, null ) // 17
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 29,0, false, null ) // 18
|
||||
, new RBBIRuleTableElement(doNOP,'!', 21,0, true, "rev-option") // 19
|
||||
, new RBBIRuleTableElement(doReverseDir, 255, 28, 9, false, null ) // 20
|
||||
, new RBBIRuleTableElement(doOptionStart, 130, 23,0, true, "option-scan1") // 21
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 22
|
||||
, new RBBIRuleTableElement(doNOP, 129, 23,0, true, "option-scan2") // 23
|
||||
, new RBBIRuleTableElement(doOptionEnd, 255, 25,0, false, null ) // 24
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 25
|
||||
, new RBBIRuleTableElement(doNOP, 132, 25,0, true, null ) // 26
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 27
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, "reverse-rule") // 28
|
||||
, new RBBIRuleTableElement(doRuleChar, 254, 38,0, true, "term") // 29
|
||||
, new RBBIRuleTableElement(doNOP, 132, 29,0, true, null ) // 30
|
||||
, new RBBIRuleTableElement(doRuleChar, 131, 38,0, true, null ) // 31
|
||||
, new RBBIRuleTableElement(doNOP,'[', 94, 38, false, null ) // 32
|
||||
, new RBBIRuleTableElement(doLParen,'(', 29, 38, true, null ) // 33
|
||||
, new RBBIRuleTableElement(doNOP,'$', 88, 37, false, null ) // 34
|
||||
, new RBBIRuleTableElement(doDotAny,'.', 38,0, true, null ) // 35
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 36
|
||||
, new RBBIRuleTableElement(doCheckVarDef, 255, 38,0, false, "term-var-ref") // 37
|
||||
, new RBBIRuleTableElement(doNOP, 132, 38,0, true, "expr-mod") // 38
|
||||
, new RBBIRuleTableElement(doUnaryOpStar,'*', 43,0, true, null ) // 39
|
||||
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 43,0, true, null ) // 40
|
||||
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 43,0, true, null ) // 41
|
||||
, new RBBIRuleTableElement(doNOP, 255, 43,0, false, null ) // 42
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont") // 43
|
||||
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 44
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 45
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 46
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 47
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 48
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 49
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 50
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'{', 67,0, true, null ) // 51
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 52
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 53
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 54
|
||||
, new RBBIRuleTableElement(doSlash,'/', 57,0, true, "look-ahead") // 55
|
||||
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 56
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-slash") // 57
|
||||
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 58
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 59
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 60
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 61
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 62
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 63
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 64
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 65
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 66
|
||||
, new RBBIRuleTableElement(doNOP, 132, 67,0, true, "tag-open") // 67
|
||||
, new RBBIRuleTableElement(doStartTagValue, 128, 70,0, false, null ) // 68
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 69
|
||||
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-value") // 70
|
||||
, new RBBIRuleTableElement(doNOP,'}', 74,0, false, null ) // 71
|
||||
, new RBBIRuleTableElement(doTagDigit, 128, 70,0, true, null ) // 72
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 73
|
||||
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-close") // 74
|
||||
, new RBBIRuleTableElement(doTagValue,'}', 77,0, true, null ) // 75
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 76
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-tag") // 77
|
||||
, new RBBIRuleTableElement(doNOP, 132, 77,0, true, null ) // 78
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 79
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 80
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 81
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 82
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 83
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 84
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 85
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 86
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 87
|
||||
, new RBBIRuleTableElement(doStartVariableName,'$', 90,0, true, "scan-var-name") // 88
|
||||
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 89
|
||||
, new RBBIRuleTableElement(doNOP, 130, 92,0, true, "scan-var-start") // 90
|
||||
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 103,0, false, null ) // 91
|
||||
, new RBBIRuleTableElement(doNOP, 129, 92,0, true, "scan-var-body") // 92
|
||||
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 93
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 94
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 95
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 96
|
||||
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 97
|
||||
, new RBBIRuleTableElement(doNOP, 132, 98,0, true, "assign-or-rule") // 98
|
||||
, new RBBIRuleTableElement(doStartAssign,'=', 29, 101, true, null ) // 99
|
||||
, new RBBIRuleTableElement(doNOP, 255, 37, 9, false, null ) // 100
|
||||
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 101
|
||||
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 103,0, false, null ) // 102
|
||||
, new RBBIRuleTableElement(doExit, 255, 103,0, true, "errorDeath") // 103
|
||||
};
|
||||
}
|
||||
};
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
|
||||
* Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.ParsePosition;
|
||||
@ -19,12 +19,12 @@ import com.ibm.icu.lang.UCharacter;
|
||||
* There is no public API here.
|
||||
*/
|
||||
class RBBIRuleScanner {
|
||||
|
||||
|
||||
private final static int kStackSize = 100; // The size of the state stack for
|
||||
// rules parsing. Corresponds roughly
|
||||
// to the depth of parentheses nesting
|
||||
// that is allowed in the rules.
|
||||
|
||||
|
||||
static class RBBIRuleChar {
|
||||
int fChar;
|
||||
boolean fEscaped;
|
||||
@ -33,7 +33,7 @@ class RBBIRuleScanner {
|
||||
|
||||
|
||||
RBBIRuleBuilder fRB; // The rule builder that we are part of.
|
||||
|
||||
|
||||
int fScanIndex; // Index of current character being processed
|
||||
// in the rule input string.
|
||||
int fNextIndex; // Index of the next character, which
|
||||
@ -43,49 +43,52 @@ class RBBIRuleScanner {
|
||||
int fCharNum; // Char position within the line.
|
||||
int fLastChar; // Previous char, needed to count CR-LF
|
||||
// as a single line, not two.
|
||||
|
||||
|
||||
RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
|
||||
// processing.
|
||||
String fVarName; // $variableName, valid when we've just
|
||||
// scanned one.
|
||||
|
||||
|
||||
|
||||
|
||||
short fStack[] = new short[kStackSize]; // State stack, holds state pushes
|
||||
int fStackPtr; // and pops as specified in the state
|
||||
// transition rules.
|
||||
|
||||
|
||||
RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
|
||||
// during the parse of a rule
|
||||
int fNodeStackPtr;
|
||||
|
||||
|
||||
boolean fReverseRule; // True if the rule currently being scanned
|
||||
|
||||
|
||||
boolean fReverseRule; // True if the rule currently being scanned
|
||||
// is a reverse direction rule (if it
|
||||
// starts with a '!')
|
||||
|
||||
boolean fLookAheadRule; // True if the rule includes a '/'
|
||||
|
||||
boolean fLookAheadRule; // True if the rule includes a '/'
|
||||
// somewhere within it.
|
||||
|
||||
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
|
||||
|
||||
boolean fNoChainInRule; // True if the current rule starts with a '^'.
|
||||
|
||||
|
||||
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
|
||||
// $variable symbols.
|
||||
|
||||
|
||||
HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicodeSet hash table, holds indexes to
|
||||
// the sets created while parsing rules.
|
||||
// The key is the string used for creating
|
||||
// the set.
|
||||
|
||||
|
||||
UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
|
||||
// the scanning of RBBI rules. The
|
||||
// indices for these are assigned by the
|
||||
// perl script that builds the state tables.
|
||||
// See rbbirpt.h.
|
||||
|
||||
|
||||
int fRuleNum; // Counts each rule as it is scanned.
|
||||
|
||||
|
||||
int fOptionStart; // Input index of start of a !!option
|
||||
// keyword, while being scanned.
|
||||
|
||||
|
||||
|
||||
|
||||
static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
|
||||
static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
|
||||
@ -94,8 +97,8 @@ class RBBIRuleScanner {
|
||||
static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
|
||||
static private String kAny = "any";
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
@ -139,6 +142,12 @@ class RBBIRuleScanner {
|
||||
fRuleNum++;
|
||||
break;
|
||||
|
||||
case RBBIRuleParseTable.doNoChain:
|
||||
// Scanned a '^' while on the rule start state.
|
||||
fNoChainInRule = true;
|
||||
break;
|
||||
|
||||
|
||||
case RBBIRuleParseTable.doExprOrOperator: {
|
||||
fixOpStack(RBBINode.precOpCat);
|
||||
RBBINode operandNode = fNodeStack[fNodeStackPtr--];
|
||||
@ -241,11 +250,11 @@ class RBBIRuleScanner {
|
||||
printNodeStack("end of rule");
|
||||
}
|
||||
Assert.assrt(fNodeStackPtr == 1);
|
||||
RBBINode thisRule = fNodeStack[fNodeStackPtr];
|
||||
|
||||
// If this rule includes a look-ahead '/', add an endMark node to the
|
||||
// expression tree.
|
||||
if (fLookAheadRule) {
|
||||
RBBINode thisRule = fNodeStack[fNodeStackPtr];
|
||||
RBBINode endNode = pushNewNode(RBBINode.endMark);
|
||||
RBBINode catNode = pushNewNode(RBBINode.opCat);
|
||||
fNodeStackPtr -= 2;
|
||||
@ -254,8 +263,24 @@ class RBBIRuleScanner {
|
||||
fNodeStack[fNodeStackPtr] = catNode;
|
||||
endNode.fVal = fRuleNum;
|
||||
endNode.fLookAheadEnd = true;
|
||||
thisRule = catNode;
|
||||
|
||||
// TODO: Disable chaining out of look-ahead (hard break) rules.
|
||||
// The break on rule match is forced, so there is no point in building up
|
||||
// the state table to chain into another rule for a longer match.
|
||||
}
|
||||
|
||||
// Mark this node as being the root of a rule.
|
||||
thisRule.fRuleRoot = true;
|
||||
|
||||
// Flag if chaining into this rule is wanted.
|
||||
//
|
||||
if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain
|
||||
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
|
||||
thisRule.fChainIn = true;
|
||||
}
|
||||
|
||||
|
||||
// All rule expressions are ORed together.
|
||||
// The ';' that terminates an expression really just functions as a
|
||||
// '|' with
|
||||
@ -269,12 +294,12 @@ class RBBIRuleScanner {
|
||||
int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
|
||||
|
||||
if (fRB.fTreeRoots[destRules] != null) {
|
||||
// This is not the first rule encounted.
|
||||
// This is not the first rule encountered.
|
||||
// OR previous stuff (from *destRules)
|
||||
// with the current rule expression (on the Node Stack)
|
||||
// with the resulting OR expression going to *destRules
|
||||
//
|
||||
RBBINode thisRule = fNodeStack[fNodeStackPtr];
|
||||
thisRule = fNodeStack[fNodeStackPtr];
|
||||
RBBINode prevRules = fRB.fTreeRoots[destRules];
|
||||
RBBINode orNode = pushNewNode(RBBINode.opOr);
|
||||
orNode.fLeftChild = prevRules;
|
||||
@ -289,6 +314,7 @@ class RBBIRuleScanner {
|
||||
}
|
||||
fReverseRule = false; // in preparation for the next rule.
|
||||
fLookAheadRule = false;
|
||||
fNoChainInRule = false;
|
||||
fNodeStackPtr = 0;
|
||||
}
|
||||
break;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2009, International Business Machines
|
||||
* Copyright (c) 2002-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -28,9 +28,9 @@ import com.ibm.icu.lang.UProperty;
|
||||
// There is no user-visible public API here.
|
||||
//
|
||||
class RBBITableBuilder {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
|
||||
// one for each state.
|
||||
@ -58,8 +58,8 @@ class RBBITableBuilder {
|
||||
// symbol.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private RBBIRuleBuilder fRB;
|
||||
private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
|
||||
// for the parse tree to operate on.
|
||||
@ -84,7 +84,7 @@ class RBBITableBuilder {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// RBBITableBuilder::build - This is the main function for building the DFA state transition
|
||||
@ -109,11 +109,11 @@ class RBBITableBuilder {
|
||||
}
|
||||
|
||||
//
|
||||
// If the rules contained any references to {bof}
|
||||
// If the rules contained any references to {bof}
|
||||
// add a {bof} <cat> <former root of tree> to the
|
||||
// tree. Means that all matches must start out with the
|
||||
// tree. Means that all matches must start out with the
|
||||
// {bof} fake character.
|
||||
//
|
||||
//
|
||||
if (fRB.fSetBuilder.sawBOF()) {
|
||||
RBBINode bofTop = new RBBINode(RBBINode.opCat);
|
||||
RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
|
||||
@ -361,6 +361,25 @@ class RBBITableBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
|
||||
// as roots of a rule to a destination vector.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
|
||||
if (node == null) {
|
||||
return;
|
||||
}
|
||||
if (node.fRuleRoot) {
|
||||
dest.add(node);
|
||||
// Note: rules cannot nest. If we found a rule start node,
|
||||
// no child node can also be a start node.
|
||||
return;
|
||||
}
|
||||
addRuleRootNodes(dest, node.fLeftChild);
|
||||
addRuleRootNodes(dest, node.fRightChild);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
@ -379,17 +398,21 @@ class RBBITableBuilder {
|
||||
// get a list all leaf nodes
|
||||
tree.findNodes(leafNodes, RBBINode.leafChar);
|
||||
|
||||
// Get all nodes that can be the start a match, which is FirstPosition()
|
||||
// of the portion of the tree corresponding to user-written rules.
|
||||
// See the tree description in bofFixup().
|
||||
RBBINode userRuleRoot = tree;
|
||||
if (fRB.fSetBuilder.sawBOF()) {
|
||||
userRuleRoot = tree.fLeftChild.fRightChild;
|
||||
}
|
||||
Assert.assrt(userRuleRoot != null);
|
||||
Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;
|
||||
// Collect all leaf nodes that can start matches for rules
|
||||
// with inbound chaining enabled, which is the union of the
|
||||
// firstPosition sets from each of the rule root nodes.
|
||||
|
||||
// Iteratate over all leaf nodes,
|
||||
List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
|
||||
addRuleRootNodes(ruleRootNodes, tree);
|
||||
|
||||
Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
|
||||
for (RBBINode node: ruleRootNodes) {
|
||||
if (node.fChainIn) {
|
||||
matchStartNodes.addAll(node.fFirstPosSet);
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over all leaf nodes,
|
||||
//
|
||||
for (RBBINode tNode : leafNodes) {
|
||||
RBBINode endNode = null;
|
||||
@ -461,9 +484,9 @@ class RBBITableBuilder {
|
||||
//
|
||||
// The parse tree looks like this ...
|
||||
// fTree root --. <cat>
|
||||
// / \
|
||||
// / \
|
||||
// <cat> <#end node>
|
||||
// / \
|
||||
// / \
|
||||
// <bofNode> rest
|
||||
// of tree
|
||||
//
|
||||
@ -477,7 +500,7 @@ class RBBITableBuilder {
|
||||
// (excluding the fake bofNode)
|
||||
// We want the nodes that can start a match in the
|
||||
// part labeled "rest of tree"
|
||||
//
|
||||
//
|
||||
Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
|
||||
for (RBBINode startNode : matchStartNodes) {
|
||||
if (startNode.fType != RBBINode.leafChar) {
|
||||
@ -489,7 +512,7 @@ class RBBITableBuilder {
|
||||
// explicitly written into a rule.
|
||||
// Add everything from the followPos set of this node to the
|
||||
// followPos set of the fake bofNode at the start of the tree.
|
||||
//
|
||||
//
|
||||
bofNode.fFollowPos.addAll(startNode.fFollowPos);
|
||||
}
|
||||
}
|
||||
@ -705,7 +728,7 @@ class RBBITableBuilder {
|
||||
// The RBBI runtime uses an array of {sets of status values} that can
|
||||
// be returned for boundaries. Each accepting state that has non-zero
|
||||
// status includes an index into this array. The format of the array
|
||||
// is
|
||||
// is
|
||||
// Num of status values in group 1
|
||||
// status val
|
||||
// status val
|
||||
@ -718,7 +741,7 @@ class RBBITableBuilder {
|
||||
//
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void mergeRuleStatusVals() {
|
||||
//
|
||||
// The basic outline of what happens here is this...
|
||||
@ -731,14 +754,14 @@ class RBBITableBuilder {
|
||||
// add the tag list for this state to the global list.
|
||||
//
|
||||
int n;
|
||||
|
||||
|
||||
// Pre-load a single tag of {0} into the table.
|
||||
// We will need this as a default, for rule sets with no explicit tagging,
|
||||
// or with explicit tagging of {0}.
|
||||
if (fRB.fRuleStatusVals.size() == 0) {
|
||||
fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group
|
||||
fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero
|
||||
|
||||
|
||||
SortedSet<Integer> s0 = new TreeSet<Integer>();
|
||||
Integer izero = Integer.valueOf(0);
|
||||
fRB.fStatusSets.put(s0, izero);
|
||||
@ -756,17 +779,17 @@ class RBBITableBuilder {
|
||||
if (arrayIndexI == null) {
|
||||
// This is the first encounter of this set of status values.
|
||||
// Add them to the statusSets map, This map associates
|
||||
// the set of status values with an index in the runtime status
|
||||
// the set of status values with an index in the runtime status
|
||||
// values array.
|
||||
arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());
|
||||
fRB.fStatusSets.put(statusVals, arrayIndexI);
|
||||
|
||||
|
||||
// Add the new set of status values to the vector of values that
|
||||
// will eventually become the array used by the runtime engine.
|
||||
fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));
|
||||
fRB.fRuleStatusVals.addAll(statusVals);
|
||||
}
|
||||
|
||||
|
||||
// Save the runtime array index back into the state descriptor.
|
||||
sd.fTagsIdx = arrayIndexI.intValue();
|
||||
}
|
||||
@ -784,7 +807,7 @@ class RBBITableBuilder {
|
||||
// for each node in the tree.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void printPosSets(RBBINode n) {
|
||||
if (n==null) {
|
||||
return;
|
||||
@ -804,7 +827,7 @@ class RBBITableBuilder {
|
||||
printPosSets(n.fLeftChild);
|
||||
printPosSets(n.fRightChild);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -860,7 +883,7 @@ class RBBITableBuilder {
|
||||
// See struct RBBIStateTable in ICU4C, common/rbbidata.h
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
short [] exportTable() {
|
||||
int state;
|
||||
int col;
|
||||
@ -870,18 +893,18 @@ class RBBITableBuilder {
|
||||
}
|
||||
|
||||
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
|
||||
fDStates.size() < 0x7fff);
|
||||
fDStates.size() < 0x7fff);
|
||||
|
||||
int numStates = fDStates.size();
|
||||
|
||||
|
||||
// Size of table size in shorts.
|
||||
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
|
||||
int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
|
||||
int tableSize = getTableSize() / 2;
|
||||
|
||||
|
||||
|
||||
short [] table = new short[tableSize];
|
||||
|
||||
|
||||
//
|
||||
// Fill in the header fields.
|
||||
// Annoying because they really want to be ints, not shorts.
|
||||
@ -893,7 +916,7 @@ class RBBITableBuilder {
|
||||
// RBBIStateTable.fRowLen
|
||||
table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16);
|
||||
table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
|
||||
|
||||
|
||||
// RBBIStateTable.fFlags
|
||||
int flags = 0;
|
||||
if (fRB.fLookAheadHardBreak) {
|
||||
@ -904,7 +927,7 @@ class RBBITableBuilder {
|
||||
}
|
||||
table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16);
|
||||
table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
|
||||
|
||||
|
||||
int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
|
||||
for (state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
@ -928,14 +951,14 @@ class RBBITableBuilder {
|
||||
// printSet Debug function. Print the contents of a set of Nodes
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void printSet(Collection<RBBINode> s) {
|
||||
for (RBBINode n : s) {
|
||||
RBBINode.printInt(n.fSerialNum, 8);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
@ -943,7 +966,7 @@ class RBBITableBuilder {
|
||||
// printStates Debug Function. Dump the fully constructed state transition table.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void printStates() {
|
||||
int c; // input "character"
|
||||
int n; // state number
|
||||
@ -964,7 +987,7 @@ class RBBITableBuilder {
|
||||
RBBIStateDescriptor sd = fDStates.get(n);
|
||||
RBBINode.printInt(n, 5);
|
||||
System.out.print(" | ");
|
||||
|
||||
|
||||
RBBINode.printInt(sd.fAccepting, 3);
|
||||
RBBINode.printInt(sd.fLookAhead, 4);
|
||||
RBBINode.printInt(sd.fTagsIdx, 6);
|
||||
@ -976,7 +999,7 @@ class RBBITableBuilder {
|
||||
}
|
||||
System.out.print("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -985,7 +1008,7 @@ class RBBITableBuilder {
|
||||
// printRuleStatusTable Debug Function. Dump the common rule status table
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void printRuleStatusTable() {
|
||||
int thisRecord = 0;
|
||||
int nextRecord = 0;
|
||||
@ -1007,7 +1030,7 @@ class RBBITableBuilder {
|
||||
}
|
||||
System.out.print("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -30,17 +30,17 @@ import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
||||
/**
|
||||
* Rule Based Break Iterator
|
||||
* Rule Based Break Iterator
|
||||
* This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
|
||||
*
|
||||
*
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class RuleBasedBreakIterator extends BreakIterator {
|
||||
//=======================================================================
|
||||
// Constructors & Factories
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
|
||||
/**
|
||||
* private constructor
|
||||
*/
|
||||
private RuleBasedBreakIterator() {
|
||||
@ -51,14 +51,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
|
||||
/**
|
||||
* Create a break iterator from a precompiled set of break rules.
|
||||
*
|
||||
*
|
||||
* Creating a break iterator from the binary rules is much faster than
|
||||
* creating one from source rules.
|
||||
*
|
||||
* creating one from source rules.
|
||||
*
|
||||
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
|
||||
* Binary break iterator rules are not guaranteed to be compatible between
|
||||
* different versions of ICU.
|
||||
*
|
||||
*
|
||||
* @param is an input stream supplying the compiled binary rules.
|
||||
* @throws IOException if there is an error while reading the rules from the InputStream.
|
||||
* @see #compileRules(String, OutputStream)
|
||||
@ -67,7 +67,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
|
||||
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
|
||||
This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
|
||||
return This;
|
||||
return This;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -129,7 +129,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
{
|
||||
RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
|
||||
if (fText != null) {
|
||||
result.fText = (CharacterIterator)(fText.clone());
|
||||
result.fText = (CharacterIterator)(fText.clone());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@ -151,15 +151,15 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
|
||||
return false;
|
||||
}
|
||||
if (fRData != null && other.fRData != null &&
|
||||
if (fRData != null && other.fRData != null &&
|
||||
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
|
||||
return false;
|
||||
}
|
||||
if (fText == null && other.fText == null) {
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
if (fText == null || other.fText == null) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
return fText.equals(other.fText);
|
||||
}
|
||||
@ -188,13 +188,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
*/
|
||||
public int hashCode()
|
||||
{
|
||||
return fRData.fRuleSource.hashCode();
|
||||
return fRData.fRuleSource.hashCode();
|
||||
}
|
||||
|
||||
|
||||
private static final int START_STATE = 1; // The state number of the starting state
|
||||
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
|
||||
|
||||
|
||||
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
|
||||
// of user text. A variable with this enum type keeps track of where we
|
||||
// are. The state machine only fetches user text input while in RUN mode.
|
||||
@ -206,14 +206,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
* The character iterator through which this BreakIterator accesses the text.
|
||||
*/
|
||||
private CharacterIterator fText = new java.text.StringCharacterIterator("");
|
||||
|
||||
|
||||
/**
|
||||
* The rule data for this BreakIterator instance. Package private.
|
||||
*/
|
||||
RBBIDataWrapper fRData;
|
||||
|
||||
|
||||
/*
|
||||
* Index of the Rule {tag} values for the most recent match.
|
||||
* Index of the Rule {tag} values for the most recent match.
|
||||
*/
|
||||
private int fLastRuleStatusIndex;
|
||||
|
||||
@ -245,18 +245,18 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
|
||||
|
||||
/**
|
||||
* What kind of break iterator this is. Set to KIND_LINE by default,
|
||||
* What kind of break iterator this is. Set to KIND_LINE by default,
|
||||
* since this produces sensible output.
|
||||
*/
|
||||
private int fBreakType = KIND_LINE;
|
||||
|
||||
|
||||
/**
|
||||
* The "default" break engine - just skips over ranges of dictionary words,
|
||||
* producing no breaks. Should only be used if characters need to be handled
|
||||
* by a dictionary but we have no dictionary implementation for them.
|
||||
*/
|
||||
private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
|
||||
|
||||
|
||||
/**
|
||||
* when a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
@ -271,8 +271,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
*/
|
||||
private int fPositionInCache;
|
||||
|
||||
|
||||
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
|
||||
|
||||
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
|
||||
new ConcurrentHashMap<Integer, LanguageBreakEngine>();
|
||||
/**
|
||||
* Dumps caches and performs other actions associated with a complete change
|
||||
@ -293,18 +293,18 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
*/
|
||||
@Deprecated
|
||||
public void dump() {
|
||||
this.fRData.dump();
|
||||
this.fRData.dump();
|
||||
}
|
||||
|
||||
/**
|
||||
* Compile a set of source break rules into the binary state tables used
|
||||
* by the break iterator engine. Creating a break iterator from precompiled
|
||||
* rules is much faster than creating one from source rules.
|
||||
*
|
||||
*
|
||||
* Binary break rules are not guaranteed to be compatible between different
|
||||
* versions of ICU.
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* @param rules The source form of the break rules
|
||||
* @param ruleBinary An output stream to receive the compiled rules.
|
||||
* @throws IOException If there is an error writing the output.
|
||||
@ -314,7 +314,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
|
||||
RBBIRuleBuilder.compileRules(rules, ruleBinary);
|
||||
}
|
||||
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
@ -337,7 +337,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
fText.first();
|
||||
return fText.getIndex();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the end of the text.
|
||||
* (i.e., the CharacterIterator's ending offset).
|
||||
@ -364,7 +364,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
fText.setIndex(pos);
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Advances the iterator either forward or backward the specified number of steps.
|
||||
* Negative values move backward, and positive values move forward. This is
|
||||
@ -387,7 +387,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Advances the iterator to the next boundary position.
|
||||
* @return The position of the first boundary after this one.
|
||||
@ -424,11 +424,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
* process.
|
||||
*/
|
||||
private int checkDictionary(int startPos, int endPos, boolean reverse) {
|
||||
|
||||
|
||||
// Reset the old break cache first.
|
||||
reset();
|
||||
|
||||
// note: code segment below assumes that dictionary chars are in the
|
||||
// note: code segment below assumes that dictionary chars are in the
|
||||
// startPos-endPos range
|
||||
// value returned should be next character in sequence
|
||||
if ((endPos - startPos) <= 1) {
|
||||
@ -465,7 +465,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
} while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
|
||||
|
||||
|
||||
// Back up to the last dictionary character
|
||||
rangeEnd = fText.getIndex();
|
||||
if (c == CharacterIteration.DONE32) {
|
||||
@ -497,7 +497,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Loop through the text, looking for ranges of dictionary characters.
|
||||
// For each span, find the appropriate break engine, and ask it to find
|
||||
// any breaks within the span.
|
||||
@ -518,11 +518,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
if (current >= rangeEnd) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
// We now have a dictionary character. Get the appropriate language object
|
||||
// to deal with it.
|
||||
lbe = getLanguageBreakEngine(c);
|
||||
|
||||
|
||||
// Ask the language object if there are any breaks. It will leave the text
|
||||
// pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != null) {
|
||||
@ -530,12 +530,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
|
||||
assert fText.getIndex() > startingIdx;
|
||||
}
|
||||
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
}
|
||||
|
||||
|
||||
// If we found breaks, build a new break cache. The first and last entries must
|
||||
// be the original starting and ending position.
|
||||
if (foundBreakCount > 0) {
|
||||
@ -549,15 +549,15 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
if (endPos > breaks.peek()) {
|
||||
breaks.push(endPos);
|
||||
}
|
||||
|
||||
|
||||
// TODO: get rid of this array, use results from the deque directly
|
||||
fCachedBreakPositions = new int[breaks.size()];
|
||||
|
||||
|
||||
int i = 0;
|
||||
while (breaks.size() > 0) {
|
||||
fCachedBreakPositions[i++] = breaks.pollLast();
|
||||
}
|
||||
|
||||
|
||||
// If there are breaks, then by definition, we are replacing the original
|
||||
// proposed break by one of the breaks we found. Use following() and
|
||||
// preceding() to do the work. They should never recurse in this case.
|
||||
@ -573,10 +573,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// to the original proposed break.
|
||||
fText.setIndex(reverse ? startPos : endPos);
|
||||
return (reverse ? startPos : endPos);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Moves the iterator backwards, to the last boundary preceding this one.
|
||||
* @return The position of the last boundary position preceding this one.
|
||||
@ -585,7 +585,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
public int previous() {
|
||||
int result;
|
||||
int startPos;
|
||||
|
||||
|
||||
CharacterIterator text = getText();
|
||||
|
||||
fLastStatusIndexValid = false;
|
||||
@ -705,7 +705,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
return text.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private int rulesFollowing(int offset) {
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
@ -744,7 +744,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
}
|
||||
if (fRData.fSFTable != null) {
|
||||
// No Safe point reverse table, but there is a safe pt forward table.
|
||||
//
|
||||
//
|
||||
fText.setIndex(offset);
|
||||
previous32(fText);
|
||||
// handle next will give result >= offset
|
||||
@ -820,7 +820,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
return text.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private int rulesPreceding(int offset) {
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
@ -1002,7 +1002,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the status (tag) values from the break rule(s) that determined the most
|
||||
* Get the status (tag) values from the break rule(s) that determined the most
|
||||
* recently returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. The default status value for rules
|
||||
* that do not explicitly provide one is zero.
|
||||
@ -1014,8 +1014,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
* the output will be truncated to the available length. No exception
|
||||
* will be thrown.
|
||||
*
|
||||
* @param fillInArray an array to be filled in with the status values.
|
||||
* @return The number of rule status values from rules that determined
|
||||
* @param fillInArray an array to be filled in with the status values.
|
||||
* @return The number of rule status values from rules that determined
|
||||
* the most recent boundary returned by the break iterator.
|
||||
* In the event that the array is too small, the return value
|
||||
* is the total number of status values that were available,
|
||||
@ -1026,7 +1026,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
public int getRuleStatusVec(int[] fillInArray) {
|
||||
makeRuleStatusValid();
|
||||
int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
|
||||
if (fillInArray != null) {
|
||||
if (fillInArray != null) {
|
||||
int numToCopy = Math.min(numStatusVals, fillInArray.length);
|
||||
for (int i=0; i<numToCopy; i++) {
|
||||
fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
|
||||
@ -1079,8 +1079,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
*/
|
||||
static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
|
||||
ICUDebug.value(RBBI_DEBUG_ARG) : null;
|
||||
|
||||
|
||||
|
||||
|
||||
private LanguageBreakEngine getLanguageBreakEngine(int c) {
|
||||
|
||||
// We have a dictionary character.
|
||||
@ -1098,7 +1098,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// Fold them together for mapping from script -> engine.
|
||||
script = UScript.HAN;
|
||||
}
|
||||
|
||||
|
||||
LanguageBreakEngine eng = fBreakEngines.get(script);
|
||||
/*
|
||||
if (eng != null && !eng.handles(c, fBreakType)) {
|
||||
@ -1158,15 +1158,60 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
return eng;
|
||||
}
|
||||
|
||||
|
||||
private static final int kMaxLookaheads = 8;
|
||||
private static class LookAheadResults {
|
||||
int fUsedSlotLimit;
|
||||
int[] fPositions;
|
||||
int[] fKeys;
|
||||
|
||||
LookAheadResults() {
|
||||
fUsedSlotLimit= 0;
|
||||
fPositions = new int[kMaxLookaheads];
|
||||
fKeys = new int[kMaxLookaheads];
|
||||
}
|
||||
|
||||
int getPosition(int key) {
|
||||
for (int i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
return fPositions[i];
|
||||
}
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
}
|
||||
|
||||
void setPosition(int key, int position) {
|
||||
int i;
|
||||
for (i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
fPositions[i] = position;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (i >= kMaxLookaheads) {
|
||||
assert(false);
|
||||
i = kMaxLookaheads - 1;
|
||||
}
|
||||
fKeys[i] = key;
|
||||
fPositions[i] = position;
|
||||
assert(fUsedSlotLimit == i);
|
||||
fUsedSlotLimit = i + 1;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
fUsedSlotLimit = 0;
|
||||
}
|
||||
};
|
||||
private LookAheadResults fLookAheadMatches = new LookAheadResults();
|
||||
|
||||
|
||||
/**
|
||||
* The State Machine Engine for moving forward is here.
|
||||
* This function is the heart of the RBBI run time engine.
|
||||
*
|
||||
*
|
||||
* @param stateTable
|
||||
* @return the new iterator position
|
||||
*
|
||||
*
|
||||
* A note on supplementary characters and the position of underlying
|
||||
* Java CharacterIterator: Normally, a character iterator is positioned at
|
||||
* the char most recently returned by next(). Within this function, when
|
||||
@ -1201,7 +1246,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
|
||||
// Set the initial state for the state machine
|
||||
int state = START_STATE;
|
||||
int row = fRData.getRowIndex(state);
|
||||
int row = fRData.getRowIndex(state);
|
||||
short category = 3;
|
||||
int flagsState = fRData.getStateTableFlags(stateTable);
|
||||
int mode = RBBI_RUN;
|
||||
@ -1209,14 +1254,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
category = 2;
|
||||
mode = RBBI_START;
|
||||
if (TRACE) {
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||||
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||||
}
|
||||
}
|
||||
int lookaheadStatus = 0;
|
||||
int lookaheadTagIdx = 0;
|
||||
int lookaheadResult = 0;
|
||||
fLookAheadMatches.reset();
|
||||
|
||||
// loop until we reach the end of the text or transition to state 0
|
||||
while (state != STOP_STATE) {
|
||||
@ -1226,16 +1269,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// We have already run the loop one last time with the
|
||||
// character set to the pseudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
|
||||
if (lookaheadResult > result) {
|
||||
// We ran off the end of the string with a pending
|
||||
// look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been
|
||||
// met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Run the loop one last time with the fake end-of-input character category
|
||||
@ -1252,7 +1285,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
category = (short) trie.getCodePointValue(c);
|
||||
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
@ -1265,15 +1298,15 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
}
|
||||
|
||||
if (TRACE) {
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||||
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||||
}
|
||||
|
||||
// Advance to the next character.
|
||||
// Advance to the next character.
|
||||
// If this is a beginning-of-input loop iteration, don't advance.
|
||||
// The next iteration will be processing the first real input character.
|
||||
c = (int)text.next();
|
||||
c = (int)text.next();
|
||||
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
c = nextTrail32(text, c);
|
||||
}
|
||||
@ -1284,7 +1317,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
|
||||
// look up a state transition in the state table
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fRData.getRowIndex(state);
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
|
||||
// Match found, common case
|
||||
@ -1299,40 +1332,30 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
|
||||
}
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
// TODO: make a standalone hard break in a rule work.
|
||||
if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
|
||||
text.setIndex(result);
|
||||
return result;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further. Continue on.
|
||||
// TODO: junk this feature? I don't think it's used anywhere.
|
||||
continue;
|
||||
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
|
||||
text.setIndex(lookaheadResult);
|
||||
return lookaheadResult;
|
||||
}
|
||||
}
|
||||
|
||||
lookaheadResult = text.getIndex();
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int pos = text.getIndex();
|
||||
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
// The iterator has been left in the middle of a surrogate pair.
|
||||
// We want the beginning of it.
|
||||
lookaheadResult--;
|
||||
pos--;
|
||||
}
|
||||
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
|
||||
continue;
|
||||
fLookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
|
||||
// Because this is an accepting state, any in-progress look-ahead match
|
||||
// is no longer relevant. Clear out the pending lookahead status.
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
|
||||
} // End of state machine main loop
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
@ -1340,7 +1363,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// If the iterator failed to advance in the match engine force it ahead by one.
|
||||
// This indicates a defect in the break rules, which should always match
|
||||
// at least one character.
|
||||
|
||||
|
||||
if (result == initialPosition) {
|
||||
if (TRACE) {
|
||||
System.out.println("Iterator did not move. Advancing by 1.");
|
||||
@ -1365,31 +1388,28 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
if (fText == null || stateTable == null) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int state;
|
||||
int category = 0;
|
||||
int mode;
|
||||
int row;
|
||||
int row;
|
||||
int c;
|
||||
int lookaheadStatus = 0;
|
||||
int result = 0;
|
||||
int initialPosition = 0;
|
||||
int lookaheadResult = 0;
|
||||
boolean lookAheadHardBreak =
|
||||
(fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
fLookAheadMatches.reset();
|
||||
|
||||
// handlePrevious() never gets the rule status.
|
||||
// Flag the status as invalid; if the user ever asks for status, we will need
|
||||
// to back up, then re-find the break position using handleNext(), which does
|
||||
// get the status value.
|
||||
fLastStatusIndexValid = false;
|
||||
fLastRuleStatusIndex = 0;
|
||||
|
||||
|
||||
// set up the starting char
|
||||
initialPosition = fText.getIndex();
|
||||
result = initialPosition;
|
||||
c = previous32(fText);
|
||||
|
||||
|
||||
// Set up the initial state for the state machine
|
||||
state = START_STATE;
|
||||
row = fRData.getRowIndex(state);
|
||||
@ -1399,129 +1419,95 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
category = 2;
|
||||
mode = RBBI_START;
|
||||
}
|
||||
|
||||
|
||||
if (TRACE) {
|
||||
System.out.println("Handle Prev pos char state category ");
|
||||
}
|
||||
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
//
|
||||
mainLoop: for (;;) {
|
||||
innerBlock: {
|
||||
if (c == DONE32) {
|
||||
// Reached end of input string.
|
||||
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
|
||||
// Either this is the old (ICU 3.2 and earlier) format data which
|
||||
// does not support explicit support for matching {eof}, or
|
||||
// we have already done the {eof} iteration. Now is the time
|
||||
// to unconditionally bail out.
|
||||
if (lookaheadResult < result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
// Ran off start, no match found.
|
||||
// Move one position (towards the start, since we are doing previous.)
|
||||
fText.setIndex(initialPosition);
|
||||
previous32(fText);
|
||||
}
|
||||
break mainLoop;
|
||||
if (c == DONE32) {
|
||||
// Reached end of input string.
|
||||
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
|
||||
// Either this is the old (ICU 3.2 and earlier) format data which
|
||||
// does not support explicit support for matching {eof}, or
|
||||
// we have already done the {eof} iteration. Now is the time
|
||||
// to unconditionally bail out.
|
||||
if (result == initialPosition) {
|
||||
// Ran off start, no match found.
|
||||
// Move one position (towards the start, since we are doing previous.)
|
||||
fText.setIndex(initialPosition);
|
||||
previous32(fText);
|
||||
}
|
||||
mode = RBBI_END;
|
||||
category = 1;
|
||||
break mainLoop;
|
||||
}
|
||||
|
||||
if (mode == RBBI_RUN) {
|
||||
// look up the current character's category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
category = (short) fRData.fTrie.getCodePointValue(c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (TRACE) {
|
||||
System.out.print(" " + fText.getIndex() + " ");
|
||||
if (0x20 <= c && c < 0x7f) {
|
||||
System.out.print(" " + c + " ");
|
||||
} else {
|
||||
System.out.print(" " + Integer.toHexString(c) + " ");
|
||||
}
|
||||
System.out.println(" " + state + " " + category + " ");
|
||||
}
|
||||
|
||||
// State Transition - move machine to its next state
|
||||
mode = RBBI_END;
|
||||
category = 1;
|
||||
}
|
||||
|
||||
if (mode == RBBI_RUN) {
|
||||
// look up the current character's category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
|
||||
// Match found, common case, could have lookahead so we move
|
||||
// on to check it
|
||||
result = fText.getIndex();
|
||||
category = (short) fRData.fTrie.getCodePointValue(c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
|
||||
// Lookahead match is completed. Set the result
|
||||
// accordingly, but only
|
||||
// if no other rule has matched further in the mean
|
||||
// time.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
// TODO: make a stand-alone hard break in a rule work.
|
||||
|
||||
if (lookAheadHardBreak) {
|
||||
break mainLoop;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further.
|
||||
// Continue on.
|
||||
// TODO: junk this feature? I don't think that it's used anywhere.
|
||||
break innerBlock;
|
||||
}
|
||||
// Hit a possible look-ahead match. We are at the
|
||||
// position of the '/'. Remember this position.
|
||||
lookaheadResult = fText.getIndex();
|
||||
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
break innerBlock;
|
||||
}
|
||||
|
||||
// not lookahead...
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
|
||||
// This is a plain (non-look-ahead) accepting state.
|
||||
if (!lookAheadHardBreak) {
|
||||
// Clear out any pending look-ahead matches,
|
||||
// but only if not doing the lookAheadHardBreak option
|
||||
// which needs to force a break no matter what is going
|
||||
// on with the rest of the match, i.e. we can't abandon
|
||||
// a partially completed look-ahead match because
|
||||
// some other rule matched further than the '/' position
|
||||
// in the look-ahead match.
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (TRACE) {
|
||||
System.out.print(" " + fText.getIndex() + " ");
|
||||
if (0x20 <= c && c < 0x7f) {
|
||||
System.out.print(" " + c + " ");
|
||||
} else {
|
||||
System.out.print(" " + Integer.toHexString(c) + " ");
|
||||
}
|
||||
|
||||
} // end of innerBlock. "break innerBlock" in above code comes out here.
|
||||
|
||||
|
||||
System.out.println(" " + state + " " + category + " ");
|
||||
}
|
||||
|
||||
// State Transition - move machine to its next state
|
||||
//
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
|
||||
// Match found, common case, could have lookahead so we move
|
||||
// on to check it
|
||||
result = fText.getIndex();
|
||||
}
|
||||
|
||||
|
||||
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed.
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
result = lookaheadResult;
|
||||
break mainLoop;
|
||||
}
|
||||
}
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int pos = fText.getIndex();
|
||||
fLookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
// Normal loop exit is here
|
||||
break mainLoop;
|
||||
}
|
||||
|
||||
|
||||
// then move iterator position backwards one character
|
||||
//
|
||||
if (mode == RBBI_RUN) {
|
||||
@ -1531,10 +1517,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
mode = RBBI_RUN;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
} // End of the main loop.
|
||||
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
//
|
||||
// If the iterator failed to advance in the match engine, force it ahead by one.
|
||||
@ -1545,12 +1531,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
previous32(fText);
|
||||
result = fText.getIndex();
|
||||
}
|
||||
|
||||
|
||||
fText.setIndex(result);
|
||||
if (TRACE) {
|
||||
System.out.println("Result = " + result);
|
||||
}
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:848a445cb828689cd5bca20bfd321db5503ef66c0a94d929fc108a28d0c5595f
|
||||
size 11754757
|
||||
oid sha256:eb9182edec08706f02236909aaefcbf4c98d29d6415d1a8801633233c74f03fb
|
||||
size 11789631
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a75dfbe25f7671a65bb933aed49a71eb9a923767687625982603c54860478ce7
|
||||
oid sha256:cefefda6f12f61e7dcd7767a7b07b0fea3ca53c2a9b1524f3627e94cad6f3ee0
|
||||
size 90259
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user