ICU-12081 RBBI extensions & Emoji rules. Import rule data to Java from C++, port code changes.

X-SVN-Rev: 38422
This commit is contained in:
Andy Heninger 2016-02-28 19:14:48 +00:00
parent 48214e5b5d
commit b552700cc6
8 changed files with 870 additions and 707 deletions

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2010, International Business Machines Corporation and
* Copyright (c) 2001-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -17,7 +17,7 @@ import com.ibm.icu.impl.Assert;
*/
class RBBINode {
// enum NodeType {
static final int setRef = 0;
static final int uset = 1;
@ -36,7 +36,7 @@ class RBBINode {
static final int opReverse = 14;
static final int opLParen = 15;
static final int nodeTypeLimit = 16; // For Assertion checking only.
static final String [] nodeTypeNames = {
"setRef",
"uset",
@ -56,20 +56,20 @@ class RBBINode {
"opLParen"
};
// enum OpPrecedence {
// enum OpPrecedence {
static final int precZero = 0;
static final int precStart = 1;
static final int precLParen = 2;
static final int precOpOr = 3;
static final int precOpCat = 4;
int fType; // enum NodeType
RBBINode fParent;
RBBINode fLeftChild;
RBBINode fRightChild;
UnicodeSet fInputSet; // For uset nodes only.
int fPrecedence = precZero; // enum OpPrecedence, For binary ops only.
String fText; // Text corresponding to this node.
// May be lazily evaluated when (if) needed
// for some node types.
@ -89,12 +89,17 @@ class RBBINode {
// state transition table.
boolean fLookAheadEnd; // For endMark nodes, set TRUE if
// marking the end of a look-ahead rule.
// marking the end of a look-ahead rule.
boolean fRuleRoot; // True if this node is the root of a rule.
boolean fChainIn; // True if chaining into this rule is allowed
// (no '^' present).
Set<RBBINode> fFirstPosSet; // See Aho DFA table generation algorithm
Set<RBBINode> fLastPosSet; // See Aho.
Set<RBBINode> fLastPosSet; // See Aho.
Set<RBBINode> fFollowPos; // See Aho.
int fSerialNum; // Debugging aids. Each node gets a unique serial number.
static int gLastSerial;
@ -129,6 +134,8 @@ class RBBINode {
fLastPos = other.fLastPos;
fNullable = other.fNullable;
fVal = other.fVal;
fRuleRoot = false;
fChainIn = other.fChainIn;
fFirstPosSet = new HashSet<RBBINode>(other.fFirstPosSet);
fLastPosSet = new HashSet<RBBINode>(other.fLastPosSet);
fFollowPos = new HashSet<RBBINode>(other.fFollowPos);
@ -163,6 +170,8 @@ class RBBINode {
n.fRightChild.fParent = n;
}
}
n.fRuleRoot = this.fRuleRoot;
n.fChainIn = this.fChainIn;
return n;
}
@ -259,8 +268,8 @@ class RBBINode {
}
}
//-------------------------------------------------------------------------
//
// print. Print out a single node, for debugging.
@ -279,7 +288,7 @@ class RBBINode {
RBBINode.printInt(n.fRightChild==null? 0 : n.fRightChild.fSerialNum, 12);
RBBINode.printInt(n.fFirstPos, 12);
RBBINode.printInt(n.fVal, 7);
if (n.fType == varRef) {
System.out.print(" " + n.fText);
}
@ -287,7 +296,7 @@ class RBBINode {
System.out.println("");
}
///CLOVER:ON
// Print a String in a fixed field size.
// Debugging function.
@ -344,7 +353,7 @@ class RBBINode {
if (fLeftChild != null) {
fLeftChild.printTree(false);
}
if (fRightChild != null) {
fRightChild.printTree(false);
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines Corporation and
* others. All Rights Reserved.
* Copyright (c) 2003-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -13,6 +13,8 @@ package com.ibm.icu.text;
* rule parser.
* It is generated by the Perl script "rbbicst.pl" from
* the rule parser state definitions file "rbbirpt.txt".
* @internal
*
*/
class RBBIRuleParseTable
{
@ -29,24 +31,25 @@ class RBBIRuleParseTable
static final short doExprStart = 11;
static final short doLParen = 12;
static final short doNOP = 13;
static final short doOptionEnd = 14;
static final short doOptionStart = 15;
static final short doReverseDir = 16;
static final short doRuleChar = 17;
static final short doRuleError = 18;
static final short doRuleErrorAssignExpr = 19;
static final short doScanUnicodeSet = 20;
static final short doSlash = 21;
static final short doStartAssign = 22;
static final short doStartTagValue = 23;
static final short doStartVariableName = 24;
static final short doTagDigit = 25;
static final short doTagExpectedError = 26;
static final short doTagValue = 27;
static final short doUnaryOpPlus = 28;
static final short doUnaryOpQuestion = 29;
static final short doUnaryOpStar = 30;
static final short doVariableNameExpectedErr = 31;
static final short doNoChain = 14;
static final short doOptionEnd = 15;
static final short doOptionStart = 16;
static final short doReverseDir = 17;
static final short doRuleChar = 18;
static final short doRuleError = 19;
static final short doRuleErrorAssignExpr = 20;
static final short doScanUnicodeSet = 21;
static final short doSlash = 22;
static final short doStartAssign = 23;
static final short doStartTagValue = 24;
static final short doStartVariableName = 25;
static final short doTagDigit = 26;
static final short doTagExpectedError = 27;
static final short doTagValue = 28;
static final short doUnaryOpPlus = 29;
static final short doUnaryOpQuestion = 30;
static final short doUnaryOpStar = 31;
static final short doVariableNameExpectedErr = 32;
static final short kRuleSet_default = 255;
static final short kRuleSet_digit_char = 128;
@ -73,104 +76,112 @@ class RBBIRuleParseTable
fNextChar = nc;
fStateName = sn;
}
}
};
static RBBIRuleTableElement[] gRuleParseStateTable = {
new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0
, new RBBIRuleTableElement(doExprStart, 254, 21, 8, false, "start") // 1
, new RBBIRuleTableElement(doExprStart, 254, 29, 9, false, "start") // 1
, new RBBIRuleTableElement(doNOP, 132, 1,0, true, null ) // 2
, new RBBIRuleTableElement(doExprStart,'$', 80, 90, false, null ) // 3
, new RBBIRuleTableElement(doNOP,'!', 11,0, true, null ) // 4
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 5
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 6
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, null ) // 7
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 8
, new RBBIRuleTableElement(doNOP, 132, 8,0, true, null ) // 9
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 10
, new RBBIRuleTableElement(doNOP,'!', 13,0, true, "rev-option") // 11
, new RBBIRuleTableElement(doReverseDir, 255, 20, 8, false, null ) // 12
, new RBBIRuleTableElement(doOptionStart, 130, 15,0, true, "option-scan1") // 13
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 14
, new RBBIRuleTableElement(doNOP, 129, 15,0, true, "option-scan2") // 15
, new RBBIRuleTableElement(doOptionEnd, 255, 17,0, false, null ) // 16
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 17
, new RBBIRuleTableElement(doNOP, 132, 17,0, true, null ) // 18
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 19
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, "reverse-rule") // 20
, new RBBIRuleTableElement(doRuleChar, 254, 30,0, true, "term") // 21
, new RBBIRuleTableElement(doNOP, 132, 21,0, true, null ) // 22
, new RBBIRuleTableElement(doRuleChar, 131, 30,0, true, null ) // 23
, new RBBIRuleTableElement(doNOP,'[', 86, 30, false, null ) // 24
, new RBBIRuleTableElement(doLParen,'(', 21, 30, true, null ) // 25
, new RBBIRuleTableElement(doNOP,'$', 80, 29, false, null ) // 26
, new RBBIRuleTableElement(doDotAny,'.', 30,0, true, null ) // 27
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 28
, new RBBIRuleTableElement(doCheckVarDef, 255, 30,0, false, "term-var-ref") // 29
, new RBBIRuleTableElement(doNOP, 132, 30,0, true, "expr-mod") // 30
, new RBBIRuleTableElement(doUnaryOpStar,'*', 35,0, true, null ) // 31
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 35,0, true, null ) // 32
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 35,0, true, null ) // 33
, new RBBIRuleTableElement(doNOP, 255, 35,0, false, null ) // 34
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont") // 35
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 36
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 37
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 38
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 39
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 40
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 41
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 42
, new RBBIRuleTableElement(doExprCatOperator,'{', 59,0, true, null ) // 43
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 44
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 45
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 46
, new RBBIRuleTableElement(doSlash,'/', 49,0, true, "look-ahead") // 47
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 48
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-slash") // 49
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 50
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 51
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 52
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 53
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 54
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 55
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 56
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 57
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 58
, new RBBIRuleTableElement(doNOP, 132, 59,0, true, "tag-open") // 59
, new RBBIRuleTableElement(doStartTagValue, 128, 62,0, false, null ) // 60
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 61
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-value") // 62
, new RBBIRuleTableElement(doNOP,'}', 66,0, false, null ) // 63
, new RBBIRuleTableElement(doTagDigit, 128, 62,0, true, null ) // 64
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 65
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-close") // 66
, new RBBIRuleTableElement(doTagValue,'}', 69,0, true, null ) // 67
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 68
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-tag") // 69
, new RBBIRuleTableElement(doNOP, 132, 69,0, true, null ) // 70
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 71
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 72
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 73
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 74
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 75
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 76
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 77
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 78
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 79
, new RBBIRuleTableElement(doStartVariableName,'$', 82,0, true, "scan-var-name") // 80
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 81
, new RBBIRuleTableElement(doNOP, 130, 84,0, true, "scan-var-start") // 82
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 95,0, false, null ) // 83
, new RBBIRuleTableElement(doNOP, 129, 84,0, true, "scan-var-body") // 84
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 85
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 86
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 87
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 88
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 89
, new RBBIRuleTableElement(doNOP, 132, 90,0, true, "assign-or-rule") // 90
, new RBBIRuleTableElement(doStartAssign,'=', 21, 93, true, null ) // 91
, new RBBIRuleTableElement(doNOP, 255, 29, 8, false, null ) // 92
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 93
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 95,0, false, null ) // 94
, new RBBIRuleTableElement(doExit, 255, 95,0, true, "errorDeath") // 95
, new RBBIRuleTableElement(doNoChain,'^', 12, 9, true, null ) // 3
, new RBBIRuleTableElement(doExprStart,'$', 88, 98, false, null ) // 4
, new RBBIRuleTableElement(doNOP,'!', 19,0, true, null ) // 5
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 6
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 7
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, null ) // 8
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 9
, new RBBIRuleTableElement(doNOP, 132, 9,0, true, null ) // 10
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 11
, new RBBIRuleTableElement(doExprStart, 254, 29,0, false, "start-after-caret") // 12
, new RBBIRuleTableElement(doNOP, 132, 12,0, true, null ) // 13
, new RBBIRuleTableElement(doRuleError,'^', 103,0, false, null ) // 14
, new RBBIRuleTableElement(doExprStart,'$', 88, 37, false, null ) // 15
, new RBBIRuleTableElement(doRuleError,';', 103,0, false, null ) // 16
, new RBBIRuleTableElement(doRuleError, 252, 103,0, false, null ) // 17
, new RBBIRuleTableElement(doExprStart, 255, 29,0, false, null ) // 18
, new RBBIRuleTableElement(doNOP,'!', 21,0, true, "rev-option") // 19
, new RBBIRuleTableElement(doReverseDir, 255, 28, 9, false, null ) // 20
, new RBBIRuleTableElement(doOptionStart, 130, 23,0, true, "option-scan1") // 21
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 22
, new RBBIRuleTableElement(doNOP, 129, 23,0, true, "option-scan2") // 23
, new RBBIRuleTableElement(doOptionEnd, 255, 25,0, false, null ) // 24
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 25
, new RBBIRuleTableElement(doNOP, 132, 25,0, true, null ) // 26
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 27
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, "reverse-rule") // 28
, new RBBIRuleTableElement(doRuleChar, 254, 38,0, true, "term") // 29
, new RBBIRuleTableElement(doNOP, 132, 29,0, true, null ) // 30
, new RBBIRuleTableElement(doRuleChar, 131, 38,0, true, null ) // 31
, new RBBIRuleTableElement(doNOP,'[', 94, 38, false, null ) // 32
, new RBBIRuleTableElement(doLParen,'(', 29, 38, true, null ) // 33
, new RBBIRuleTableElement(doNOP,'$', 88, 37, false, null ) // 34
, new RBBIRuleTableElement(doDotAny,'.', 38,0, true, null ) // 35
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 36
, new RBBIRuleTableElement(doCheckVarDef, 255, 38,0, false, "term-var-ref") // 37
, new RBBIRuleTableElement(doNOP, 132, 38,0, true, "expr-mod") // 38
, new RBBIRuleTableElement(doUnaryOpStar,'*', 43,0, true, null ) // 39
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 43,0, true, null ) // 40
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 43,0, true, null ) // 41
, new RBBIRuleTableElement(doNOP, 255, 43,0, false, null ) // 42
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont") // 43
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 44
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 45
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 46
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 47
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 48
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 49
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 50
, new RBBIRuleTableElement(doExprCatOperator,'{', 67,0, true, null ) // 51
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 52
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 53
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 54
, new RBBIRuleTableElement(doSlash,'/', 57,0, true, "look-ahead") // 55
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 56
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-slash") // 57
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 58
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 59
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 60
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 61
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 62
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 63
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 64
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 65
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 66
, new RBBIRuleTableElement(doNOP, 132, 67,0, true, "tag-open") // 67
, new RBBIRuleTableElement(doStartTagValue, 128, 70,0, false, null ) // 68
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 69
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-value") // 70
, new RBBIRuleTableElement(doNOP,'}', 74,0, false, null ) // 71
, new RBBIRuleTableElement(doTagDigit, 128, 70,0, true, null ) // 72
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 73
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-close") // 74
, new RBBIRuleTableElement(doTagValue,'}', 77,0, true, null ) // 75
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 76
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-tag") // 77
, new RBBIRuleTableElement(doNOP, 132, 77,0, true, null ) // 78
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 79
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 80
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 81
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 82
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 83
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 84
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 85
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 86
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 87
, new RBBIRuleTableElement(doStartVariableName,'$', 90,0, true, "scan-var-name") // 88
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 89
, new RBBIRuleTableElement(doNOP, 130, 92,0, true, "scan-var-start") // 90
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 103,0, false, null ) // 91
, new RBBIRuleTableElement(doNOP, 129, 92,0, true, "scan-var-body") // 92
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 93
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 94
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 95
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 96
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 97
, new RBBIRuleTableElement(doNOP, 132, 98,0, true, "assign-or-rule") // 98
, new RBBIRuleTableElement(doStartAssign,'=', 29, 101, true, null ) // 99
, new RBBIRuleTableElement(doNOP, 255, 37, 9, false, null ) // 100
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 101
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 103,0, false, null ) // 102
, new RBBIRuleTableElement(doExit, 255, 103,0, true, "errorDeath") // 103
};
}
};

View File

@ -1,9 +1,9 @@
/*
*******************************************************************************
* Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
* Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.ParsePosition;
@ -19,12 +19,12 @@ import com.ibm.icu.lang.UCharacter;
* There is no public API here.
*/
class RBBIRuleScanner {
private final static int kStackSize = 100; // The size of the state stack for
// rules parsing. Corresponds roughly
// to the depth of parentheses nesting
// that is allowed in the rules.
static class RBBIRuleChar {
int fChar;
boolean fEscaped;
@ -33,7 +33,7 @@ class RBBIRuleScanner {
RBBIRuleBuilder fRB; // The rule builder that we are part of.
int fScanIndex; // Index of current character being processed
// in the rule input string.
int fNextIndex; // Index of the next character, which
@ -43,49 +43,52 @@ class RBBIRuleScanner {
int fCharNum; // Char position within the line.
int fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
// processing.
String fVarName; // $variableName, valid when we've just
// scanned one.
short fStack[] = new short[kStackSize]; // State stack, holds state pushes
int fStackPtr; // and pops as specified in the state
// transition rules.
RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
// during the parse of a rule
int fNodeStackPtr;
boolean fReverseRule; // True if the rule currently being scanned
boolean fReverseRule; // True if the rule currently being scanned
// is a reverse direction rule (if it
// starts with a '!')
boolean fLookAheadRule; // True if the rule includes a '/'
boolean fLookAheadRule; // True if the rule includes a '/'
// somewhere within it.
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
boolean fNoChainInRule; // True if the current rule starts with a '^'.
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
// the sets created while parsing rules.
// The key is the string used for creating
// the set.
UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
// the scanning of RBBI rules. The
// indices for these are assigned by the
// perl script that builds the state tables.
// See rbbirpt.h.
int fRuleNum; // Counts each rule as it is scanned.
int fOptionStart; // Input index of start of a !!option
// keyword, while being scanned.
static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
@ -94,8 +97,8 @@ class RBBIRuleScanner {
static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
static private String kAny = "any";
//----------------------------------------------------------------------------------------
//
@ -139,6 +142,12 @@ class RBBIRuleScanner {
fRuleNum++;
break;
case RBBIRuleParseTable.doNoChain:
// Scanned a '^' while on the rule start state.
fNoChainInRule = true;
break;
case RBBIRuleParseTable.doExprOrOperator: {
fixOpStack(RBBINode.precOpCat);
RBBINode operandNode = fNodeStack[fNodeStackPtr--];
@ -241,11 +250,11 @@ class RBBIRuleScanner {
printNodeStack("end of rule");
}
Assert.assrt(fNodeStackPtr == 1);
RBBINode thisRule = fNodeStack[fNodeStackPtr];
// If this rule includes a look-ahead '/', add an endMark node to the
// expression tree.
if (fLookAheadRule) {
RBBINode thisRule = fNodeStack[fNodeStackPtr];
RBBINode endNode = pushNewNode(RBBINode.endMark);
RBBINode catNode = pushNewNode(RBBINode.opCat);
fNodeStackPtr -= 2;
@ -254,8 +263,24 @@ class RBBIRuleScanner {
fNodeStack[fNodeStackPtr] = catNode;
endNode.fVal = fRuleNum;
endNode.fLookAheadEnd = true;
thisRule = catNode;
// TODO: Disable chaining out of look-ahead (hard break) rules.
// The break on rule match is forced, so there is no point in building up
// the state table to chain into another rule for a longer match.
}
// Mark this node as being the root of a rule.
thisRule.fRuleRoot = true;
// Flag if chaining into this rule is wanted.
//
if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
thisRule.fChainIn = true;
}
// All rule expressions are ORed together.
// The ';' that terminates an expression really just functions as a
// '|' with
@ -269,12 +294,12 @@ class RBBIRuleScanner {
int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
if (fRB.fTreeRoots[destRules] != null) {
// This is not the first rule encounted.
// This is not the first rule encountered.
// OR previous stuff (from *destRules)
// with the current rule expression (on the Node Stack)
// with the resulting OR expression going to *destRules
//
RBBINode thisRule = fNodeStack[fNodeStackPtr];
thisRule = fNodeStack[fNodeStackPtr];
RBBINode prevRules = fRB.fTreeRoots[destRules];
RBBINode orNode = pushNewNode(RBBINode.opOr);
orNode.fLeftChild = prevRules;
@ -289,6 +314,7 @@ class RBBIRuleScanner {
}
fReverseRule = false; // in preparation for the next rule.
fLookAheadRule = false;
fNoChainInRule = false;
fNodeStackPtr = 0;
}
break;

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2002-2009, International Business Machines
* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -28,9 +28,9 @@ import com.ibm.icu.lang.UProperty;
// There is no user-visible public API here.
//
class RBBITableBuilder {
//
// RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
// one for each state.
@ -58,8 +58,8 @@ class RBBITableBuilder {
// symbol.
}
}
private RBBIRuleBuilder fRB;
private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
// for the parse tree to operate on.
@ -84,7 +84,7 @@ class RBBITableBuilder {
//-----------------------------------------------------------------------------
//
// RBBITableBuilder::build - This is the main function for building the DFA state transition
@ -109,11 +109,11 @@ class RBBITableBuilder {
}
//
// If the rules contained any references to {bof}
// If the rules contained any references to {bof}
// add a {bof} <cat> <former root of tree> to the
// tree. Means that all matches must start out with the
// tree. Means that all matches must start out with the
// {bof} fake character.
//
//
if (fRB.fSetBuilder.sawBOF()) {
RBBINode bofTop = new RBBINode(RBBINode.opCat);
RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
@ -361,6 +361,25 @@ class RBBITableBuilder {
}
}
//-----------------------------------------------------------------------------
//
//   addRuleRootNodes    Collect the rule-root nodes of a parse tree.
//
//                       Walks the tree starting at 'node' and appends to
//                       'dest' every node whose fRuleRoot flag is set.
//                       Left subtrees are visited before right subtrees.
//                       The walk does not descend below a rule root: rules
//                       cannot nest, so no child of a rule root can itself
//                       be a rule root.
//
//                       dest   list receiving the rule root nodes.
//                       node   root of the (sub)tree to scan; may be null,
//                              in which case nothing is added.
//
//-----------------------------------------------------------------------------
void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
    RBBINode current = node;
    // Handle the left subtree recursively, then continue down the right
    // child in this same loop rather than with a second recursive call.
    while (current != null && !current.fRuleRoot) {
        addRuleRootNodes(dest, current.fLeftChild);
        current = current.fRightChild;
    }
    // The loop stops either at the end of the chain (null) or on a rule root.
    if (current != null) {
        dest.add(current);
    }
}
//-----------------------------------------------------------------------------
//
@ -379,17 +398,21 @@ class RBBITableBuilder {
// get a list all leaf nodes
tree.findNodes(leafNodes, RBBINode.leafChar);
// Get all nodes that can be the start a match, which is FirstPosition()
// of the portion of the tree corresponding to user-written rules.
// See the tree description in bofFixup().
RBBINode userRuleRoot = tree;
if (fRB.fSetBuilder.sawBOF()) {
userRuleRoot = tree.fLeftChild.fRightChild;
}
Assert.assrt(userRuleRoot != null);
Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;
// Collect all leaf nodes that can start matches for rules
// with inbound chaining enabled, which is the union of the
// firstPosition sets from each of the rule root nodes.
// Iteratate over all leaf nodes,
List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
addRuleRootNodes(ruleRootNodes, tree);
Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
for (RBBINode node: ruleRootNodes) {
if (node.fChainIn) {
matchStartNodes.addAll(node.fFirstPosSet);
}
}
// Iterate over all leaf nodes,
//
for (RBBINode tNode : leafNodes) {
RBBINode endNode = null;
@ -461,9 +484,9 @@ class RBBITableBuilder {
//
// The parse tree looks like this ...
// fTree root --. <cat>
// / \
// / \
// <cat> <#end node>
// / \
// / \
// <bofNode> rest
// of tree
//
@ -477,7 +500,7 @@ class RBBITableBuilder {
// (excluding the fake bofNode)
// We want the nodes that can start a match in the
// part labeled "rest of tree"
//
//
Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
for (RBBINode startNode : matchStartNodes) {
if (startNode.fType != RBBINode.leafChar) {
@ -489,7 +512,7 @@ class RBBITableBuilder {
// explicitly written into a rule.
// Add everything from the followPos set of this node to the
// followPos set of the fake bofNode at the start of the tree.
//
//
bofNode.fFollowPos.addAll(startNode.fFollowPos);
}
}
@ -705,7 +728,7 @@ class RBBITableBuilder {
// The RBBI runtime uses an array of {sets of status values} that can
// be returned for boundaries. Each accepting state that has non-zero
// status includes an index into this array. The format of the array
// is
// is
// Num of status values in group 1
// status val
// status val
@ -718,7 +741,7 @@ class RBBITableBuilder {
//
//
//-----------------------------------------------------------------------------
void mergeRuleStatusVals() {
//
// The basic outline of what happens here is this...
@ -731,14 +754,14 @@ class RBBITableBuilder {
// add the tag list for this state to the global list.
//
int n;
// Pre-load a single tag of {0} into the table.
// We will need this as a default, for rule sets with no explicit tagging,
// or with explicit tagging of {0}.
if (fRB.fRuleStatusVals.size() == 0) {
fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group
fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero
SortedSet<Integer> s0 = new TreeSet<Integer>();
Integer izero = Integer.valueOf(0);
fRB.fStatusSets.put(s0, izero);
@ -756,17 +779,17 @@ class RBBITableBuilder {
if (arrayIndexI == null) {
// This is the first encounter of this set of status values.
// Add them to the statusSets map, This map associates
// the set of status values with an index in the runtime status
// the set of status values with an index in the runtime status
// values array.
arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());
fRB.fStatusSets.put(statusVals, arrayIndexI);
// Add the new set of status values to the vector of values that
// will eventually become the array used by the runtime engine.
fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));
fRB.fRuleStatusVals.addAll(statusVals);
}
// Save the runtime array index back into the state descriptor.
sd.fTagsIdx = arrayIndexI.intValue();
}
@ -784,7 +807,7 @@ class RBBITableBuilder {
// for each node in the tree.
//
//-----------------------------------------------------------------------------
void printPosSets(RBBINode n) {
if (n==null) {
return;
@ -804,7 +827,7 @@ class RBBITableBuilder {
printPosSets(n.fLeftChild);
printPosSets(n.fRightChild);
}
@ -860,7 +883,7 @@ class RBBITableBuilder {
// See struct RBBIStateTable in ICU4C, common/rbbidata.h
//
//-----------------------------------------------------------------------------
short [] exportTable() {
int state;
int col;
@ -870,18 +893,18 @@ class RBBITableBuilder {
}
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
fDStates.size() < 0x7fff);
fDStates.size() < 0x7fff);
int numStates = fDStates.size();
// Size of the table, in shorts.
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
int tableSize = getTableSize() / 2;
short [] table = new short[tableSize];
//
// Fill in the header fields.
// Annoying because they really want to be ints, not shorts.
@ -893,7 +916,7 @@ class RBBITableBuilder {
// RBBIStateTable.fRowLen
table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16);
table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
// RBBIStateTable.fFlags
int flags = 0;
if (fRB.fLookAheadHardBreak) {
@ -904,7 +927,7 @@ class RBBITableBuilder {
}
table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16);
table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
for (state=0; state<numStates; state++) {
RBBIStateDescriptor sd = fDStates.get(state);
@ -928,14 +951,14 @@ class RBBITableBuilder {
// printSet Debug function. Print the contents of a set of Nodes
//
//-----------------------------------------------------------------------------
void printSet(Collection<RBBINode> s) {
    // Debug aid: write the serial number of every node in the collection on
    // one line of standard output, then terminate the line.
    // NOTE(review): the literal 8 is presumably a field width for printInt - confirm.
    for (RBBINode node : s) {
        int serialNumber = node.fSerialNum;
        RBBINode.printInt(serialNumber, 8);
    }
    System.out.println();
}
//-----------------------------------------------------------------------------
@ -943,7 +966,7 @@ class RBBITableBuilder {
// printStates Debug Function. Dump the fully constructed state transition table.
//
//-----------------------------------------------------------------------------
void printStates() {
int c; // input "character"
int n; // state number
@ -964,7 +987,7 @@ class RBBITableBuilder {
RBBIStateDescriptor sd = fDStates.get(n);
RBBINode.printInt(n, 5);
System.out.print(" | ");
RBBINode.printInt(sd.fAccepting, 3);
RBBINode.printInt(sd.fLookAhead, 4);
RBBINode.printInt(sd.fTagsIdx, 6);
@ -976,7 +999,7 @@ class RBBITableBuilder {
}
System.out.print("\n\n");
}
@ -985,7 +1008,7 @@ class RBBITableBuilder {
// printRuleStatusTable Debug Function. Dump the common rule status table
//
//-----------------------------------------------------------------------------
void printRuleStatusTable() {
int thisRecord = 0;
int nextRecord = 0;
@ -1007,7 +1030,7 @@ class RBBITableBuilder {
}
System.out.print("\n\n");
}
}

View File

@ -30,17 +30,17 @@ import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
/**
* Rule Based Break Iterator
* Rule Based Break Iterator
* This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
*
*
* @stable ICU 2.0
*/
public class RuleBasedBreakIterator extends BreakIterator {
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
/**
* private constructor
*/
private RuleBasedBreakIterator() {
@ -51,14 +51,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
/**
* Create a break iterator from a precompiled set of break rules.
*
*
* Creating a break iterator from the binary rules is much faster than
* creating one from source rules.
*
* creating one from source rules.
*
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
* Binary break iterator rules are not guaranteed to be compatible between
* different versions of ICU.
*
*
* @param is an input stream supplying the compiled binary rules.
* @throws IOException if there is an error while reading the rules from the InputStream.
* @see #compileRules(String, OutputStream)
@ -67,7 +67,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
    // Start from an empty iterator (private constructor), then attach the rule
    // data wrapper built from the bytes read out of the supplied stream.
    // NOTE(review): the helper's name says it also closes the stream - confirm in ICUBinary.
    RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
    iterator.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
    // Exactly one return: a second, identical return would be an unreachable statement.
    return iterator;
}
/**
@ -129,7 +129,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
{
RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
if (fText != null) {
result.fText = (CharacterIterator)(fText.clone());
result.fText = (CharacterIterator)(fText.clone());
}
return result;
}
@ -151,15 +151,15 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
return false;
}
if (fRData != null && other.fRData != null &&
if (fRData != null && other.fRData != null &&
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
return false;
}
if (fText == null && other.fText == null) {
return true;
return true;
}
if (fText == null || other.fText == null) {
return false;
return false;
}
return fText.equals(other.fText);
}
@ -188,13 +188,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
@Override
public int hashCode()
{
    // equals() accepts iterators whose rule data is null on both sides, so
    // hashCode() must not throw for such an instance; use a fixed value instead.
    if (fRData == null) {
        return 0;
    }
    // Otherwise hash on the rule source text, the same field equals() compares.
    return fRData.fRuleSource.hashCode();
}
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
// of user text. A variable with this enum type keeps track of where we
// are. The state machine only fetches user text input while in RUN mode.
@ -206,14 +206,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
* The character iterator through which this BreakIterator accesses the text.
*/
private CharacterIterator fText = new java.text.StringCharacterIterator("");
/**
* The rule data for this BreakIterator instance. Package private.
*/
RBBIDataWrapper fRData;
/*
* Index of the Rule {tag} values for the most recent match.
* Index of the Rule {tag} values for the most recent match.
*/
private int fLastRuleStatusIndex;
@ -245,18 +245,18 @@ public class RuleBasedBreakIterator extends BreakIterator {
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
/**
* What kind of break iterator this is. Set to KIND_LINE by default,
* What kind of break iterator this is. Set to KIND_LINE by default,
* since this produces sensible output.
*/
private int fBreakType = KIND_LINE;
/**
* The "default" break engine - just skips over ranges of dictionary words,
* producing no breaks. Should only be used if characters need to be handled
* by a dictionary but we have no dictionary implementation for them.
*/
private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
@ -271,8 +271,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
private int fPositionInCache;
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
new ConcurrentHashMap<Integer, LanguageBreakEngine>();
/**
* Dumps caches and performs other actions associated with a complete change
@ -293,18 +293,18 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
@Deprecated
public void dump() {
    // Debug aid: delegate to the rule data wrapper's own dump(), once.
    this.fRData.dump();
}
/**
* Compile a set of source break rules into the binary state tables used
* by the break iterator engine. Creating a break iterator from precompiled
* rules is much faster than creating one from source rules.
*
*
* Binary break rules are not guaranteed to be compatible between different
* versions of ICU.
*
*
*
*
* @param rules The source form of the break rules
* @param ruleBinary An output stream to receive the compiled rules.
* @throws IOException If there is an error writing the output.
@ -314,7 +314,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
// Thin public entry point: all of the work is delegated to
// RBBIRuleBuilder.compileRules(). Nothing is caught here, so an IOException
// raised while writing to ruleBinary propagates to the caller.
RBBIRuleBuilder.compileRules(rules, ruleBinary);
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
@ -337,7 +337,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
fText.first();
return fText.getIndex();
}
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
@ -364,7 +364,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
fText.setIndex(pos);
return pos;
}
/**
* Advances the iterator either forward or backward the specified number of steps.
* Negative values move backward, and positive values move forward. This is
@ -387,7 +387,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
return result;
}
/**
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
@ -424,11 +424,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
* process.
*/
private int checkDictionary(int startPos, int endPos, boolean reverse) {
// Reset the old break cache first.
reset();
// note: code segment below assumes that dictionary chars are in the
// note: code segment below assumes that dictionary chars are in the
// startPos-endPos range
// value returned should be next character in sequence
if ((endPos - startPos) <= 1) {
@ -465,7 +465,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
} while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
// Back up to the last dictionary character
rangeEnd = fText.getIndex();
if (c == CharacterIteration.DONE32) {
@ -497,7 +497,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
category = (short)fRData.fTrie.getCodePointValue(c);
}
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
// any breaks within the span.
@ -518,11 +518,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (current >= rangeEnd) {
break;
}
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
lbe = getLanguageBreakEngine(c);
// Ask the language object if there are any breaks. It will leave the text
// pointer on the other side of its range, ready to search for the next one.
if (lbe != null) {
@ -530,12 +530,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
assert fText.getIndex() > startingIdx;
}
// Reload the loop variables for the next go-round
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
}
// If we found breaks, build a new break cache. The first and last entries must
// be the original starting and ending position.
if (foundBreakCount > 0) {
@ -549,15 +549,15 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (endPos > breaks.peek()) {
breaks.push(endPos);
}
// TODO: get rid of this array, use results from the deque directly
fCachedBreakPositions = new int[breaks.size()];
int i = 0;
while (breaks.size() > 0) {
fCachedBreakPositions[i++] = breaks.pollLast();
}
// If there are breaks, then by definition, we are replacing the original
// proposed break by one of the breaks we found. Use following() and
// preceding() to do the work. They should never recurse in this case.
@ -573,10 +573,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
// to the original proposed break.
fText.setIndex(reverse ? startPos : endPos);
return (reverse ? startPos : endPos);
}
/**
* Moves the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
@ -585,7 +585,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
public int previous() {
int result;
int startPos;
CharacterIterator text = getText();
fLastStatusIndexValid = false;
@ -705,7 +705,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
return text.getIndex();
}
}
private int rulesFollowing(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
@ -744,7 +744,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
if (fRData.fSFTable != null) {
// No Safe point reverse table, but there is a safe pt forward table.
//
//
fText.setIndex(offset);
previous32(fText);
// handle next will give result >= offset
@ -820,7 +820,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
return text.getIndex();
}
}
private int rulesPreceding(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
@ -1002,7 +1002,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
/**
* Get the status (tag) values from the break rule(s) that determined the most
* Get the status (tag) values from the break rule(s) that determined the most
* recently returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
@ -1014,8 +1014,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
* the output will be truncated to the available length. No exception
* will be thrown.
*
* @param fillInArray an array to be filled in with the status values.
* @return The number of rule status values from rules that determined
* @param fillInArray an array to be filled in with the status values.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* In the event that the array is too small, the return value
* is the total number of status values that were available,
@ -1026,7 +1026,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
public int getRuleStatusVec(int[] fillInArray) {
makeRuleStatusValid();
int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
if (fillInArray != null) {
if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i<numToCopy; i++) {
fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
@ -1079,8 +1079,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
ICUDebug.value(RBBI_DEBUG_ARG) : null;
private LanguageBreakEngine getLanguageBreakEngine(int c) {
// We have a dictionary character.
@ -1098,7 +1098,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Fold them together for mapping from script -> engine.
script = UScript.HAN;
}
LanguageBreakEngine eng = fBreakEngines.get(script);
/*
if (eng != null && !eng.handles(c, fBreakType)) {
@ -1158,15 +1158,60 @@ public class RuleBasedBreakIterator extends BreakIterator {
return eng;
}
private static final int kMaxLookaheads = 8;
/**
 * Small fixed-capacity table associating an integer key with a text position.
 * It holds at most kMaxLookaheads entries, kept in two parallel arrays and
 * searched linearly. Slots [0, fUsedSlotLimit) are in use.
 */
private static class LookAheadResults {
    /** One past the index of the last slot currently in use. */
    int fUsedSlotLimit = 0;
    /** Recorded positions; fPositions[i] belongs to fKeys[i]. */
    int[] fPositions = new int[kMaxLookaheads];
    /** Keys of the recorded entries. */
    int[] fKeys = new int[kMaxLookaheads];

    /**
     * Look up the position recorded for a key.
     *
     * @param key the key to search for
     * @return the recorded position; if the key is absent an assertion fails,
     *         or -1 is returned when assertions are disabled
     */
    int getPosition(int key) {
        int slot = 0;
        while (slot < fUsedSlotLimit) {
            if (fKeys[slot] == key) {
                return fPositions[slot];
            }
            ++slot;
        }
        assert(false);
        return -1;
    }

    /**
     * Record a position for a key, replacing any position already stored for it.
     *
     * @param key      the key to record
     * @param position the position to associate with the key
     */
    void setPosition(int key, int position) {
        // Existing entry for this key: update it in place.
        int slot = 0;
        while (slot < fUsedSlotLimit) {
            if (fKeys[slot] == key) {
                fPositions[slot] = position;
                return;
            }
            ++slot;
        }
        // No entry found, so slot == fUsedSlotLimit, the first free slot.
        if (slot >= kMaxLookaheads) {
            // Table is full. Flag it when assertions are on; otherwise
            // fall back to overwriting the final slot.
            assert(false);
            slot = kMaxLookaheads - 1;
        }
        fKeys[slot] = key;
        fPositions[slot] = position;
        assert(fUsedSlotLimit == slot);
        fUsedSlotLimit = slot + 1;
    }

    /** Forget all recorded entries; the arrays themselves are reused. */
    void reset() {
        fUsedSlotLimit = 0;
    }
}
private LookAheadResults fLookAheadMatches = new LookAheadResults();
/**
* The State Machine Engine for moving forward is here.
* This function is the heart of the RBBI run time engine.
*
*
* @param stateTable
* @return the new iterator position
*
*
* A note on supplementary characters and the position of underlying
* Java CharacterIterator: Normally, a character iterator is positioned at
* the char most recently returned by next(). Within this function, when
@ -1201,7 +1246,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Set the initial state for the state machine
int state = START_STATE;
int row = fRData.getRowIndex(state);
int row = fRData.getRowIndex(state);
short category = 3;
int flagsState = fRData.getStateTableFlags(stateTable);
int mode = RBBI_RUN;
@ -1209,14 +1254,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
category = 2;
mode = RBBI_START;
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
}
int lookaheadStatus = 0;
int lookaheadTagIdx = 0;
int lookaheadResult = 0;
fLookAheadMatches.reset();
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
@ -1226,16 +1269,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
if (lookaheadResult > result) {
// We ran off the end of the string with a pending
// look-ahead match.
// Treat this as if the look-ahead condition had been
// met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
}
break;
}
// Run the loop one last time with the fake end-of-input character category
@ -1252,7 +1285,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// which column in the state table to look at.
//
category = (short) trie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
@ -1265,15 +1298,15 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
// Advance to the next character.
// Advance to the next character.
// If this is a beginning-of-input loop iteration, don't advance.
// The next iteration will be processing the first real input character.
c = (int)text.next();
c = (int)text.next();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(text, c);
}
@ -1284,7 +1317,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up a state transition in the state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case
@ -1299,40 +1332,30 @@ public class RuleBasedBreakIterator extends BreakIterator {
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
}
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
if (lookaheadStatus != 0
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
// TODO: make a standalone hard break in a rule work.
if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
text.setIndex(result);
return result;
}
// Look-ahead completed, but other rules may match further. Continue on.
// TODO: junk this feature? I don't think it's used anywhere.
continue;
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
text.setIndex(lookaheadResult);
return lookaheadResult;
}
}
lookaheadResult = text.getIndex();
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
// We want the beginning of it.
lookaheadResult--;
pos--;
}
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
continue;
fLookAheadMatches.setPosition(rule, pos);
}
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relevant. Clear out the pending lookahead status.
lookaheadStatus = 0;
}
} // End of state machine main loop
// The state machine is done. Check whether it found a match...
@ -1340,7 +1363,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// If the iterator failed to advance in the match engine force it ahead by one.
// This indicates a defect in the break rules, which should always match
// at least one character.
if (result == initialPosition) {
if (TRACE) {
System.out.println("Iterator did not move. Advancing by 1.");
@ -1365,31 +1388,28 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (fText == null || stateTable == null) {
return 0;
}
int state;
int category = 0;
int mode;
int row;
int row;
int c;
int lookaheadStatus = 0;
int result = 0;
int initialPosition = 0;
int lookaheadResult = 0;
boolean lookAheadHardBreak =
(fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
fLookAheadMatches.reset();
// handlePrevious() never gets the rule status.
// Flag the status as invalid; if the user ever asks for status, we will need
// to back up, then re-find the break position using handleNext(), which does
// get the status value.
fLastStatusIndexValid = false;
fLastRuleStatusIndex = 0;
// set up the starting char
initialPosition = fText.getIndex();
result = initialPosition;
c = previous32(fText);
// Set up the initial state for the state machine
state = START_STATE;
row = fRData.getRowIndex(state);
@ -1399,129 +1419,95 @@ public class RuleBasedBreakIterator extends BreakIterator {
category = 2;
mode = RBBI_START;
}
if (TRACE) {
System.out.println("Handle Prev pos char state category ");
}
// loop until we reach the beginning of the text or transition to state 0
//
mainLoop: for (;;) {
innerBlock: {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
// Either this is the old (ICU 3.2 and earlier) format data which
// does not support explicit support for matching {eof}, or
// we have already done the {eof} iteration. Now is the time
// to unconditionally bail out.
if (lookaheadResult < result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off start, no match found.
// Move one position (towards the start, since we are doing previous.)
fText.setIndex(initialPosition);
previous32(fText);
}
break mainLoop;
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
// Either this is the old (ICU 3.2 and earlier) format data which
// does not support explicit support for matching {eof}, or
// we have already done the {eof} iteration. Now is the time
// to unconditionally bail out.
if (result == initialPosition) {
// Ran off start, no match found.
// Move one position (towards the start, since we are doing previous.)
fText.setIndex(initialPosition);
previous32(fText);
}
mode = RBBI_END;
category = 1;
break mainLoop;
}
if (mode == RBBI_RUN) {
// look up the current character's category, which tells us
// which column in the state table to look at.
//
category = (short) fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
}
if (TRACE) {
System.out.print(" " + fText.getIndex() + " ");
if (0x20 <= c && c < 0x7f) {
System.out.print(" " + c + " ");
} else {
System.out.print(" " + Integer.toHexString(c) + " ");
}
System.out.println(" " + state + " " + category + " ");
}
// State Transition - move machine to its next state
mode = RBBI_END;
category = 1;
}
if (mode == RBBI_RUN) {
// look up the current character's category, which tells us
// which column in the state table to look at.
//
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move
// on to check it
result = fText.getIndex();
category = (short) fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
if (lookaheadStatus != 0
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
// Lookahead match is completed. Set the result
// accordingly, but only
// if no other rule has matched further in the mean
// time.
result = lookaheadResult;
lookaheadStatus = 0;
// TODO: make a stand-alone hard break in a rule work.
if (lookAheadHardBreak) {
break mainLoop;
}
// Look-ahead completed, but other rules may match further.
// Continue on.
// TODO: junk this feature? I don't think that it's used anywhere.
break innerBlock;
}
// Hit a possible look-ahead match. We are at the
// position of the '/'. Remember this position.
lookaheadResult = fText.getIndex();
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
break innerBlock;
}
// not lookahead...
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
// This is a plain (non-look-ahead) accepting state.
if (!lookAheadHardBreak) {
// Clear out any pending look-ahead matches,
// but only if not doing the lookAheadHardBreak option
// which needs to force a break no matter what is going
// on with the rest of the match, i.e. we can't abandon
// a partially completed look-ahead match because
// some other rule matched further than the '/' position
// in the look-ahead match.
lookaheadStatus = 0;
}
}
if (TRACE) {
System.out.print(" " + fText.getIndex() + " ");
if (0x20 <= c && c < 0x7f) {
System.out.print(" " + c + " ");
} else {
System.out.print(" " + Integer.toHexString(c) + " ");
}
} // end of innerBlock. "break innerBlock" in above code comes out here.
System.out.println(" " + state + " " + category + " ");
}
// State Transition - move machine to its next state
//
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move
// on to check it
result = fText.getIndex();
}
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
// Lookahead match is completed.
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
result = lookaheadResult;
break mainLoop;
}
}
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = fText.getIndex();
fLookAheadMatches.setPosition(rule, pos);
}
if (state == STOP_STATE) {
// Normal loop exit is here
break mainLoop;
}
// then move iterator position backwards one character
//
if (mode == RBBI_RUN) {
@ -1531,10 +1517,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
mode = RBBI_RUN;
}
}
} // End of the main loop.
// The state machine is done. Check whether it found a match...
//
// If the iterator failed to advance in the match engine, force it ahead by one.
@ -1545,12 +1531,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
previous32(fText);
result = fText.getIndex();
}
fText.setIndex(result);
if (TRACE) {
System.out.println("Result = " + result);
}
return result;
}
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:848a445cb828689cd5bca20bfd321db5503ef66c0a94d929fc108a28d0c5595f
size 11754757
oid sha256:eb9182edec08706f02236909aaefcbf4c98d29d6415d1a8801633233c74f03fb
size 11789631

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a75dfbe25f7671a65bb933aed49a71eb9a923767687625982603c54860478ce7
oid sha256:cefefda6f12f61e7dcd7767a7b07b0fea3ca53c2a9b1524f3627e94cad6f3ee0
size 90259