ICU-12081 RBBI extensions & Emoji rules. Import rule data to Java from C++, port code changes.
X-SVN-Rev: 38422
This commit is contained in:
parent
48214e5b5d
commit
b552700cc6
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2001-2010, International Business Machines Corporation and
|
||||
* Copyright (c) 2001-2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
@ -91,6 +91,11 @@ class RBBINode {
|
||||
boolean fLookAheadEnd; // For endMark nodes, set TRUE if
|
||||
// marking the end of a look-ahead rule.
|
||||
|
||||
boolean fRuleRoot; // True if this node is the root of a rule.
|
||||
boolean fChainIn; // True if chaining into this rule is allowed
|
||||
// (no '^' present).
|
||||
|
||||
|
||||
Set<RBBINode> fFirstPosSet; // See Aho DFA table generation algorithm
|
||||
Set<RBBINode> fLastPosSet; // See Aho.
|
||||
Set<RBBINode> fFollowPos; // See Aho.
|
||||
@ -129,6 +134,8 @@ class RBBINode {
|
||||
fLastPos = other.fLastPos;
|
||||
fNullable = other.fNullable;
|
||||
fVal = other.fVal;
|
||||
fRuleRoot = false;
|
||||
fChainIn = other.fChainIn;
|
||||
fFirstPosSet = new HashSet<RBBINode>(other.fFirstPosSet);
|
||||
fLastPosSet = new HashSet<RBBINode>(other.fLastPosSet);
|
||||
fFollowPos = new HashSet<RBBINode>(other.fFollowPos);
|
||||
@ -163,6 +170,8 @@ class RBBINode {
|
||||
n.fRightChild.fParent = n;
|
||||
}
|
||||
}
|
||||
n.fRuleRoot = this.fRuleRoot;
|
||||
n.fChainIn = this.fChainIn;
|
||||
return n;
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
* Copyright (c) 2003-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
@ -13,6 +13,8 @@ package com.ibm.icu.text;
|
||||
* rule parser.
|
||||
* It is generated by the Perl script "rbbicst.pl" from
|
||||
* the rule parser state definitions file "rbbirpt.txt".
|
||||
* @internal
|
||||
*
|
||||
*/
|
||||
class RBBIRuleParseTable
|
||||
{
|
||||
@ -29,24 +31,25 @@ class RBBIRuleParseTable
|
||||
static final short doExprStart = 11;
|
||||
static final short doLParen = 12;
|
||||
static final short doNOP = 13;
|
||||
static final short doOptionEnd = 14;
|
||||
static final short doOptionStart = 15;
|
||||
static final short doReverseDir = 16;
|
||||
static final short doRuleChar = 17;
|
||||
static final short doRuleError = 18;
|
||||
static final short doRuleErrorAssignExpr = 19;
|
||||
static final short doScanUnicodeSet = 20;
|
||||
static final short doSlash = 21;
|
||||
static final short doStartAssign = 22;
|
||||
static final short doStartTagValue = 23;
|
||||
static final short doStartVariableName = 24;
|
||||
static final short doTagDigit = 25;
|
||||
static final short doTagExpectedError = 26;
|
||||
static final short doTagValue = 27;
|
||||
static final short doUnaryOpPlus = 28;
|
||||
static final short doUnaryOpQuestion = 29;
|
||||
static final short doUnaryOpStar = 30;
|
||||
static final short doVariableNameExpectedErr = 31;
|
||||
static final short doNoChain = 14;
|
||||
static final short doOptionEnd = 15;
|
||||
static final short doOptionStart = 16;
|
||||
static final short doReverseDir = 17;
|
||||
static final short doRuleChar = 18;
|
||||
static final short doRuleError = 19;
|
||||
static final short doRuleErrorAssignExpr = 20;
|
||||
static final short doScanUnicodeSet = 21;
|
||||
static final short doSlash = 22;
|
||||
static final short doStartAssign = 23;
|
||||
static final short doStartTagValue = 24;
|
||||
static final short doStartVariableName = 25;
|
||||
static final short doTagDigit = 26;
|
||||
static final short doTagExpectedError = 27;
|
||||
static final short doTagValue = 28;
|
||||
static final short doUnaryOpPlus = 29;
|
||||
static final short doUnaryOpQuestion = 30;
|
||||
static final short doUnaryOpStar = 31;
|
||||
static final short doVariableNameExpectedErr = 32;
|
||||
|
||||
static final short kRuleSet_default = 255;
|
||||
static final short kRuleSet_digit_char = 128;
|
||||
@ -73,104 +76,112 @@ class RBBIRuleParseTable
|
||||
fNextChar = nc;
|
||||
fStateName = sn;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static RBBIRuleTableElement[] gRuleParseStateTable = {
|
||||
new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0
|
||||
, new RBBIRuleTableElement(doExprStart, 254, 21, 8, false, "start") // 1
|
||||
, new RBBIRuleTableElement(doExprStart, 254, 29, 9, false, "start") // 1
|
||||
, new RBBIRuleTableElement(doNOP, 132, 1,0, true, null ) // 2
|
||||
, new RBBIRuleTableElement(doExprStart,'$', 80, 90, false, null ) // 3
|
||||
, new RBBIRuleTableElement(doNOP,'!', 11,0, true, null ) // 4
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 5
|
||||
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 6
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, null ) // 7
|
||||
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 8
|
||||
, new RBBIRuleTableElement(doNOP, 132, 8,0, true, null ) // 9
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 10
|
||||
, new RBBIRuleTableElement(doNOP,'!', 13,0, true, "rev-option") // 11
|
||||
, new RBBIRuleTableElement(doReverseDir, 255, 20, 8, false, null ) // 12
|
||||
, new RBBIRuleTableElement(doOptionStart, 130, 15,0, true, "option-scan1") // 13
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 14
|
||||
, new RBBIRuleTableElement(doNOP, 129, 15,0, true, "option-scan2") // 15
|
||||
, new RBBIRuleTableElement(doOptionEnd, 255, 17,0, false, null ) // 16
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 17
|
||||
, new RBBIRuleTableElement(doNOP, 132, 17,0, true, null ) // 18
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 19
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, "reverse-rule") // 20
|
||||
, new RBBIRuleTableElement(doRuleChar, 254, 30,0, true, "term") // 21
|
||||
, new RBBIRuleTableElement(doNOP, 132, 21,0, true, null ) // 22
|
||||
, new RBBIRuleTableElement(doRuleChar, 131, 30,0, true, null ) // 23
|
||||
, new RBBIRuleTableElement(doNOP,'[', 86, 30, false, null ) // 24
|
||||
, new RBBIRuleTableElement(doLParen,'(', 21, 30, true, null ) // 25
|
||||
, new RBBIRuleTableElement(doNOP,'$', 80, 29, false, null ) // 26
|
||||
, new RBBIRuleTableElement(doDotAny,'.', 30,0, true, null ) // 27
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 28
|
||||
, new RBBIRuleTableElement(doCheckVarDef, 255, 30,0, false, "term-var-ref") // 29
|
||||
, new RBBIRuleTableElement(doNOP, 132, 30,0, true, "expr-mod") // 30
|
||||
, new RBBIRuleTableElement(doUnaryOpStar,'*', 35,0, true, null ) // 31
|
||||
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 35,0, true, null ) // 32
|
||||
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 35,0, true, null ) // 33
|
||||
, new RBBIRuleTableElement(doNOP, 255, 35,0, false, null ) // 34
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont") // 35
|
||||
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 36
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 37
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 38
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 39
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 40
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 41
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 42
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'{', 59,0, true, null ) // 43
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 44
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 45
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 46
|
||||
, new RBBIRuleTableElement(doSlash,'/', 49,0, true, "look-ahead") // 47
|
||||
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 48
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-slash") // 49
|
||||
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 50
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 51
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 52
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 53
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 54
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 55
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 56
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 57
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 58
|
||||
, new RBBIRuleTableElement(doNOP, 132, 59,0, true, "tag-open") // 59
|
||||
, new RBBIRuleTableElement(doStartTagValue, 128, 62,0, false, null ) // 60
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 61
|
||||
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-value") // 62
|
||||
, new RBBIRuleTableElement(doNOP,'}', 66,0, false, null ) // 63
|
||||
, new RBBIRuleTableElement(doTagDigit, 128, 62,0, true, null ) // 64
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 65
|
||||
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-close") // 66
|
||||
, new RBBIRuleTableElement(doTagValue,'}', 69,0, true, null ) // 67
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 68
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-tag") // 69
|
||||
, new RBBIRuleTableElement(doNOP, 132, 69,0, true, null ) // 70
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 71
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 72
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 73
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 74
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 75
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 76
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 77
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 78
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 79
|
||||
, new RBBIRuleTableElement(doStartVariableName,'$', 82,0, true, "scan-var-name") // 80
|
||||
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 81
|
||||
, new RBBIRuleTableElement(doNOP, 130, 84,0, true, "scan-var-start") // 82
|
||||
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 95,0, false, null ) // 83
|
||||
, new RBBIRuleTableElement(doNOP, 129, 84,0, true, "scan-var-body") // 84
|
||||
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 85
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 86
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 87
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 88
|
||||
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 89
|
||||
, new RBBIRuleTableElement(doNOP, 132, 90,0, true, "assign-or-rule") // 90
|
||||
, new RBBIRuleTableElement(doStartAssign,'=', 21, 93, true, null ) // 91
|
||||
, new RBBIRuleTableElement(doNOP, 255, 29, 8, false, null ) // 92
|
||||
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 93
|
||||
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 95,0, false, null ) // 94
|
||||
, new RBBIRuleTableElement(doExit, 255, 95,0, true, "errorDeath") // 95
|
||||
, new RBBIRuleTableElement(doNoChain,'^', 12, 9, true, null ) // 3
|
||||
, new RBBIRuleTableElement(doExprStart,'$', 88, 98, false, null ) // 4
|
||||
, new RBBIRuleTableElement(doNOP,'!', 19,0, true, null ) // 5
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 6
|
||||
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 7
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, null ) // 8
|
||||
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 9
|
||||
, new RBBIRuleTableElement(doNOP, 132, 9,0, true, null ) // 10
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 11
|
||||
, new RBBIRuleTableElement(doExprStart, 254, 29,0, false, "start-after-caret") // 12
|
||||
, new RBBIRuleTableElement(doNOP, 132, 12,0, true, null ) // 13
|
||||
, new RBBIRuleTableElement(doRuleError,'^', 103,0, false, null ) // 14
|
||||
, new RBBIRuleTableElement(doExprStart,'$', 88, 37, false, null ) // 15
|
||||
, new RBBIRuleTableElement(doRuleError,';', 103,0, false, null ) // 16
|
||||
, new RBBIRuleTableElement(doRuleError, 252, 103,0, false, null ) // 17
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 29,0, false, null ) // 18
|
||||
, new RBBIRuleTableElement(doNOP,'!', 21,0, true, "rev-option") // 19
|
||||
, new RBBIRuleTableElement(doReverseDir, 255, 28, 9, false, null ) // 20
|
||||
, new RBBIRuleTableElement(doOptionStart, 130, 23,0, true, "option-scan1") // 21
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 22
|
||||
, new RBBIRuleTableElement(doNOP, 129, 23,0, true, "option-scan2") // 23
|
||||
, new RBBIRuleTableElement(doOptionEnd, 255, 25,0, false, null ) // 24
|
||||
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 25
|
||||
, new RBBIRuleTableElement(doNOP, 132, 25,0, true, null ) // 26
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 27
|
||||
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, "reverse-rule") // 28
|
||||
, new RBBIRuleTableElement(doRuleChar, 254, 38,0, true, "term") // 29
|
||||
, new RBBIRuleTableElement(doNOP, 132, 29,0, true, null ) // 30
|
||||
, new RBBIRuleTableElement(doRuleChar, 131, 38,0, true, null ) // 31
|
||||
, new RBBIRuleTableElement(doNOP,'[', 94, 38, false, null ) // 32
|
||||
, new RBBIRuleTableElement(doLParen,'(', 29, 38, true, null ) // 33
|
||||
, new RBBIRuleTableElement(doNOP,'$', 88, 37, false, null ) // 34
|
||||
, new RBBIRuleTableElement(doDotAny,'.', 38,0, true, null ) // 35
|
||||
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 36
|
||||
, new RBBIRuleTableElement(doCheckVarDef, 255, 38,0, false, "term-var-ref") // 37
|
||||
, new RBBIRuleTableElement(doNOP, 132, 38,0, true, "expr-mod") // 38
|
||||
, new RBBIRuleTableElement(doUnaryOpStar,'*', 43,0, true, null ) // 39
|
||||
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 43,0, true, null ) // 40
|
||||
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 43,0, true, null ) // 41
|
||||
, new RBBIRuleTableElement(doNOP, 255, 43,0, false, null ) // 42
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont") // 43
|
||||
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 44
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 45
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 46
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 47
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 48
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 49
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 50
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'{', 67,0, true, null ) // 51
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 52
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 53
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 54
|
||||
, new RBBIRuleTableElement(doSlash,'/', 57,0, true, "look-ahead") // 55
|
||||
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 56
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-slash") // 57
|
||||
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 58
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 59
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 60
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 61
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 62
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 63
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 64
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 65
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 66
|
||||
, new RBBIRuleTableElement(doNOP, 132, 67,0, true, "tag-open") // 67
|
||||
, new RBBIRuleTableElement(doStartTagValue, 128, 70,0, false, null ) // 68
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 69
|
||||
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-value") // 70
|
||||
, new RBBIRuleTableElement(doNOP,'}', 74,0, false, null ) // 71
|
||||
, new RBBIRuleTableElement(doTagDigit, 128, 70,0, true, null ) // 72
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 73
|
||||
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-close") // 74
|
||||
, new RBBIRuleTableElement(doTagValue,'}', 77,0, true, null ) // 75
|
||||
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 76
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-tag") // 77
|
||||
, new RBBIRuleTableElement(doNOP, 132, 77,0, true, null ) // 78
|
||||
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 79
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 80
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 81
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 82
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 83
|
||||
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 84
|
||||
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 85
|
||||
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 86
|
||||
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 87
|
||||
, new RBBIRuleTableElement(doStartVariableName,'$', 90,0, true, "scan-var-name") // 88
|
||||
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 89
|
||||
, new RBBIRuleTableElement(doNOP, 130, 92,0, true, "scan-var-start") // 90
|
||||
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 103,0, false, null ) // 91
|
||||
, new RBBIRuleTableElement(doNOP, 129, 92,0, true, "scan-var-body") // 92
|
||||
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 93
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 94
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 95
|
||||
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 96
|
||||
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 97
|
||||
, new RBBIRuleTableElement(doNOP, 132, 98,0, true, "assign-or-rule") // 98
|
||||
, new RBBIRuleTableElement(doStartAssign,'=', 29, 101, true, null ) // 99
|
||||
, new RBBIRuleTableElement(doNOP, 255, 37, 9, false, null ) // 100
|
||||
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 101
|
||||
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 103,0, false, null ) // 102
|
||||
, new RBBIRuleTableElement(doExit, 255, 103,0, true, "errorDeath") // 103
|
||||
};
|
||||
}
|
||||
};
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
|
||||
* Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
@ -66,6 +66,9 @@ class RBBIRuleScanner {
|
||||
boolean fLookAheadRule; // True if the rule includes a '/'
|
||||
// somewhere within it.
|
||||
|
||||
boolean fNoChainInRule; // True if the current rule starts with a '^'.
|
||||
|
||||
|
||||
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
|
||||
// $variable symbols.
|
||||
|
||||
@ -139,6 +142,12 @@ class RBBIRuleScanner {
|
||||
fRuleNum++;
|
||||
break;
|
||||
|
||||
case RBBIRuleParseTable.doNoChain:
|
||||
// Scanned a '^' while on the rule start state.
|
||||
fNoChainInRule = true;
|
||||
break;
|
||||
|
||||
|
||||
case RBBIRuleParseTable.doExprOrOperator: {
|
||||
fixOpStack(RBBINode.precOpCat);
|
||||
RBBINode operandNode = fNodeStack[fNodeStackPtr--];
|
||||
@ -241,11 +250,11 @@ class RBBIRuleScanner {
|
||||
printNodeStack("end of rule");
|
||||
}
|
||||
Assert.assrt(fNodeStackPtr == 1);
|
||||
RBBINode thisRule = fNodeStack[fNodeStackPtr];
|
||||
|
||||
// If this rule includes a look-ahead '/', add an endMark node to the
|
||||
// expression tree.
|
||||
if (fLookAheadRule) {
|
||||
RBBINode thisRule = fNodeStack[fNodeStackPtr];
|
||||
RBBINode endNode = pushNewNode(RBBINode.endMark);
|
||||
RBBINode catNode = pushNewNode(RBBINode.opCat);
|
||||
fNodeStackPtr -= 2;
|
||||
@ -254,8 +263,24 @@ class RBBIRuleScanner {
|
||||
fNodeStack[fNodeStackPtr] = catNode;
|
||||
endNode.fVal = fRuleNum;
|
||||
endNode.fLookAheadEnd = true;
|
||||
thisRule = catNode;
|
||||
|
||||
// TODO: Disable chaining out of look-ahead (hard break) rules.
|
||||
// The break on rule match is forced, so there is no point in building up
|
||||
// the state table to chain into another rule for a longer match.
|
||||
}
|
||||
|
||||
// Mark this node as being the root of a rule.
|
||||
thisRule.fRuleRoot = true;
|
||||
|
||||
// Flag if chaining into this rule is wanted.
|
||||
//
|
||||
if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain
|
||||
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
|
||||
thisRule.fChainIn = true;
|
||||
}
|
||||
|
||||
|
||||
// All rule expressions are ORed together.
|
||||
// The ';' that terminates an expression really just functions as a
|
||||
// '|' with
|
||||
@ -269,12 +294,12 @@ class RBBIRuleScanner {
|
||||
int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
|
||||
|
||||
if (fRB.fTreeRoots[destRules] != null) {
|
||||
// This is not the first rule encounted.
|
||||
// This is not the first rule encountered.
|
||||
// OR previous stuff (from *destRules)
|
||||
// with the current rule expression (on the Node Stack)
|
||||
// with the resulting OR expression going to *destRules
|
||||
//
|
||||
RBBINode thisRule = fNodeStack[fNodeStackPtr];
|
||||
thisRule = fNodeStack[fNodeStackPtr];
|
||||
RBBINode prevRules = fRB.fTreeRoots[destRules];
|
||||
RBBINode orNode = pushNewNode(RBBINode.opOr);
|
||||
orNode.fLeftChild = prevRules;
|
||||
@ -289,6 +314,7 @@ class RBBIRuleScanner {
|
||||
}
|
||||
fReverseRule = false; // in preparation for the next rule.
|
||||
fLookAheadRule = false;
|
||||
fNoChainInRule = false;
|
||||
fNodeStackPtr = 0;
|
||||
}
|
||||
break;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2009, International Business Machines
|
||||
* Copyright (c) 2002-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -361,6 +361,25 @@ class RBBITableBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
|
||||
// as roots of a rule to a destination vector.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
|
||||
if (node == null) {
|
||||
return;
|
||||
}
|
||||
if (node.fRuleRoot) {
|
||||
dest.add(node);
|
||||
// Note: rules cannot nest. If we found a rule start node,
|
||||
// no child node can also be a start node.
|
||||
return;
|
||||
}
|
||||
addRuleRootNodes(dest, node.fLeftChild);
|
||||
addRuleRootNodes(dest, node.fRightChild);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
@ -379,17 +398,21 @@ class RBBITableBuilder {
|
||||
// get a list of all leaf nodes
|
||||
tree.findNodes(leafNodes, RBBINode.leafChar);
|
||||
|
||||
// Get all nodes that can be the start a match, which is FirstPosition()
|
||||
// of the portion of the tree corresponding to user-written rules.
|
||||
// See the tree description in bofFixup().
|
||||
RBBINode userRuleRoot = tree;
|
||||
if (fRB.fSetBuilder.sawBOF()) {
|
||||
userRuleRoot = tree.fLeftChild.fRightChild;
|
||||
}
|
||||
Assert.assrt(userRuleRoot != null);
|
||||
Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;
|
||||
// Collect all leaf nodes that can start matches for rules
|
||||
// with inbound chaining enabled, which is the union of the
|
||||
// firstPosition sets from each of the rule root nodes.
|
||||
|
||||
// Iteratate over all leaf nodes,
|
||||
List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
|
||||
addRuleRootNodes(ruleRootNodes, tree);
|
||||
|
||||
Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
|
||||
for (RBBINode node: ruleRootNodes) {
|
||||
if (node.fChainIn) {
|
||||
matchStartNodes.addAll(node.fFirstPosSet);
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over all leaf nodes,
|
||||
//
|
||||
for (RBBINode tNode : leafNodes) {
|
||||
RBBINode endNode = null;
|
||||
|
@ -1158,6 +1158,51 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
return eng;
|
||||
}
|
||||
|
||||
private static final int kMaxLookaheads = 8;
|
||||
private static class LookAheadResults {
|
||||
int fUsedSlotLimit;
|
||||
int[] fPositions;
|
||||
int[] fKeys;
|
||||
|
||||
LookAheadResults() {
|
||||
fUsedSlotLimit= 0;
|
||||
fPositions = new int[kMaxLookaheads];
|
||||
fKeys = new int[kMaxLookaheads];
|
||||
}
|
||||
|
||||
int getPosition(int key) {
|
||||
for (int i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
return fPositions[i];
|
||||
}
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
}
|
||||
|
||||
void setPosition(int key, int position) {
|
||||
int i;
|
||||
for (i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
fPositions[i] = position;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (i >= kMaxLookaheads) {
|
||||
assert(false);
|
||||
i = kMaxLookaheads - 1;
|
||||
}
|
||||
fKeys[i] = key;
|
||||
fPositions[i] = position;
|
||||
assert(fUsedSlotLimit == i);
|
||||
fUsedSlotLimit = i + 1;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
fUsedSlotLimit = 0;
|
||||
}
|
||||
};
|
||||
private LookAheadResults fLookAheadMatches = new LookAheadResults();
|
||||
|
||||
|
||||
/**
|
||||
@ -1214,9 +1259,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||||
}
|
||||
}
|
||||
int lookaheadStatus = 0;
|
||||
int lookaheadTagIdx = 0;
|
||||
int lookaheadResult = 0;
|
||||
fLookAheadMatches.reset();
|
||||
|
||||
// loop until we reach the end of the text or transition to state 0
|
||||
while (state != STOP_STATE) {
|
||||
@ -1226,16 +1269,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// We have already run the loop one last time with the
|
||||
// character set to the pseudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
|
||||
if (lookaheadResult > result) {
|
||||
// We ran off the end of the string with a pending
|
||||
// look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been
|
||||
// met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Run the loop one last time with the fake end-of-input character category
|
||||
@ -1299,40 +1332,30 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
|
||||
}
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
// TODO: make a standalone hard break in a rule work.
|
||||
if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
|
||||
text.setIndex(result);
|
||||
return result;
|
||||
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
|
||||
text.setIndex(lookaheadResult);
|
||||
return lookaheadResult;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further. Continue on.
|
||||
// TODO: junk this feature? I don't think it's used anywhere.
|
||||
continue;
|
||||
}
|
||||
|
||||
lookaheadResult = text.getIndex();
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int pos = text.getIndex();
|
||||
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
// The iterator has been left in the middle of a surrogate pair.
|
||||
// We want the beginning of it.
|
||||
lookaheadResult--;
|
||||
pos--;
|
||||
}
|
||||
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
|
||||
continue;
|
||||
fLookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
|
||||
// Because this is an accepting state, any in-progress look-ahead match
|
||||
// is no longer relevant. Clear out the pending lookahead status.
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
|
||||
} // End of state machine main loop
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
@ -1371,12 +1394,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
int mode;
|
||||
int row;
|
||||
int c;
|
||||
int lookaheadStatus = 0;
|
||||
int result = 0;
|
||||
int initialPosition = 0;
|
||||
int lookaheadResult = 0;
|
||||
boolean lookAheadHardBreak =
|
||||
(fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
fLookAheadMatches.reset();
|
||||
|
||||
// handlePrevious() never gets the rule status.
|
||||
// Flag the status as invalid; if the user ever asks for status, we will need
|
||||
@ -1407,7 +1427,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
//
|
||||
mainLoop: for (;;) {
|
||||
innerBlock: {
|
||||
if (c == DONE32) {
|
||||
// Reached end of input string.
|
||||
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
|
||||
@ -1415,13 +1434,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// does not have explicit support for matching {eof}, or
|
||||
// we have already done the {eof} iteration. Now is the time
|
||||
// to unconditionally bail out.
|
||||
if (lookaheadResult < result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
if (result == initialPosition) {
|
||||
// Ran off start, no match found.
|
||||
// Move one position (towards the start, since we are doing previous.)
|
||||
fText.setIndex(initialPosition);
|
||||
@ -1473,50 +1486,23 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
result = fText.getIndex();
|
||||
}
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
|
||||
// Lookahead match is completed. Set the result
|
||||
// accordingly, but only
|
||||
// if no other rule has matched further in the mean
|
||||
// time.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
// TODO: make a stand-alone hard break in a rule work.
|
||||
|
||||
if (lookAheadHardBreak) {
|
||||
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed.
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
result = lookaheadResult;
|
||||
break mainLoop;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further.
|
||||
// Continue on.
|
||||
// TODO: junk this feature? I don't think that it's used anywhere.
|
||||
break innerBlock;
|
||||
}
|
||||
// Hit a possible look-ahead match. We are at the
|
||||
// position of the '/'. Remember this position.
|
||||
lookaheadResult = fText.getIndex();
|
||||
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
break innerBlock;
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int pos = fText.getIndex();
|
||||
fLookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
// not lookahead...
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
|
||||
// This is a plain (non-look-ahead) accepting state.
|
||||
if (!lookAheadHardBreak) {
|
||||
// Clear out any pending look-ahead matches,
|
||||
// but only if not doing the lookAheadHardBreak option
|
||||
// which needs to force a break no matter what is going
|
||||
// on with the rest of the match, i.e. we can't abandon
|
||||
// a partially completed look-ahead match because
|
||||
// some other rule matched further than the '/' position
|
||||
// in the look-ahead match.
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} // end of innerBlock. "break innerBlock" in above code comes out here.
|
||||
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
// Normal loop exit is here
|
||||
break mainLoop;
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:848a445cb828689cd5bca20bfd321db5503ef66c0a94d929fc108a28d0c5595f
|
||||
size 11754757
|
||||
oid sha256:eb9182edec08706f02236909aaefcbf4c98d29d6415d1a8801633233c74f03fb
|
||||
size 11789631
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a75dfbe25f7671a65bb933aed49a71eb9a923767687625982603c54860478ce7
|
||||
oid sha256:cefefda6f12f61e7dcd7767a7b07b0fea3ca53c2a9b1524f3627e94cad6f3ee0
|
||||
size 90259
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2015 International Business Machines Corporation and
|
||||
* Copyright (C) 2003-2016 International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -88,6 +88,11 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
UnicodeSet fLVTSet;
|
||||
UnicodeSet fHangulSet;
|
||||
UnicodeSet fAnySet;
|
||||
UnicodeSet fEmojiModifierSet;
|
||||
UnicodeSet fEmojiBaseSet;
|
||||
UnicodeSet fZWJSet;
|
||||
UnicodeSet fGAZSet;
|
||||
|
||||
|
||||
StringBuffer fText;
|
||||
|
||||
@ -96,8 +101,8 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fText = null;
|
||||
fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
|
||||
fCRLFSet = new UnicodeSet("[\\r\\n]");
|
||||
fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
|
||||
fControlSet = new UnicodeSet("[[\\p{Grapheme_Cluster_Break = Control}-[:Block=Tags:]]]");
|
||||
fExtendSet = new UnicodeSet("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]");
|
||||
fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
|
||||
fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
|
||||
fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
|
||||
@ -115,6 +120,17 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
|
||||
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
|
||||
|
||||
fEmojiBaseSet = new UnicodeSet(
|
||||
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
|
||||
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
|
||||
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
|
||||
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]");
|
||||
|
||||
fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
|
||||
fZWJSet = new UnicodeSet(0x200D, 0x200D);
|
||||
fGAZSet = new UnicodeSet("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]");
|
||||
|
||||
|
||||
fSets = new ArrayList();
|
||||
fSets.add(fCRLFSet);
|
||||
fSets.add(fControlSet);
|
||||
@ -126,6 +142,10 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fSets.add(fSpacingSet);
|
||||
fSets.add(fHangulSet);
|
||||
fSets.add(fAnySet);
|
||||
fSets.add(fEmojiBaseSet);
|
||||
fSets.add(fEmojiModifierSet);
|
||||
fSets.add(fZWJSet);
|
||||
fSets.add(fGAZSet);
|
||||
}
|
||||
|
||||
|
||||
@ -138,25 +158,26 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
}
|
||||
|
||||
int next(int prevPos) {
|
||||
int p1, p2, p3; // Indices of the significant code points around the
|
||||
int p0, p1, p2, p3; // Indices of the significant code points around the
|
||||
// break position being tested. The candidate break
|
||||
// location is before p2.
|
||||
|
||||
int breakPos = -1;
|
||||
|
||||
int c1, c2, c3; // The code points at p0, p1, p2 & p3.
|
||||
int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
|
||||
|
||||
// Previous break at end of string. return DONE.
|
||||
if (prevPos >= fText.length()) {
|
||||
return -1;
|
||||
}
|
||||
p1 = p2 = p3 = prevPos;
|
||||
p0 = p1 = p2 = p3 = prevPos;
|
||||
c3 = UTF16.charAt(fText, prevPos);
|
||||
c1 = c2 = 0;
|
||||
c0 = c1 = c2 = 0;
|
||||
|
||||
// Loop runs once per "significant" character position in the input text.
|
||||
for (;;) {
|
||||
// Move all of the positions forward in the input string.
|
||||
p0 = p1; c0 = c1;
|
||||
p1 = p2; c1 = c2;
|
||||
p2 = p3; c2 = c3;
|
||||
|
||||
@ -219,12 +240,21 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
}
|
||||
|
||||
// Rule (GB8a) Regional_Indicator x Regional_Indicator
|
||||
// Note: The first if condition is a little tricky. We only need to force
|
||||
// a break if there are three or more contiguous RIs. If there are
|
||||
// only two, a break following will occur via other rules, and will include
|
||||
// any trailing extend characters, which is the needed behavior.
|
||||
if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
|
||||
&& fRegionalIndicatorSet.contains(c2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB9) Numeric x ALetter
|
||||
if (fExtendSet.contains(c2)) {
|
||||
// Rule (GB9) x Extend
|
||||
if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -237,6 +267,15 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
if (fPrependSet.contains(c1)) {
|
||||
continue;
|
||||
}
|
||||
// Rule (GB9c) Emoji_Base x Emoji_Modifier
|
||||
if ((fEmojiBaseSet.contains(c1) || fGAZSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB9d) ZWJ x Glue_After_Zwj
|
||||
if (fZWJSet.contains(c1) && fGAZSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB10) Any <break> Any
|
||||
break;
|
||||
@ -277,6 +316,10 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
UnicodeSet fExtendNumLetSet;
|
||||
UnicodeSet fOtherSet;
|
||||
UnicodeSet fDictionaryCjkSet;
|
||||
UnicodeSet fEBaseSet;
|
||||
UnicodeSet fEModifierSet;
|
||||
UnicodeSet fZWSSet;
|
||||
UnicodeSet fGAZSet;
|
||||
|
||||
|
||||
RBBIWordMonkey() {
|
||||
@ -300,6 +343,16 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
|
||||
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
|
||||
fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
|
||||
fEBaseSet = new UnicodeSet(
|
||||
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
|
||||
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
|
||||
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
|
||||
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]");
|
||||
|
||||
fEModifierSet = new UnicodeSet("[\\U0001F3FB-\\U0001F3FF]");
|
||||
fZWSSet = new UnicodeSet(0x200D, 0x200D);
|
||||
fGAZSet = new UnicodeSet("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]");
|
||||
fExtendSet.removeAll(fZWSSet);
|
||||
|
||||
fOtherSet = new UnicodeSet();
|
||||
fOtherSet.complement();
|
||||
@ -318,6 +371,11 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fOtherSet.removeAll(fExtendSet);
|
||||
fOtherSet.removeAll(fExtendNumLetSet);
|
||||
fOtherSet.removeAll(fRegionalIndicatorSet);
|
||||
fOtherSet.removeAll(fEBaseSet);
|
||||
fOtherSet.removeAll(fEModifierSet);
|
||||
fOtherSet.removeAll(fZWSSet);
|
||||
fOtherSet.removeAll(fGAZSet);
|
||||
|
||||
// Inhibit dictionary characters from being tested at all.
|
||||
// remove surrogates so as to not generate higher CJK characters
|
||||
fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
|
||||
@ -390,7 +448,7 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
break;
|
||||
}
|
||||
}
|
||||
while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
|
||||
while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWSSet, c3));
|
||||
|
||||
if (p1 == p2) {
|
||||
// Still warming up the loop. (won't work with zero length strings, but we don't care)
|
||||
@ -418,6 +476,13 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
break;
|
||||
}
|
||||
|
||||
// Rule (3c) ZWJ x GAZ (Glue after ZWJ).
|
||||
// Not ignoring extend chars, so peek into input text to
|
||||
// get the potential ZWJ, the character immediately preceding c2.
|
||||
if (fZWSSet.contains(fText.codePointBefore(p2)) && fGAZSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
|
||||
if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
|
||||
(fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
|
||||
@ -509,10 +574,18 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
|
||||
// Rule 13c Do not break between Regional Indicators.
|
||||
// Regional_Indicator × Regional_Indicator
|
||||
if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
|
||||
break;
|
||||
}
|
||||
if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule 13d
|
||||
if ((fEBaseSet.contains(c1) || fGAZSet.contains(c1)) && fEModifierSet.contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule 14. Break found here.
|
||||
break;
|
||||
}
|
||||
@ -570,8 +643,10 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
UnicodeSet fJV;
|
||||
UnicodeSet fJT;
|
||||
UnicodeSet fRI;
|
||||
UnicodeSet fSA;
|
||||
UnicodeSet fXX;
|
||||
UnicodeSet fEB;
|
||||
UnicodeSet fEM;
|
||||
UnicodeSet fZJ;
|
||||
|
||||
StringBuffer fText;
|
||||
int fOrigPositions;
|
||||
@ -621,23 +696,33 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fJV = new UnicodeSet("[\\p{Line_break=JV}]");
|
||||
fJT = new UnicodeSet("[\\p{Line_break=JT}]");
|
||||
fRI = new UnicodeSet("[\\p{Line_break=RI}]");
|
||||
fSA = new UnicodeSet("[\\p{Line_break=SA}]");
|
||||
fXX = new UnicodeSet("[\\p{Line_break=XX}]");
|
||||
fEB = new UnicodeSet(
|
||||
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
|
||||
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
|
||||
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
|
||||
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]");
|
||||
fEM = new UnicodeSet("[\\U0001F3FB-\\U0001F3FF]");
|
||||
fZJ = new UnicodeSet(0x200D, 0x200D);
|
||||
|
||||
// Remove dictionary characters.
|
||||
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
|
||||
// so dictionary characters are omitted from the monkey test data.
|
||||
UnicodeSet dictionarySet = new UnicodeSet(
|
||||
"[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
|
||||
fSA.removeAll(dictionarySet);
|
||||
|
||||
fAL.addAll(fXX); // Default behavior for XX is identical to AL
|
||||
fAL.addAll(fAI); // Default behavior for AI is identical to AL
|
||||
fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
|
||||
fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
|
||||
|
||||
fNS.addAll(fCJ); // Default behavior for CJ is identical to NS.
|
||||
|
||||
fID.addAll(fEB); // Emoji Base and Emoji Modifier behave as ID.
|
||||
fID.addAll(fEM);
|
||||
fAL.removeAll(fEM);
|
||||
fAL.remove(0x2764); // Emoji Proposal: move u2764 from AL to ID
|
||||
fID.add(0x2764);
|
||||
|
||||
fSets.add(fBK);
|
||||
fSets.add(fCR);
|
||||
fSets.add(fLF);
|
||||
@ -674,9 +759,12 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fSets.add(fHL);
|
||||
fSets.add(fID);
|
||||
fSets.add(fWJ);
|
||||
fSets.add(fSA);
|
||||
fSets.add(fSG);
|
||||
fSets.add(fRI);
|
||||
fSets.add(fSG);
|
||||
fSets.add(fEB);
|
||||
fSets.add(fEM);
|
||||
fSets.add(fZJ);
|
||||
|
||||
}
|
||||
|
||||
void setText(StringBuffer s) {
|
||||
@ -810,6 +898,17 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
break;
|
||||
}
|
||||
|
||||
// LB 8a ZJ x ID
|
||||
// The monkey test's way of ignoring combining characters doesn't work
|
||||
// for this rule. ZJ is also a CM. Need to get the actual character
|
||||
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
|
||||
{
|
||||
int prevC = fText.codePointBefore(pos);
|
||||
if (fZJ.contains(prevC) && fID.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 9, 10 Already done, at top of loop.
|
||||
//
|
||||
|
||||
@ -1061,11 +1160,20 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 30a Do not break between regional indicators. RI × RI
|
||||
// LB 30a Break between pairs of Regional Indicators.
|
||||
// RI RI <break> RI
|
||||
// RI x RI
|
||||
if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
|
||||
break;
|
||||
}
|
||||
if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB30b Emoji Base x Emoji Modifier
|
||||
if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
// LB 31 Break everywhere else
|
||||
break;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user