ICU-12081 RBBI extensions & Emoji rules. Import rule data to Java from C++, port code changes.

X-SVN-Rev: 38422
This commit is contained in:
Andy Heninger 2016-02-28 19:14:48 +00:00
parent 48214e5b5d
commit b552700cc6
8 changed files with 870 additions and 707 deletions

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2010, International Business Machines Corporation and
* Copyright (c) 2001-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -91,6 +91,11 @@ class RBBINode {
boolean fLookAheadEnd; // For endMark nodes, set TRUE if
// marking the end of a look-ahead rule.
boolean fRuleRoot; // True if this node is the root of a rule.
boolean fChainIn; // True if chaining into this rule is allowed
// (no '^' present).
Set<RBBINode> fFirstPosSet; // See Aho DFA table generation algorithm
Set<RBBINode> fLastPosSet; // See Aho.
Set<RBBINode> fFollowPos; // See Aho.
@ -129,6 +134,8 @@ class RBBINode {
fLastPos = other.fLastPos;
fNullable = other.fNullable;
fVal = other.fVal;
fRuleRoot = false;
fChainIn = other.fChainIn;
fFirstPosSet = new HashSet<RBBINode>(other.fFirstPosSet);
fLastPosSet = new HashSet<RBBINode>(other.fLastPosSet);
fFollowPos = new HashSet<RBBINode>(other.fFollowPos);
@ -163,6 +170,8 @@ class RBBINode {
n.fRightChild.fParent = n;
}
}
n.fRuleRoot = this.fRuleRoot;
n.fChainIn = this.fChainIn;
return n;
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines Corporation and
* others. All Rights Reserved.
* Copyright (c) 2003-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -13,6 +13,8 @@ package com.ibm.icu.text;
* rule parser.
* It is generated by the Perl script "rbbicst.pl" from
* the rule parser state definitions file "rbbirpt.txt".
* @internal
*
*/
class RBBIRuleParseTable
{
@ -29,24 +31,25 @@ class RBBIRuleParseTable
static final short doExprStart = 11;
static final short doLParen = 12;
static final short doNOP = 13;
static final short doOptionEnd = 14;
static final short doOptionStart = 15;
static final short doReverseDir = 16;
static final short doRuleChar = 17;
static final short doRuleError = 18;
static final short doRuleErrorAssignExpr = 19;
static final short doScanUnicodeSet = 20;
static final short doSlash = 21;
static final short doStartAssign = 22;
static final short doStartTagValue = 23;
static final short doStartVariableName = 24;
static final short doTagDigit = 25;
static final short doTagExpectedError = 26;
static final short doTagValue = 27;
static final short doUnaryOpPlus = 28;
static final short doUnaryOpQuestion = 29;
static final short doUnaryOpStar = 30;
static final short doVariableNameExpectedErr = 31;
static final short doNoChain = 14;
static final short doOptionEnd = 15;
static final short doOptionStart = 16;
static final short doReverseDir = 17;
static final short doRuleChar = 18;
static final short doRuleError = 19;
static final short doRuleErrorAssignExpr = 20;
static final short doScanUnicodeSet = 21;
static final short doSlash = 22;
static final short doStartAssign = 23;
static final short doStartTagValue = 24;
static final short doStartVariableName = 25;
static final short doTagDigit = 26;
static final short doTagExpectedError = 27;
static final short doTagValue = 28;
static final short doUnaryOpPlus = 29;
static final short doUnaryOpQuestion = 30;
static final short doUnaryOpStar = 31;
static final short doVariableNameExpectedErr = 32;
static final short kRuleSet_default = 255;
static final short kRuleSet_digit_char = 128;
@ -73,104 +76,112 @@ class RBBIRuleParseTable
fNextChar = nc;
fStateName = sn;
}
}
};
static RBBIRuleTableElement[] gRuleParseStateTable = {
new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0
, new RBBIRuleTableElement(doExprStart, 254, 21, 8, false, "start") // 1
, new RBBIRuleTableElement(doExprStart, 254, 29, 9, false, "start") // 1
, new RBBIRuleTableElement(doNOP, 132, 1,0, true, null ) // 2
, new RBBIRuleTableElement(doExprStart,'$', 80, 90, false, null ) // 3
, new RBBIRuleTableElement(doNOP,'!', 11,0, true, null ) // 4
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 5
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 6
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, null ) // 7
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 8
, new RBBIRuleTableElement(doNOP, 132, 8,0, true, null ) // 9
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 10
, new RBBIRuleTableElement(doNOP,'!', 13,0, true, "rev-option") // 11
, new RBBIRuleTableElement(doReverseDir, 255, 20, 8, false, null ) // 12
, new RBBIRuleTableElement(doOptionStart, 130, 15,0, true, "option-scan1") // 13
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 14
, new RBBIRuleTableElement(doNOP, 129, 15,0, true, "option-scan2") // 15
, new RBBIRuleTableElement(doOptionEnd, 255, 17,0, false, null ) // 16
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 17
, new RBBIRuleTableElement(doNOP, 132, 17,0, true, null ) // 18
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 19
, new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, "reverse-rule") // 20
, new RBBIRuleTableElement(doRuleChar, 254, 30,0, true, "term") // 21
, new RBBIRuleTableElement(doNOP, 132, 21,0, true, null ) // 22
, new RBBIRuleTableElement(doRuleChar, 131, 30,0, true, null ) // 23
, new RBBIRuleTableElement(doNOP,'[', 86, 30, false, null ) // 24
, new RBBIRuleTableElement(doLParen,'(', 21, 30, true, null ) // 25
, new RBBIRuleTableElement(doNOP,'$', 80, 29, false, null ) // 26
, new RBBIRuleTableElement(doDotAny,'.', 30,0, true, null ) // 27
, new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 28
, new RBBIRuleTableElement(doCheckVarDef, 255, 30,0, false, "term-var-ref") // 29
, new RBBIRuleTableElement(doNOP, 132, 30,0, true, "expr-mod") // 30
, new RBBIRuleTableElement(doUnaryOpStar,'*', 35,0, true, null ) // 31
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 35,0, true, null ) // 32
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 35,0, true, null ) // 33
, new RBBIRuleTableElement(doNOP, 255, 35,0, false, null ) // 34
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont") // 35
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 36
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 37
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 38
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 39
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 40
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 41
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 42
, new RBBIRuleTableElement(doExprCatOperator,'{', 59,0, true, null ) // 43
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 44
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 45
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 46
, new RBBIRuleTableElement(doSlash,'/', 49,0, true, "look-ahead") // 47
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 48
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-slash") // 49
, new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 50
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 51
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 52
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 53
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 54
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 55
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 56
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 57
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 58
, new RBBIRuleTableElement(doNOP, 132, 59,0, true, "tag-open") // 59
, new RBBIRuleTableElement(doStartTagValue, 128, 62,0, false, null ) // 60
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 61
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-value") // 62
, new RBBIRuleTableElement(doNOP,'}', 66,0, false, null ) // 63
, new RBBIRuleTableElement(doTagDigit, 128, 62,0, true, null ) // 64
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 65
, new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-close") // 66
, new RBBIRuleTableElement(doTagValue,'}', 69,0, true, null ) // 67
, new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 68
, new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-tag") // 69
, new RBBIRuleTableElement(doNOP, 132, 69,0, true, null ) // 70
, new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 71
, new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 72
, new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 73
, new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 74
, new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 75
, new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 76
, new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 77
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 78
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 79
, new RBBIRuleTableElement(doStartVariableName,'$', 82,0, true, "scan-var-name") // 80
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 81
, new RBBIRuleTableElement(doNOP, 130, 84,0, true, "scan-var-start") // 82
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 95,0, false, null ) // 83
, new RBBIRuleTableElement(doNOP, 129, 84,0, true, "scan-var-body") // 84
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 85
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 86
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 87
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 88
, new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 89
, new RBBIRuleTableElement(doNOP, 132, 90,0, true, "assign-or-rule") // 90
, new RBBIRuleTableElement(doStartAssign,'=', 21, 93, true, null ) // 91
, new RBBIRuleTableElement(doNOP, 255, 29, 8, false, null ) // 92
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 93
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 95,0, false, null ) // 94
, new RBBIRuleTableElement(doExit, 255, 95,0, true, "errorDeath") // 95
, new RBBIRuleTableElement(doNoChain,'^', 12, 9, true, null ) // 3
, new RBBIRuleTableElement(doExprStart,'$', 88, 98, false, null ) // 4
, new RBBIRuleTableElement(doNOP,'!', 19,0, true, null ) // 5
, new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 6
, new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 7
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, null ) // 8
, new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 9
, new RBBIRuleTableElement(doNOP, 132, 9,0, true, null ) // 10
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 11
, new RBBIRuleTableElement(doExprStart, 254, 29,0, false, "start-after-caret") // 12
, new RBBIRuleTableElement(doNOP, 132, 12,0, true, null ) // 13
, new RBBIRuleTableElement(doRuleError,'^', 103,0, false, null ) // 14
, new RBBIRuleTableElement(doExprStart,'$', 88, 37, false, null ) // 15
, new RBBIRuleTableElement(doRuleError,';', 103,0, false, null ) // 16
, new RBBIRuleTableElement(doRuleError, 252, 103,0, false, null ) // 17
, new RBBIRuleTableElement(doExprStart, 255, 29,0, false, null ) // 18
, new RBBIRuleTableElement(doNOP,'!', 21,0, true, "rev-option") // 19
, new RBBIRuleTableElement(doReverseDir, 255, 28, 9, false, null ) // 20
, new RBBIRuleTableElement(doOptionStart, 130, 23,0, true, "option-scan1") // 21
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 22
, new RBBIRuleTableElement(doNOP, 129, 23,0, true, "option-scan2") // 23
, new RBBIRuleTableElement(doOptionEnd, 255, 25,0, false, null ) // 24
, new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 25
, new RBBIRuleTableElement(doNOP, 132, 25,0, true, null ) // 26
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 27
, new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, "reverse-rule") // 28
, new RBBIRuleTableElement(doRuleChar, 254, 38,0, true, "term") // 29
, new RBBIRuleTableElement(doNOP, 132, 29,0, true, null ) // 30
, new RBBIRuleTableElement(doRuleChar, 131, 38,0, true, null ) // 31
, new RBBIRuleTableElement(doNOP,'[', 94, 38, false, null ) // 32
, new RBBIRuleTableElement(doLParen,'(', 29, 38, true, null ) // 33
, new RBBIRuleTableElement(doNOP,'$', 88, 37, false, null ) // 34
, new RBBIRuleTableElement(doDotAny,'.', 38,0, true, null ) // 35
, new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 36
, new RBBIRuleTableElement(doCheckVarDef, 255, 38,0, false, "term-var-ref") // 37
, new RBBIRuleTableElement(doNOP, 132, 38,0, true, "expr-mod") // 38
, new RBBIRuleTableElement(doUnaryOpStar,'*', 43,0, true, null ) // 39
, new RBBIRuleTableElement(doUnaryOpPlus,'+', 43,0, true, null ) // 40
, new RBBIRuleTableElement(doUnaryOpQuestion,'?', 43,0, true, null ) // 41
, new RBBIRuleTableElement(doNOP, 255, 43,0, false, null ) // 42
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont") // 43
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 44
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 45
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 46
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 47
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 48
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 49
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 50
, new RBBIRuleTableElement(doExprCatOperator,'{', 67,0, true, null ) // 51
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 52
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 53
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 54
, new RBBIRuleTableElement(doSlash,'/', 57,0, true, "look-ahead") // 55
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 56
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-slash") // 57
, new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 58
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 59
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 60
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 61
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 62
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 63
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 64
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 65
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 66
, new RBBIRuleTableElement(doNOP, 132, 67,0, true, "tag-open") // 67
, new RBBIRuleTableElement(doStartTagValue, 128, 70,0, false, null ) // 68
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 69
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-value") // 70
, new RBBIRuleTableElement(doNOP,'}', 74,0, false, null ) // 71
, new RBBIRuleTableElement(doTagDigit, 128, 70,0, true, null ) // 72
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 73
, new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-close") // 74
, new RBBIRuleTableElement(doTagValue,'}', 77,0, true, null ) // 75
, new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 76
, new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-tag") // 77
, new RBBIRuleTableElement(doNOP, 132, 77,0, true, null ) // 78
, new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 79
, new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 80
, new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 81
, new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 82
, new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 83
, new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 84
, new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 85
, new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 86
, new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 87
, new RBBIRuleTableElement(doStartVariableName,'$', 90,0, true, "scan-var-name") // 88
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 89
, new RBBIRuleTableElement(doNOP, 130, 92,0, true, "scan-var-start") // 90
, new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 103,0, false, null ) // 91
, new RBBIRuleTableElement(doNOP, 129, 92,0, true, "scan-var-body") // 92
, new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 93
, new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 94
, new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 95
, new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 96
, new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 97
, new RBBIRuleTableElement(doNOP, 132, 98,0, true, "assign-or-rule") // 98
, new RBBIRuleTableElement(doStartAssign,'=', 29, 101, true, null ) // 99
, new RBBIRuleTableElement(doNOP, 255, 37, 9, false, null ) // 100
, new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 101
, new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 103,0, false, null ) // 102
, new RBBIRuleTableElement(doExit, 255, 103,0, true, "errorDeath") // 103
};
}
};

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
* Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -66,6 +66,9 @@ class RBBIRuleScanner {
boolean fLookAheadRule; // True if the rule includes a '/'
// somewhere within it.
boolean fNoChainInRule; // True if the current rule starts with a '^'.
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
@ -139,6 +142,12 @@ class RBBIRuleScanner {
fRuleNum++;
break;
case RBBIRuleParseTable.doNoChain:
// Scanned a '^' while on the rule start state.
fNoChainInRule = true;
break;
case RBBIRuleParseTable.doExprOrOperator: {
fixOpStack(RBBINode.precOpCat);
RBBINode operandNode = fNodeStack[fNodeStackPtr--];
@ -241,11 +250,11 @@ class RBBIRuleScanner {
printNodeStack("end of rule");
}
Assert.assrt(fNodeStackPtr == 1);
RBBINode thisRule = fNodeStack[fNodeStackPtr];
// If this rule includes a look-ahead '/', add an endMark node to the
// expression tree.
if (fLookAheadRule) {
RBBINode thisRule = fNodeStack[fNodeStackPtr];
RBBINode endNode = pushNewNode(RBBINode.endMark);
RBBINode catNode = pushNewNode(RBBINode.opCat);
fNodeStackPtr -= 2;
@ -254,8 +263,24 @@ class RBBIRuleScanner {
fNodeStack[fNodeStackPtr] = catNode;
endNode.fVal = fRuleNum;
endNode.fLookAheadEnd = true;
thisRule = catNode;
// TODO: Disable chaining out of look-ahead (hard break) rules.
// The break on rule match is forced, so there is no point in building up
// the state table to chain into another rule for a longer match.
}
// Mark this node as being the root of a rule.
thisRule.fRuleRoot = true;
// Flag if chaining into this rule is wanted.
//
if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
thisRule.fChainIn = true;
}
// All rule expressions are ORed together.
// The ';' that terminates an expression really just functions as a
// '|' with
@ -269,12 +294,12 @@ class RBBIRuleScanner {
int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
if (fRB.fTreeRoots[destRules] != null) {
// This is not the first rule encounted.
// This is not the first rule encountered.
// OR previous stuff (from *destRules)
// with the current rule expression (on the Node Stack)
// with the resulting OR expression going to *destRules
//
RBBINode thisRule = fNodeStack[fNodeStackPtr];
thisRule = fNodeStack[fNodeStackPtr];
RBBINode prevRules = fRB.fTreeRoots[destRules];
RBBINode orNode = pushNewNode(RBBINode.opOr);
orNode.fLeftChild = prevRules;
@ -289,6 +314,7 @@ class RBBIRuleScanner {
}
fReverseRule = false; // in preparation for the next rule.
fLookAheadRule = false;
fNoChainInRule = false;
fNodeStackPtr = 0;
}
break;

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2002-2009, International Business Machines
* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -361,6 +361,25 @@ class RBBITableBuilder {
}
}
//-----------------------------------------------------------------------------
//
//   addRuleRootNodes    Gather every node flagged as the root of a rule
//                       (fRuleRoot set) from the parse tree under 'node',
//                       appending each one to 'dest'. The left subtree is
//                       walked before the right subtree. A null 'node'
//                       contributes nothing.
//
//-----------------------------------------------------------------------------
void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
    if (node != null) {
        if (node.fRuleRoot) {
            // Rules do not nest, so nothing below a rule root can itself be
            // a rule root; record this node and do not descend any further.
            dest.add(node);
        } else {
            addRuleRootNodes(dest, node.fLeftChild);
            addRuleRootNodes(dest, node.fRightChild);
        }
    }
}
//-----------------------------------------------------------------------------
//
@ -379,17 +398,21 @@ class RBBITableBuilder {
// get a list of all leaf nodes
tree.findNodes(leafNodes, RBBINode.leafChar);
// Get all nodes that can be the start a match, which is FirstPosition()
// of the portion of the tree corresponding to user-written rules.
// See the tree description in bofFixup().
RBBINode userRuleRoot = tree;
if (fRB.fSetBuilder.sawBOF()) {
userRuleRoot = tree.fLeftChild.fRightChild;
}
Assert.assrt(userRuleRoot != null);
Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;
// Collect all leaf nodes that can start matches for rules
// with inbound chaining enabled, which is the union of the
// firstPosition sets from each of the rule root nodes.
// Iteratate over all leaf nodes,
List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
addRuleRootNodes(ruleRootNodes, tree);
Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
for (RBBINode node: ruleRootNodes) {
if (node.fChainIn) {
matchStartNodes.addAll(node.fFirstPosSet);
}
}
// Iterate over all leaf nodes,
//
for (RBBINode tNode : leafNodes) {
RBBINode endNode = null;

View File

@ -1158,6 +1158,51 @@ public class RuleBasedBreakIterator extends BreakIterator {
return eng;
}
private static final int kMaxLookaheads = 8;
/**
 * Small fixed-capacity map from an int key to a text position, stored in two
 * parallel arrays of kMaxLookaheads slots. Only the first fUsedSlotLimit
 * slots hold live entries.
 */
private static class LookAheadResults {
    int   fUsedSlotLimit;   // Number of slots currently in use.
    int[] fPositions;       // fPositions[i] is the position saved for fKeys[i].
    int[] fKeys;

    LookAheadResults() {
        fUsedSlotLimit = 0;
        fPositions = new int[kMaxLookaheads];
        fKeys = new int[kMaxLookaheads];
    }

    /**
     * Look up the position saved for a key.
     *
     * @param key the key to search for among the in-use slots.
     * @return the saved position, or -1 if the key has no entry
     *         (that case also trips an assertion when assertions are enabled).
     */
    int getPosition(int key) {
        int slot = 0;
        while (slot < fUsedSlotLimit) {
            if (fKeys[slot] == key) {
                return fPositions[slot];
            }
            slot++;
        }
        assert(false);
        return -1;
    }

    /**
     * Save a position for a key, overwriting the existing entry for that key
     * if there is one and otherwise claiming the next free slot.
     *
     * @param key      the key to store under.
     * @param position the position to remember for that key.
     */
    void setPosition(int key, int position) {
        // Find the slot already holding this key, or stop at the first unused slot.
        int slot = 0;
        while (slot < fUsedSlotLimit && fKeys[slot] != key) {
            slot++;
        }
        if (slot < fUsedSlotLimit) {
            fPositions[slot] = position;
            return;
        }

        // No existing entry. If the table is full, flag it (when assertions
        // are enabled) and fall back to reusing the final slot.
        if (slot >= kMaxLookaheads) {
            assert(false);
            slot = kMaxLookaheads - 1;
        }
        fKeys[slot] = key;
        fPositions[slot] = position;
        assert(fUsedSlotLimit == slot);
        fUsedSlotLimit = slot + 1;
    }

    /** Discard all entries; the backing arrays are kept and reused. */
    void reset() {
        fUsedSlotLimit = 0;
    }
}
private LookAheadResults fLookAheadMatches = new LookAheadResults();
/**
@ -1214,9 +1259,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
}
int lookaheadStatus = 0;
int lookaheadTagIdx = 0;
int lookaheadResult = 0;
fLookAheadMatches.reset();
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
@ -1226,16 +1269,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
if (lookaheadResult > result) {
// We ran off the end of the string with a pending
// look-ahead match.
// Treat this as if the look-ahead condition had been
// met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
}
break;
}
// Run the loop one last time with the fake end-of-input character category
@ -1299,40 +1332,30 @@ public class RuleBasedBreakIterator extends BreakIterator {
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
}
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
if (lookaheadStatus != 0
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
// TODO: make a standalone hard break in a rule work.
if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
text.setIndex(result);
return result;
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
text.setIndex(lookaheadResult);
return lookaheadResult;
}
// Look-ahead completed, but other rules may match further. Continue on.
// TODO: junk this feature? I don't think it's used anywhere.
continue;
}
lookaheadResult = text.getIndex();
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
// We want the beginning of it.
lookaheadResult--;
pos--;
}
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
continue;
fLookAheadMatches.setPosition(rule, pos);
}
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relevant. Clear out the pending lookahead status.
lookaheadStatus = 0;
}
} // End of state machine main loop
// The state machine is done. Check whether it found a match...
@ -1371,12 +1394,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
int mode;
int row;
int c;
int lookaheadStatus = 0;
int result = 0;
int initialPosition = 0;
int lookaheadResult = 0;
boolean lookAheadHardBreak =
(fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
fLookAheadMatches.reset();
// handlePrevious() never gets the rule status.
// Flag the status as invalid; if the user ever asks for status, we will need
@ -1407,7 +1427,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
// loop until we reach the beginning of the text or transition to state 0
//
mainLoop: for (;;) {
innerBlock: {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
@ -1415,13 +1434,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// does not support explicit support for matching {eof}, or
// we have already done the {eof} iteration. Now is the time
// to unconditionally bail out.
if (lookaheadResult < result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
lookaheadStatus = 0;
} else if (result == initialPosition) {
if (result == initialPosition) {
// Ran off start, no match found.
// Move one position (towards the start, since we are doing previous.)
fText.setIndex(initialPosition);
@ -1473,50 +1486,23 @@ public class RuleBasedBreakIterator extends BreakIterator {
result = fText.getIndex();
}
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
if (lookaheadStatus != 0
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
// Lookahead match is completed. Set the result
// accordingly, but only
// if no other rule has matched further in the mean
// time.
result = lookaheadResult;
lookaheadStatus = 0;
// TODO: make a stand-alone hard break in a rule work.
if (lookAheadHardBreak) {
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
// Lookahead match is completed.
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
result = lookaheadResult;
break mainLoop;
}
// Look-ahead completed, but other rules may match further.
// Continue on.
// TODO: junk this feature? I don't think that it's used anywhere.
break innerBlock;
}
// Hit a possible look-ahead match. We are at the
// position of the '/'. Remember this position.
lookaheadResult = fText.getIndex();
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
break innerBlock;
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = fText.getIndex();
fLookAheadMatches.setPosition(rule, pos);
}
// not lookahead...
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
// This is a plain (non-look-ahead) accepting state.
if (!lookAheadHardBreak) {
// Clear out any pending look-ahead matches,
// but only if not doing the lookAheadHardBreak option
// which needs to force a break no matter what is going
// on with the rest of the match, i.e. we can't abandon
// a partially completed look-ahead match because
// some other rule matched further than the '/' position
// in the look-ahead match.
lookaheadStatus = 0;
}
}
} // end of innerBlock. "break innerBlock" in above code comes out here.
if (state == STOP_STATE) {
// Normal loop exit is here
break mainLoop;

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:848a445cb828689cd5bca20bfd321db5503ef66c0a94d929fc108a28d0c5595f
size 11754757
oid sha256:eb9182edec08706f02236909aaefcbf4c98d29d6415d1a8801633233c74f03fb
size 11789631

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a75dfbe25f7671a65bb933aed49a71eb9a923767687625982603c54860478ce7
oid sha256:cefefda6f12f61e7dcd7767a7b07b0fea3ca53c2a9b1524f3627e94cad6f3ee0
size 90259

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2003-2015 International Business Machines Corporation and
* Copyright (C) 2003-2016 International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -88,6 +88,11 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fLVTSet;
UnicodeSet fHangulSet;
UnicodeSet fAnySet;
UnicodeSet fEmojiModifierSet;
UnicodeSet fEmojiBaseSet;
UnicodeSet fZWJSet;
UnicodeSet fGAZSet;
StringBuffer fText;
@ -96,8 +101,8 @@ public class RBBITestMonkey extends TestFmwk {
fText = null;
fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
fCRLFSet = new UnicodeSet("[\\r\\n]");
fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
fControlSet = new UnicodeSet("[[\\p{Grapheme_Cluster_Break = Control}-[:Block=Tags:]]]");
fExtendSet = new UnicodeSet("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]");
fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
@ -115,6 +120,17 @@ public class RBBITestMonkey extends TestFmwk {
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
fEmojiBaseSet = new UnicodeSet(
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]");
fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
fZWJSet = new UnicodeSet(0x200D, 0x200D);
fGAZSet = new UnicodeSet("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]");
fSets = new ArrayList();
fSets.add(fCRLFSet);
fSets.add(fControlSet);
@ -126,6 +142,10 @@ public class RBBITestMonkey extends TestFmwk {
fSets.add(fSpacingSet);
fSets.add(fHangulSet);
fSets.add(fAnySet);
fSets.add(fEmojiBaseSet);
fSets.add(fEmojiModifierSet);
fSets.add(fZWJSet);
fSets.add(fGAZSet);
}
@ -138,25 +158,26 @@ public class RBBITestMonkey extends TestFmwk {
}
int next(int prevPos) {
int p1, p2, p3; // Indices of the significant code points around the
int p0, p1, p2, p3; // Indices of the significant code points around the
// break position being tested. The candidate break
// location is before p2.
int breakPos = -1;
int c1, c2, c3; // The code points at p0, p1, p2 & p3.
int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
// Previous break at end of string. return DONE.
if (prevPos >= fText.length()) {
return -1;
}
p1 = p2 = p3 = prevPos;
p0 = p1 = p2 = p3 = prevPos;
c3 = UTF16.charAt(fText, prevPos);
c1 = c2 = 0;
c0 = c1 = c2 = 0;
// Loop runs once per "significant" character position in the input text.
for (;;) {
// Move all of the positions forward in the input string.
p0 = p1; c0 = c1;
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
@ -219,12 +240,21 @@ public class RBBITestMonkey extends TestFmwk {
}
// Rule (GB8a) Regional_Indicator x Regional_Indicator
// Note: The first if condition is a little tricky. We only need to force
// a break when there are three or more contiguous RIs. When there are
// only two, the break that follows is produced by the other rules, and it
// includes any trailing Extend characters, which is the required behavior.
if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
&& fRegionalIndicatorSet.contains(c2)) {
break;
}
if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
continue;
}
// Rule (GB9) Numeric x ALetter
if (fExtendSet.contains(c2)) {
// Rule (GB9) x Extend
if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) {
continue;
}
@ -237,6 +267,15 @@ public class RBBITestMonkey extends TestFmwk {
if (fPrependSet.contains(c1)) {
continue;
}
// Rule (GB9c) (Emoji_Base | Glue_After_Zwj) x Emoji_Modifier
if ((fEmojiBaseSet.contains(c1) || fGAZSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
continue;
}
// Rule (GB9d) ZWJ x Glue_After_Zwj
if (fZWJSet.contains(c1) && fGAZSet.contains(c2)) {
continue;
}
// Rule (GB10) Any <break> Any
break;
@ -277,6 +316,10 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fExtendNumLetSet;
UnicodeSet fOtherSet;
UnicodeSet fDictionaryCjkSet;
UnicodeSet fEBaseSet;
UnicodeSet fEModifierSet;
UnicodeSet fZWSSet;
UnicodeSet fGAZSet;
RBBIWordMonkey() {
@ -300,6 +343,16 @@ public class RBBITestMonkey extends TestFmwk {
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
fEBaseSet = new UnicodeSet(
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]");
fEModifierSet = new UnicodeSet("[\\U0001F3FB-\\U0001F3FF]");
fZWSSet = new UnicodeSet(0x200D, 0x200D);
fGAZSet = new UnicodeSet("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]");
fExtendSet.removeAll(fZWSSet);
fOtherSet = new UnicodeSet();
fOtherSet.complement();
@ -318,6 +371,11 @@ public class RBBITestMonkey extends TestFmwk {
fOtherSet.removeAll(fExtendSet);
fOtherSet.removeAll(fExtendNumLetSet);
fOtherSet.removeAll(fRegionalIndicatorSet);
fOtherSet.removeAll(fEBaseSet);
fOtherSet.removeAll(fEModifierSet);
fOtherSet.removeAll(fZWSSet);
fOtherSet.removeAll(fGAZSet);
// Inhibit dictionary characters from being tested at all.
// remove surrogates so as to not generate higher CJK characters
fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
@ -390,7 +448,7 @@ public class RBBITestMonkey extends TestFmwk {
break;
}
}
while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWSSet, c3));
if (p1 == p2) {
// Still warming up the loop. (won't work with zero length strings, but we don't care)
@ -418,6 +476,13 @@ public class RBBITestMonkey extends TestFmwk {
break;
}
// Rule (3c) ZWJ x GAZ (Glue after ZWJ).
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
if (fZWSSet.contains(fText.codePointBefore(p2)) && fGAZSet.contains(c2)) {
continue;
}
// Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
(fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
@ -509,10 +574,18 @@ public class RBBITestMonkey extends TestFmwk {
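// Rule 13c Do not break between Regional Indicators, except that a break
// is forced once two consecutive Regional Indicators (c0, c1) have been seen.
// Regional_Indicator Regional_Indicator <break>
// Regional_Indicator × Regional_Indicator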
if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
break;
}
if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
continue;
}
// Rule 13d (E_Base | Glue_After_Zwj) x E_Modifier
if ((fEBaseSet.contains(c1) || fGAZSet.contains(c1)) && fEModifierSet.contains(c2)) {
continue;
}
// Rule 14. Break found here.
break;
}
@ -570,8 +643,10 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fJV;
UnicodeSet fJT;
UnicodeSet fRI;
UnicodeSet fSA;
UnicodeSet fXX;
UnicodeSet fEB;
UnicodeSet fEM;
UnicodeSet fZJ;
StringBuffer fText;
int fOrigPositions;
@ -621,23 +696,33 @@ public class RBBITestMonkey extends TestFmwk {
fJV = new UnicodeSet("[\\p{Line_break=JV}]");
fJT = new UnicodeSet("[\\p{Line_break=JT}]");
fRI = new UnicodeSet("[\\p{Line_break=RI}]");
fSA = new UnicodeSet("[\\p{Line_break=SA}]");
fXX = new UnicodeSet("[\\p{Line_break=XX}]");
fEB = new UnicodeSet(
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
+ "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
+ "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
+ "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]");
fEM = new UnicodeSet("[\\U0001F3FB-\\U0001F3FF]");
fZJ = new UnicodeSet(0x200D, 0x200D);
// Remove dictionary characters.
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
// so dictionary characters are omitted from the monkey test data.
UnicodeSet dictionarySet = new UnicodeSet(
"[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
fSA.removeAll(dictionarySet);
fAL.addAll(fXX); // Default behavior for XX is identical to AL
fAL.addAll(fAI); // Default behavior for AI is identical to AL
fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
fNS.addAll(fCJ); // Default behavior for CJ is identical to NS.
fID.addAll(fEB); // Emoji Base and Emoji Modifier behave as ID.
fID.addAll(fEM);
fAL.removeAll(fEM);
fAL.remove(0x2764); // Emoji Proposal: move u2764 from AL to ID
fID.add(0x2764);
fSets.add(fBK);
fSets.add(fCR);
fSets.add(fLF);
@ -674,9 +759,12 @@ public class RBBITestMonkey extends TestFmwk {
fSets.add(fHL);
fSets.add(fID);
fSets.add(fWJ);
fSets.add(fSA);
fSets.add(fSG);
fSets.add(fRI);
fSets.add(fSG);
fSets.add(fEB);
fSets.add(fEM);
fSets.add(fZJ);
}
void setText(StringBuffer s) {
@ -810,6 +898,17 @@ public class RBBITestMonkey extends TestFmwk {
break;
}
// LB 8a ZJ x ID
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
{
int prevC = fText.codePointBefore(pos);
if (fZJ.contains(prevC) && fID.contains(thisChar)) {
continue;
}
}
// LB 9, 10 Already done, at top of loop.
//
@ -1061,11 +1160,20 @@ public class RBBITestMonkey extends TestFmwk {
continue;
}
// LB 30a Do not break between regional indicators. RI × RI
// LB 30a Break between pairs of Regional Indicators.
// RI RI <break> RI
// RI x RI
if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
break;
}
if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
continue;
}
// LB30b Emoji Base x Emoji Modifier
if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
continue;
}
// LB 31 Break everywhere else
break;
}