ICU-2924 Line break update - fix more monkey failures, getting closer.
X-SVN-Rev: 13397
This commit is contained in:
parent
a93d362161
commit
ccba9cce88
@ -350,7 +350,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
|
||||
//
|
||||
int32_t endNodeIx;
|
||||
int32_t startNodeIx;
|
||||
UVector endingNodes(*fStatus);
|
||||
|
||||
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
|
||||
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
|
||||
@ -370,37 +369,19 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
|
||||
}
|
||||
|
||||
// We've got a node that can end a match.
|
||||
// TODO: endingNodes.addElement(endNode, *fStatus);
|
||||
|
||||
// Line Break Specific hack. Does this end val correspond to the $CM char class?
|
||||
// And is it part of a rule of this form: $XX $CM*
|
||||
// If so, we want to chain to rules beginning with $XX, not with $CM.
|
||||
// We still chain from the CM node, but the criteria for choosing
|
||||
// the nodes to chain to is different.
|
||||
// Line Break Specific hack: If this node's val corresponds to the $CM char class,
|
||||
// don't chain from it.
|
||||
// TODO: Add rule syntax for this behavior, get specifics out of here and
|
||||
// into the rule file.
|
||||
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
|
||||
U_ASSERT(c != -1);
|
||||
RBBINode *parent = NULL;
|
||||
RBBINode *grandParent = NULL;
|
||||
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
|
||||
if (cLBProp != U_LB_COMBINING_MARK) {
|
||||
goto neverMind;
|
||||
}
|
||||
parent = endNode->fParent;
|
||||
if (parent->fType != RBBINode::opStar) {
|
||||
goto neverMind;
|
||||
}
|
||||
grandParent = parent->fParent;
|
||||
if (grandParent->fType != RBBINode::opCat || grandParent->fRightChild != parent) {
|
||||
goto neverMind;
|
||||
if (cLBProp == U_LB_COMBINING_MARK) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// TODO: grab nodes from grandParent->leftChild->endPos; add to endingNodes
|
||||
|
||||
|
||||
neverMind:
|
||||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
RBBINode *startNode;
|
||||
|
@ -78,19 +78,46 @@ $QUcm = $QU $CM*;
|
||||
$SPcm = $SP $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HY $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$NS $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$SP $CM+;
|
||||
$SY $CM+;
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 3
|
||||
$LB3NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB3NonBreaks? ($BK | $CR | $LF | $NL){100};
|
||||
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
||||
|
||||
$LB3NonBreaks? ($BK | $CR | $LF | $NL){100};
|
||||
$LB5NonBreaks $CM* ($BK | $CR | $LF | $NL){100};
|
||||
$CR $LF {100};
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
$LB3NonBreaks [$SP $ZW];
|
||||
$LB3NonBreaks [$SP $ZW];
|
||||
$LB5NonBreaks $CM* [$SP $ZW];
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
||||
|
||||
|
||||
# LB 7 Combining marks. TODO: get it right!
|
||||
# $SP $CM needs to behave like $ID.
|
||||
@ -99,10 +126,10 @@ $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
||||
[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
|
||||
# LB 8
|
||||
[$LB5NonBreaks] $CL;
|
||||
[$LB5NonBreaks] $EX;
|
||||
[$LB5NonBreaks] $IS;
|
||||
[$LB5NonBreaks] $SY;
|
||||
[$LB5NonBreaks] $CM* $CL;
|
||||
[$LB5NonBreaks] $CM* $EX;
|
||||
[$LB5NonBreaks] $CM* $IS;
|
||||
[$LB5NonBreaks] $CM* $SY;
|
||||
|
||||
# LB 9
|
||||
$OPcm $SP* .?;
|
||||
@ -118,8 +145,8 @@ $CLcm $SP* $NScm;
|
||||
($B2cm)+;
|
||||
|
||||
# LB 11b
|
||||
$LB5NonBreaks $GLcm .?;
|
||||
$LB5NonBreaks $GLcm [$LB5NonBreaks] $CM*;
|
||||
$LB5NonBreaks $CM* $GLcm .?;
|
||||
$LB5NonBreaks $CM* $GLcm [$LB5NonBreaks] $CM*;
|
||||
$GLcm $LB3NonBreaks?;
|
||||
$GLcm [$LB5NonBreaks] $CM*;
|
||||
|
||||
@ -127,35 +154,48 @@ $GLcm [$LB5NonBreaks] $CM*;
|
||||
$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
|
||||
|
||||
# LB 14
|
||||
$LB12NonBreaks $QUcm+ .?;
|
||||
$LB12NonBreaks $QUcm+ [$LB5NonBreaks] $CM*;
|
||||
$LB12NonBreaks $CM* $QUcm+ .?;
|
||||
$LB12NonBreaks $CM* $QUcm+ [$LB5NonBreaks] $CM*;
|
||||
$SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID
|
||||
$SP $CM+ $QUcm+ [$LB5NonBreaks] $CM*;
|
||||
|
||||
$QUcm $LB3NonBreaks?;
|
||||
$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
|
||||
|
||||
# LB 14a
|
||||
$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
|
||||
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
|
||||
|
||||
|
||||
# LB 15
|
||||
$LB14NonBreaks ($BAcm | $HYcm | $NScm);
|
||||
$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
|
||||
$BBcm [^$CB];
|
||||
$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;    # BB x : keep any trailing combining marks attached to the following char
|
||||
|
||||
# LB 16
|
||||
($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
|
||||
#($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
|
||||
$ALcm $INcm;
|
||||
$CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
|
||||
$IDcm $INcm;
|
||||
$SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# $LB 17
|
||||
($IDcm | SP CM+) $POcm;
|
||||
($IDcm | $SP $CM+) $POcm;
|
||||
$ALcm+ $NUcm; # includes $LB19
|
||||
$CM+ $NUcm; # Rule 7c
|
||||
$NUcm $ALcm+;
|
||||
|
||||
|
||||
|
||||
# LB 18
|
||||
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
|
||||
#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?;
|
||||
|
||||
# LB 19
|
||||
#$CM* $ALcm+;
|
||||
$ALcm+;
|
||||
$CM* $ALcm+; # The $CM* is from rule 7C: an unattached CM is treated as AL
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -2407,6 +2407,8 @@ private:
|
||||
int32_t *fOrigPositions;
|
||||
|
||||
RegexMatcher *fNumberMatcher;
|
||||
RegexMatcher *fLB10Matcher;
|
||||
RegexMatcher *fLB11Matcher;
|
||||
};
|
||||
|
||||
|
||||
@ -2495,6 +2497,18 @@ RBBILineMonkey::RBBILineMonkey()
|
||||
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
|
||||
0, status);
|
||||
|
||||
fLB10Matcher = new RegexMatcher(
|
||||
"\\p{Line_Break=QU}\\p{Line_Break=CM}*"
|
||||
"\\p{Line_Break=SP}*"
|
||||
"(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
|
||||
0, status);
|
||||
|
||||
fLB11Matcher = new RegexMatcher(
|
||||
"\\p{Line_Break=CL}\\p{Line_Break=CM}*"
|
||||
"\\p{Line_Break=SP}*"
|
||||
"(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
|
||||
0, status);
|
||||
|
||||
fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
@ -2637,6 +2651,30 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 10 QU SP* x OP
|
||||
UnicodeString subStr10(*fText, prevPos);
|
||||
fLB10Matcher->reset(subStr10);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB10Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB10Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 11 CL SP* x NS
|
||||
UnicodeString subStr11(*fText, prevPos);
|
||||
fLB11Matcher->reset(subStr11);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB11Matcher->lookingAt(status)) { // /CL CM* SP* (NS) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB11Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB11Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 4 Don't break before spaces or zero-width space.
|
||||
if (fSP->contains(thisChar)) {
|
||||
continue;
|
||||
@ -2652,6 +2690,9 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
|
||||
// LB 6, LB 7
|
||||
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
|
||||
|
||||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = nextCPPos;
|
||||
UChar32 c = fText->char32At(nextPos);
|
||||
rule67Adjust(pos, &thisChar, &nextPos, &c);
|
||||
|
||||
@ -2662,6 +2703,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Re-apply rules 3c, 4 because these could be affected by having
|
||||
// a new thisChar from doing rule 6 or 7.
|
||||
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c
|
||||
fBK->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fSP->contains(thisChar)) { // LB 4
|
||||
continue;
|
||||
}
|
||||
if (fZW->contains(thisChar)) { // LB 4
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 8 Don't break before closings.
|
||||
// NU x CL and NU x IS are not matched here so that they will
|
||||
// fall into LB 17 and the more general number regular expression.
|
||||
@ -2689,42 +2744,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
continue;
|
||||
|
||||
fall_through_9:
|
||||
|
||||
// LB 10 QU SP* x OP
|
||||
if (fOP->contains(thisChar)) {
|
||||
tPos = prevPos;
|
||||
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
|
||||
if (fOP->contains(fText->char32At(tPos))) {
|
||||
break;
|
||||
}
|
||||
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
|
||||
goto fall_through_10;
|
||||
}
|
||||
}
|
||||
// We match QU SP* x OP
|
||||
// No break at this position.
|
||||
// Continue the outer loop.
|
||||
continue;
|
||||
}
|
||||
fall_through_10:
|
||||
|
||||
// LB 11 CL SP* x NS
|
||||
if (fNS->contains(thisChar)) {
|
||||
tPos = prevPos;
|
||||
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
|
||||
if (fCL->contains(fText->char32At(tPos))) {
|
||||
break;
|
||||
}
|
||||
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
|
||||
goto fall_through_11;
|
||||
}
|
||||
}
|
||||
// We match CL SP* x NS
|
||||
// No break at this position.
|
||||
// Continue the outer loop.
|
||||
continue;
|
||||
}
|
||||
fall_through_11:
|
||||
|
||||
// LB 11a B2 x B2
|
||||
if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
|
||||
@ -2783,8 +2803,8 @@ fall_through_11:
|
||||
}
|
||||
|
||||
// LB 18 Numbers
|
||||
UnicodeString subStr(*fText, prevPos);
|
||||
fNumberMatcher->reset(subStr);
|
||||
UnicodeString subStr18(*fText, prevPos);
|
||||
fNumberMatcher->reset(subStr18);
|
||||
if (fNumberMatcher->lookingAt(status)) {
|
||||
// TODO: Check status codes
|
||||
// Matched a number. But could have been just a single digit, which would
|
||||
@ -2864,6 +2884,8 @@ RBBILineMonkey::~RBBILineMonkey() {
|
||||
|
||||
delete fCharBI;
|
||||
delete fNumberMatcher;
|
||||
delete fLB10Matcher;
|
||||
delete fLB11Matcher;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user