ICU-2924 Line break update - fix more monkey failures, getting closer.

X-SVN-Rev: 13397
This commit is contained in:
Andy Heninger 2003-10-13 06:01:21 +00:00
parent a93d362161
commit ccba9cce88
3 changed files with 120 additions and 77 deletions

View File

@ -350,7 +350,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
//
int32_t endNodeIx;
int32_t startNodeIx;
UVector endingNodes(*fStatus);
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
@ -370,37 +369,19 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
}
// We've got a node that can end a match.
// TODO: endingNodes.addElement(endNode, *fStatus);
// Line Break Specific hack. Does this end val correspond to the $CM char class?
// And is it part of a rule of this form: $XX $CM*
// If so, we want to chain to rules beginning with $XX, not with $CM.
// We still chain from the CM node, but the criteria for choosing
// the nodes to chain to is different.
// Line Break Specific hack: If this node's val corresponds to the $CM char class,
// don't chain from it.
// TODO: Add rule syntax for this behavior, get specifics out of here and
// into the rule file.
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
U_ASSERT(c != -1);
RBBINode *parent = NULL;
RBBINode *grandParent = NULL;
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
if (cLBProp != U_LB_COMBINING_MARK) {
goto neverMind;
}
parent = endNode->fParent;
if (parent->fType != RBBINode::opStar) {
goto neverMind;
}
grandParent = parent->fParent;
if (grandParent->fType != RBBINode::opCat || grandParent->fRightChild != parent) {
goto neverMind;
if (cLBProp == U_LB_COMBINING_MARK) {
continue;
}
// TODO: grab nodes from grandParent->leftChild->endPos; add to endingNodes
neverMind:
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;

View File

@ -78,19 +78,46 @@ $QUcm = $QU $CM*;
$SPcm = $SP $CM*;
$SYcm = $SY $CM*;
#
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
#
$ALPlus $CM+;
$BA $CM+;
$BB $CM+;
$B2 $CM+;
$CL $CM+;
$EX $CM+;
$GL $CM+;
$HY $CM+;
$ID $CM+;
$IN $CM+;
$IS $CM+;
$NS $CM+;
$NU $CM+;
$OP $CM+;
$PO $CM+;
$PR $CM+;
$QU $CM+;
$SP $CM+;
$SY $CM+;
#
# Rule LB 3
$LB3NonBreaks = [^$BK $CR $LF $NL];
$LB3NonBreaks? ($BK | $CR | $LF | $NL){100};
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
$LB3NonBreaks? ($BK | $CR | $LF | $NL){100};
$LB5NonBreaks $CM* ($BK | $CR | $LF | $NL){100};
$CR $LF {100};
# LB 4 x SP
# x ZW
$LB3NonBreaks [$SP $ZW];
$LB3NonBreaks [$SP $ZW];
$LB5NonBreaks $CM* [$SP $ZW];
# LB 5 Break after zero width space
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
# LB 7 Combining marks. TODO: get it right!
# $SP $CM needs to behave like $ID.
@ -99,10 +126,10 @@ $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules.
# LB 8
[$LB5NonBreaks] $CL;
[$LB5NonBreaks] $EX;
[$LB5NonBreaks] $IS;
[$LB5NonBreaks] $SY;
[$LB5NonBreaks] $CM* $CL;
[$LB5NonBreaks] $CM* $EX;
[$LB5NonBreaks] $CM* $IS;
[$LB5NonBreaks] $CM* $SY;
# LB 9
$OPcm $SP* .?;
@ -118,8 +145,8 @@ $CLcm $SP* $NScm;
($B2cm)+;
# LB 11b
$LB5NonBreaks $GLcm .?;
$LB5NonBreaks $GLcm [$LB5NonBreaks] $CM*;
$LB5NonBreaks $CM* $GLcm .?;
$LB5NonBreaks $CM* $GLcm [$LB5NonBreaks] $CM*;
$GLcm $LB3NonBreaks?;
$GLcm [$LB5NonBreaks] $CM*;
@ -127,35 +154,48 @@ $GLcm [$LB5NonBreaks] $CM*;
$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
# LB 14
$LB12NonBreaks $QUcm+ .?;
$LB12NonBreaks $QUcm+ [$LB5NonBreaks] $CM*;
$LB12NonBreaks $CM* $QUcm+ .?;
$LB12NonBreaks $CM* $QUcm+ [$LB5NonBreaks] $CM*;
$SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID
$SP $CM+ $QUcm+ [$LB5NonBreaks] $CM*;
$QUcm $LB3NonBreaks?;
$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# LB 14a
$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
# LB 15
$LB14NonBreaks ($BAcm | $HYcm | $NScm);
$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
$BBcm [^$CB];
$BBcm [^$CB $CR $LF $BK $NL $ZW] CM*;
# LB 16
($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
#($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
$ALcm $INcm;
$CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
$IDcm $INcm;
$SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID
$INcm $INcm;
$NUcm $INcm;
# LB 17
($IDcm | SP CM+) $POcm;
($IDcm | $SP $CM+) $POcm;
$ALcm+ $NUcm; # includes $LB19
$CM+ $NUcm; # Rule 7c
$NUcm $ALcm+;
# LB 18
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?;
# LB 19
#$CM* $ALcm+;
$ALcm+;
$CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
#
# Reverse Rules.

View File

@ -2407,6 +2407,8 @@ private:
int32_t *fOrigPositions;
RegexMatcher *fNumberMatcher;
RegexMatcher *fLB10Matcher;
RegexMatcher *fLB11Matcher;
};
@ -2495,6 +2497,18 @@ RBBILineMonkey::RBBILineMonkey()
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
0, status);
fLB10Matcher = new RegexMatcher(
"\\p{Line_Break=QU}\\p{Line_Break=CM}*"
"\\p{Line_Break=SP}*"
"(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
0, status);
fLB11Matcher = new RegexMatcher(
"\\p{Line_Break=CL}\\p{Line_Break=CM}*"
"\\p{Line_Break=SP}*"
"(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
0, status);
fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
if (U_FAILURE(status)) {
@ -2637,6 +2651,30 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// LB 10 QU SP* x OP
UnicodeString subStr10(*fText, prevPos);
fLB10Matcher->reset(subStr10);
status = U_ZERO_ERROR;
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
// TODO: Check status codes
pos = prevPos + fLB10Matcher->start(1, status);
nextPos = prevPos + fLB10Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
// LB 11 CL SP* x NS
UnicodeString subStr11(*fText, prevPos);
fLB11Matcher->reset(subStr11);
status = U_ZERO_ERROR;
if (fLB11Matcher->lookingAt(status)) { // /CL CM* SP* (NS) CM*/;
// TODO: Check status codes
pos = prevPos + fLB11Matcher->start(1, status);
nextPos = prevPos + fLB11Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
// LB 4 Don't break before spaces or zero-width space.
if (fSP->contains(thisChar)) {
continue;
@ -2652,6 +2690,9 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// LB 6, LB 7
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
UChar32 c = fText->char32At(nextPos);
rule67Adjust(pos, &thisChar, &nextPos, &c);
@ -2662,6 +2703,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// Re-apply rules 3c, 4 because these could be affected by having
// a new thisChar from doing rule 6 or 7.
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c
fBK->contains(thisChar)) {
continue;
}
if (fSP->contains(thisChar)) { // LB 4
continue;
}
if (fZW->contains(thisChar)) { // LB 4
continue;
}
// LB 8 Don't break before closings.
// NU x CL and NU x IS are not matched here so that they will
// fall into LB 17 and the more general number regular expression.
@ -2690,41 +2745,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
fall_through_9:
// LB 10 QU SP* x OP
if (fOP->contains(thisChar)) {
tPos = prevPos;
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
if (fOP->contains(fText->char32At(tPos))) {
break;
}
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
goto fall_through_10;
}
}
// We match QU SP* x OP
// No break at this position.
// Continue the outer loop.
continue;
}
fall_through_10:
// LB 11 CL SP* x NS
if (fNS->contains(thisChar)) {
tPos = prevPos;
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
if (fCL->contains(fText->char32At(tPos))) {
break;
}
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
goto fall_through_11;
}
}
// We match CL SP* x NS
// No break at this position.
// Continue the outer loop.
continue;
}
fall_through_11:
// LB 11a B2 x B2
if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
@ -2783,8 +2803,8 @@ fall_through_11:
}
// LB 18 Numbers
UnicodeString subStr(*fText, prevPos);
fNumberMatcher->reset(subStr);
UnicodeString subStr18(*fText, prevPos);
fNumberMatcher->reset(subStr18);
if (fNumberMatcher->lookingAt(status)) {
// TODO: Check status codes
// Matched a number. But could have been just a single digit, which would
@ -2864,6 +2884,8 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fCharBI;
delete fNumberMatcher;
delete fLB10Matcher;
delete fLB11Matcher;
}