ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
This commit is contained in:
Syn Wee Quek 2003-11-05 23:50:39 +00:00
parent e7251a2b04
commit 469c2d5b76
10 changed files with 904 additions and 309 deletions

View File

@ -405,45 +405,51 @@ int32_t RuleBasedBreakIterator::previous(void) {
return BreakIterator::DONE;
}
// set things up. handlePrevious() will back us up to some valid
// break position before the current position (we back our internal
// iterator up one step to prevent handlePrevious() from returning
// the current position), but not necessarily the last one before
// where we started
int32_t start = current();
fText->previous32();
int32_t lastResult = handlePrevious();
int32_t result = lastResult;
int32_t lastTag = 0;
UBool breakTagValid = FALSE;
// iterate forward from the known break position until we pass our
// starting point. The last break position before the starting
// point is our return value
for (;;) {
result = handleNext();
if (result == BreakIterator::DONE || result >= start) {
break;
// set things up. handlePrevious() will back us up to a safe position
// before the current position to at most 2 breaks beyond. the
// backwards rules may occasionally move the position to less than a
// break beyond
int32_t safe = handlePrevious();
return safe;
/*** int32_t result = handleNext();
// moving forward to a boundary.
if (result < start) {
fLastBreakTag = 0; // for use by getRuleStatus()
fLastBreakTagValid = TRUE; // handlenext called
/// return lastResult;
return result;
}
lastResult = result;
lastTag = fLastBreakTag;
breakTagValid = TRUE;
else {
fText->setIndex(safe);
if (safe == fText->startIndex()) {
// if we are at the start of the text and result == start
// this means that we are already at the previous break
fLastBreakTag = 0; // for use by getRuleStatus()
fLastBreakTagValid = FALSE;
return safe;
}
}
***/
/// lastResult = result;
/// lastTag = fLastBreakTag;
/// breakTagValid = TRUE;
// fLastBreakTag wants to have the value for section of text preceding
// the result position that we are to return (in lastResult.) If
// the backwards rules overshot and the above loop had to do two or more
// handleNext()s to move up to the desired return position, we will have a valid
// tag value. But, if handlePrevious() took us to exactly the correct result position,
// we won't have a tag value for that position, which is only set by handleNext().
/// fText->setIndex(lastResult);
/// fLastBreakTag = lastTag; // for use by getRuleStatus()
/// fLastBreakTagValid = breakTagValid;
/// return lastResult;
}
// fLastBreakTag wants to have the value for section of text preceding
// the result position that we are to return (in lastResult.) If
// the backwards rules overshot and the above loop had to do two or more
// handleNext()s to move up to the desired return position, we will have a valid
// tag value. But, if handlePrevious() took us to exactly the correct result position,
// we won't have a tag value for that position, which is only set by handleNext().
// set the current iteration position to be the last break position
// before where we started, and then return that value
fText->setIndex(lastResult);
fLastBreakTag = lastTag; // for use by getRuleStatus()
fLastBreakTagValid = breakTagValid;
return lastResult;
}
@ -476,9 +482,11 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// otherwise, set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
fText->setIndex(offset);
if (offset == fText->startIndex())
return handleNext();
/// todo synwee
/// fText->setIndex(offset);
fText->setIndex(fText->startIndex());
/// if (offset == fText->startIndex())
/// return handleNext();
// otherwise, we have to sync up first. Use handlePrevious() to back
// us up to a known break position before the specified position (if
@ -488,7 +496,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// from here until we've passed the starting position. The position
// we stop on will be the first break position after the specified one.
int32_t result = previous();
int32_t result = fText->startIndex();/// previous();
while (result != BreakIterator::DONE && result <= offset) {
result = next();
}
@ -517,8 +525,17 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
fText->setIndex(offset);
return previous();
/// todo synwee
/// fText->setIndex(offset);
/// return previous();
int32_t result = fText->endIndex();
fText->setIndex(result);
while (result != BreakIterator::DONE && result >= offset) {
result = next();
}
return result;
}
/**
@ -679,6 +696,35 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
goto continueOn;
}
if (row->fAccepting != 0 && row->fLookAhead != 0) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
///
if (lookaheadResult >= result) {
// U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
fText->setIndex(lookaheadResult);
/// TODO: this is a simple hack since reverse rules only have simple
/// lookahead rules that we can definitely break out from.
/// we need to make the lookahead rules not chain eventually.
return result;
}
int32_t r = fText->getIndex();
if (r > result) {
///
result = r;
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
}
goto continueOn;
}
if (row->fAccepting == -1) {
// Match found, common case, no lookahead involved.
// (It's possible that some lookahead rule matched here also,
@ -695,24 +741,9 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
// TODO: handle case where there's a pending match from a different rule -
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
int32_t r = fText->getIndex();
if (r > result) {
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
}
goto continueOn;
}
if (row->fAccepting != 0 && row->fLookAhead != 0) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
if (lookaheadResult > result) {
U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0;
}
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
goto continueOn;
}
@ -722,7 +753,7 @@ continueOn:
// We have advanced through the string until it is certain that no
// longer match is possible, no matter what characters follow.
break;
}
}
}
// The state machine is done. Check whether it found a match...
@ -749,7 +780,9 @@ continueOn:
// handlePrevious()
//
// This method backs the iterator back up to a "safe position" in the text.
// This is a position that we know, without any context, must be a break position.
// This is a position that we know, without any context, may be any position
// not more than 2 breaks away. Occasionally, the position may be less than
// one break away.
// The various calling methods then iterate forward from this safe position to
// the appropriate position to return.
//
@ -760,18 +793,24 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
if (fText == NULL || fData == NULL) {
return 0;
}
// break tag is no longer valid after icu switched to exact backwards
// positioning.
fLastBreakTagValid = FALSE;
if (fData->fReverseTable == NULL) {
return fText->setToStart();
}
int32_t state = START_STATE;
int32_t state = START_STATE;
int32_t category;
int32_t lastCategory = 0;
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0;
int32_t lookaheadTag = 0;
UChar32 c = fText->current32();
int32_t lastCategory = 0;
UBool hasPassedStartText = !fText->hasPrevious();
UChar32 c = fText->previous32();
// previous character
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;//[] = {0, 0, 0, 0, 0};
int32_t lookaheadResult = 0;//[] = {0, 0, 0, 0, 0};
int32_t lookaheadTag = 0;//[] = {0, 0, 0, 0, 0};
int32_t lookaheadCount = 0;
RBBIStateTableRow *row;
row = (RBBIStateTableRow *)
@ -788,7 +827,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
// loop until we reach the beginning of the text or transition to state 0
for (;;) {
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
if (hasPassedStartText) {
// if we have already considered the start of the text
break;
}
@ -825,9 +866,39 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
goto continueOn;
}
if (row->fAccepting != 0 && row->fLookAhead != 0) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
if (row->fAccepting == lookaheadStatus) { ///lookaheadResult > 0 && lookaheadResult <= result) {
/// what on earth is this?
/// U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
fText->setIndex(lookaheadResult);
/// TODO: this is a simple hack since reverse rules only have simple
/// lookahead rules that we can definitely break out from.
/// we need to make the lookahead rules not chain eventually.
return result;
}
int32_t r = fText->getIndex();
if (r < result) {
result = r;
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
}
goto continueOn;
}
if (row->fAccepting == -1) {
// Match found, common case, no lookahead involved.
result = fText->getIndex();
/// added
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
lookaheadStatus = 0; // clear out any pending look-ahead matches.
goto continueOn;
}
@ -837,43 +908,32 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
// has unconditionally matched to this point.
// TODO: handle case where there's a pending match from a different rule
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
//
int32_t r = fText->getIndex();
if (r > result) {
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
}
goto continueOn;
}
if (row->fAccepting != 0 && row->fLookAhead != 0) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
if (lookaheadResult > result) {
U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0;
}
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
goto continueOn;
}
continueOn:
if (state == STOP_STATE) {
if (state == STOP_STATE) { /// && lookaheadStatus == 0) {
break;
}
// then advance one character backwards
hasPassedStartText = !fText->hasPrevious();
c = fText->previous32();
}
// Note: the result position isn't what is returned to the user by previous(),
// but where the implementation of previous() turns around and
// starts iterating forward again.
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
result = fText->startIndex();
}
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
/// if (hasPassedStartText) && row->fLookAhead != 0) {
/// return fText->setToStart();
/// return result;
/// }
fText->setIndex(result);
return result;

View File

@ -36,10 +36,11 @@ $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
#
$CR $LF;
([^$Control] | $HangulSyllable) $Extend*;
.;
#
# Reverse Rule, back up to the beginning of some preceding grapheme cluster.
#
! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)?$L* | .);
$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
$BackOneCluster = ($LF $CR) | ($Extend* ([^$Control] | $BackHangulSyllable));
! $BackOneCluster;

View File

@ -12,7 +12,7 @@
# Character Classes defined by TR 14.
#
!!chain ;
!!chain;
!!LBCMNoChain;
$AI = [:LineBreak = Ambiguous:];
@ -136,17 +136,17 @@ $LB5NonBreaks $CM* [$SP $ZW];
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# $CM not covered by the above needs to behave like $AL
[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules.
$LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules.
# LB 8
[$LB5NonBreaks] $CM* $CL;
[$LB5NonBreaks] $CM* $EX;
[$LB5NonBreaks] $CM* $IS;
[$LB5NonBreaks] $CM* $SY;
$LB5NonBreaks $CM* $CL;
$LB5NonBreaks $CM* $EX;
$LB5NonBreaks $CM* $IS;
$LB5NonBreaks $CM* $SY;
# LB 9
$OPcm $SP* .?;
$OPcm $SP* [$LB5NonBreaks] $CM*;
$OPcm $SP* $LB5NonBreaks $CM*;
# LB 10
$QUcm $SP* $OPcm;
@ -159,24 +159,24 @@ $CLcm $SP* $NScm;
# LB 11b
$LB5NonBreaks $CM* $GLcm .?;
$LB5NonBreaks $CM* $GLcm [$LB5NonBreaks] $CM*;
$LB5NonBreaks $CM* $GLcm $LB5NonBreaks $CM*;
$GLcm $LB3NonBreaks?;
$GLcm [$LB5NonBreaks] $CM*;
$GLcm $LB5NonBreaks $CM*;
# LB 12
$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
$LB12NonBreaks = [$LB5NonBreaks - $SP];
# LB 14
$LB12NonBreaks $CM* $QUcm+ .?;
$LB12NonBreaks $CM* $QUcm+ [$LB5NonBreaks] $CM*;
$LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*;
$SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID
$SP $CM+ $QUcm+ [$LB5NonBreaks] $CM*;
$SP $CM+ $QUcm+ $LB5NonBreaks $CM*;
$QUcm $LB3NonBreaks?;
$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
$QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# LB 14a
$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
$LB14NonBreaks = [$LB12NonBreaks - $CB];
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
@ -216,6 +216,112 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
# Note that the initial .. is to back over both halves of a CR/LF sequence
# at the current position.
#
!!reverse;
!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
#!.*;
# !. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
! $CM+ $ALPlus;
! $CM+ $BA;
! $CM+ $BB;
! $CM+ $B2;
! $CM+ $CL;
! $CM+ $EX;
! $CM+ $GL;
! $CM+ $HY;
! $CM+ $ID;
! $CM+ $IN;
! $CM+ $IS;
! $CM+ $NS;
! $CM+ $NU;
! $CM+ $OP;
! $CM+ $PO;
! $CM+ $PR;
! $CM+ $QU;
! $CM+ $SP;
! $CM+ $SY;
# LB 3
! ($BK | $CR | $LF | $NL) $LB3NonBreaks?;
! ($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
! $LF $CR;
# LB 4 x SP
# x ZW
! [$SP $ZW] $LB3NonBreaks;
! [$SP $ZW] $CM* $LB5NonBreaks;
# LB 5 Break after zero width space
# LB 7 Combining marks. TODO: get it right!
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# $CM not covered by the above needs to behave like $AL
! $CM+ $LB5NonBreaks; # Stick together any combining sequences that don't match other rules.
# LB 8
! $CL $CM* $LB5NonBreaks;
! $EX $CM* $LB5NonBreaks;
! $IS $CM* $LB5NonBreaks;
! $SY $CM* $LB5NonBreaks;
# LB 9
! .? $SP* $CM* $OP;
! $CM* $LB5NonBreaks $SP* $CM* $OP;
# LB 10
! $CM* $OP $SP* $CM* $QU;
# LB 11
! $CM* $NS $SP* $CM* $CL;
# LB 11a
! ($CM* $B2)+;
# LB 11b
! .? $CM* $GL $CM* $LB5NonBreaks;
! $CM* $LB5NonBreaks $CM* $GL $CM* $LB5NonBreaks;
! $LB3NonBreaks? $CM* $GL;
! $CM* $LB5NonBreaks $CM* $GL;
# LB 12
# LB 14
! .? ($CM* $QU)+ $CM* $LB12NonBreaks;
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM* $LB12NonBreaks;
! .? ($CM* $QU)+ $CM+ $SP; # LB7a SP CM+ behaves as ID
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM+ $SP;
! $LB3NonBreaks? $CM* $QU;
! $CM* $LB5NonBreaks $CM* $QU; # Don't let a combining mark go onto $CR, $BK, etc.
# LB 14a
$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
# LB 15
! ($CM* $BA | $CM* $HY | $CM* $NS) $BackLB14CanBreakAfter;
! ($CM* $BA | $CM* $HY | $CM* $NS) $CM+ / [$BK $CR $LF $NL $ZW];
! [^$CB] $CM* $BB;
! $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
# LB 16
! $CM* $IN $CM* $ALPlus;
! $CM* $IN $CM+ / [$BK $CR $LF $NL $ZW]; # by rule 7c, any otherwise unattached CM behaves as AL
! $CM* $IN $CM* $ID;
! $CM* $IN $CM+ $SP; # by rule 7a, $SP $CM behaves like ID
! $CM* $IN $CM* $IN;
! $CM* $IN $CM* $NU;
# $LB 17
! $CM* $PO ($CM* $ID | $CM+ $SP);
! $CM* $NU ($CM* $ALPlus)+; # includes $LB19
! ($CM* $NU)+;
! ($CM* $NU)+ $CM+ / [$BK $CR $LF $NL $ZW]; # Rule 7c
! ($CM* $ALPlus)+ $CM* $NU;
# LB 18
! ($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
# LB 19
! ($CM* $ALPlus)+;
! ($CM* $ALPlus)+ $CM+ / [$BK $CR $LF $NL $ZW]; # The $CM* is from rule 7C, and unattached CM is treated as AL

View File

@ -9,6 +9,7 @@
# These rules are based on TR 29 version 4.0.0
#
!!chain;
#
# Character categories as defined in TR 29
@ -30,59 +31,85 @@ $Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
[[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
$Extend = [[:Grapheme_Extend = TRUE:]];
# Define extended forms of the character classes,
# incorporate grapheme cluster + format chars.
$ATermEx = $ATerm $Extend*;
$NumericEx = $Numeric $Extend*;
$UpperEx = $Upper $Extend*;
$CloseEx = $Close $Extend*;
$SpEx = $Sp $Extend*;
$LowerEx = $Lower $Extend*;
$TermEx = $Term $Extend*;
$Extend = [[:Grapheme_Extend = TRUE:]];
$ATermEx = $ATerm $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
$UpperEx = $Upper $Extend* $Format*;
$TermEx = $Term $Extend* $Format*;
# rule 6
#
# $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster)
#
$SepSeq = $Sep | \u000d\u000a;
$ATermEx $Format* $NumericEx;
# $InteriorChars are those that never trigger a following break.
$InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars
# rule 7
$UpperEx $ATermEx $Format* $UpperEx;
# rule 8
$ATermEx $Format* $CloseEx* $Format* $SpEx $Format*
[^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
# rule 9 forced to exit by / [^$Close $Sp]
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
# rule 10 forced to exit by / [^$Sp];
# Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it.
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];
# Rule 7. $UppersSurround Match a no-break sentence fragment containing a . surrounded by Uppers
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
# rule 11 partly included in rule 9 and 10
$TermEx;
$ATermEx;
# Rule 8 Matches a sentence fragment containing "." that should not cause a sentence break,
# because a lower case word follows the period.
$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
# rule 12
# Rules 3, 9, 10, 11
# Matches a simple sentence, or the trailing part of a complex sentence,
# where a simple sentence contains no interior "."s.
$TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
$EndSequence = $InteriorChars* $SepSeq?;
([^$Term $ATerm $Sep] $Extend*)+;
([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);
# Put them all together.
($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM
($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP
#
# Reverse Rules
#
$EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
$ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
$ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
#! .*;
$BackATermEx = $Extend* $ATerm;
$BackNumericEx = $Extend* $Numeric;
$BackUpperEx = $Extend* $Upper;
$BackCloseEx = $Extend* $Close;
$BackSpEx = $Extend* $Sp;
$BackLowerEx = $Extend* $Lower;
$BackTermEx = $Extend* $Term;
# rule 3
! $Sep .;
# rule 6
! $BackNumericEx $Format* $BackATermEx;
# rule 7
! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
# rule 8
! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format*
$BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
# rules 9, 10, 11, 12
$Any = [^$Term $ATerm $Sep];
$Safe = [^$Term $ATerm $Sep $Sp $Close];
$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
! $BackEnd;
! $BackEnd? $Any* $Safe;
! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
! $BackEnd? $Any* $Sp / $Sep;

View File

@ -11,7 +11,7 @@ $NotCased = [^ $Cased];
#
# If the iterator was not stopped on a cased character, advance it to the first cased char
#
($NotCased | $CaseIgnorable)*;
$NotCased+;
#
# If the iterator starts on a cased item, advance through all adjacent cased items plus
@ -22,5 +22,11 @@ $Cased ($Cased | $CaseIgnorable)* $NotCased*;
#
# Reverse Rules
#
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;
! $NotCased+;
#
# If the iterator starts on a cased item, advance through all adjacent cased items plus
# any non-cased stuff, to reach the start of the next word.
#
! $NotCased* ($Cased | $CaseIgnorable)* $Cased;

View File

@ -1,25 +1,28 @@
#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# All Rights Reserved.
# Copyright (C) 2002-2003,
# International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: word.txt
# file: word.txt
#
# ICU Word Break Rules
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on Version 4.0.0, dated 2003-04-17
#
####################################################################################
##############################################################################
#
# Character class definitions from TR 29
#
####################################################################################
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
##############################################################################
!!chain;
$Katakana = [[:Script = KATAKANA:]
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
@ -28,122 +31,127 @@ $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
- [:Script = Thai:]
- [:Script = Lao:]
- [:Script = Hiragana:]];
$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]];
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
[:name = HEBREW PUNCTUATION GERSHAYIM:]
[:name = RIGHT SINGLE QUOTATION MARK:]
[:name = HYPHENATION POINT:]];
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
$Numeric = [:LineBreak = Numeric:];
#
# Character Class Definitions.
# The names are those from TR29.
#
$CR = \u000d;
$LF = \u000a;
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Extend = [[:Grapheme_Extend = TRUE:]];
$CR = \u000d;
$LF = \u000a;
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Extend = [[:Grapheme_Extend = TRUE:]];
$Format = [[:Cf:]];
$Hiragana = [:Hiragana:];
$Ideographic = [:IDEOGRAPHIC:];
####################################################################################
#
# Word Break Rules. Definitions and Rules specific to word break begin Here.
#
####################################################################################
$Format = [[:Cf:]];
# ALetter2 - There are some characters, e.g. \u0fa9, that are both combining marks ($Extend)
# and alphabetic (ALetter). $ALetter2 is ALetter from the Unicode TR, less all such chars.
# We need this because of sequences of the form
# <Letter> <MidLetter> <alpha combining mark> <Numeric>
# Rule 3 says treat grapheme clusters as a unit, as their first character.
# The <MidLetter> <alpha combining mark> thus should be treated as just <MidLetter>
# Rules for this are awkward, because the sequence
# <Letter> <MidLetter> <Letter> <Numeric>
# should not break, but the sequence
# <Letter> <MidLetter> <Numeric>
# should break after the <Letter>.
$ALetter2 = [$ALetter - $Extend];
# Rule 3: Treat a grapheme cluster as if it were a single character.
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
# because we don't need to find the boundaries between adjacent syllables -
# they won't be word boundaries.
#
#
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
#
$ALetterEx = $ALetter $Extend*;
$ALetter2Ex = $ALetter2 $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidNumLetEx = $MidNumLet $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
#
# Numbers. Rules 8, 11, 12 from the TR.
#
$NumberSequence = $NumericEx ($Format* ($MidNumEx | $MidNumLetEx)? $Format* $NumericEx)*;
$NumberSequence {100};
#
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
# - must include at least one letter.
# - may include both letters and numbers.
# - may include MidLetter, MidNumber punctuation.
#
#$LetterSequence = $ALetterEx ($Format* ($MidLetterEx | $MidNumLetEx)? $Format* $ALetterEx)*; # rules #6, #7
$WordGlue = $MidLetterEx | $MidNumLetEx;
$MidWordFragment = ($WordGlue $ALetter2Ex | $WordGlue $Format+ $ALetterEx);
$WordSequence = $ALetterEx ($Format* ($ALetterEx | $MidWordFragment))*;
$WordSequence2 = $ALetter2Ex ($Format* ($ALetterEx | $MidWordFragment))*;
$WordTail = ($Format* $NumberSequence $Format+ $WordSequence) | ($Format* $NumberSequence $WordSequence2?);
($NumberSequence $Format+)? $WordSequence $WordTail* {200};
($NumberSequence)? $WordSequence2 $WordTail* {200};
#
# Do not break between Katakana. Rule #13.
#
$KatakanaEx ($Format* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};
#
# Ideographic Characters. Stand by themselves as words.
# Separated from the "Everything Else" rule, below, only so that they
# can be tagged with a return value. TODO: is this what we want?
#
[:IDEOGRAPHIC:] $Extend* {400};
#
# Everything Else, with no tag.
# Non-Control chars combine with $Extend (combining) chars.
# Controls do not.
#
[^$Control [:Ideographic:]] $Extend*;
$CR $LF;
#
# Reverse Rules. Back up over any of the chars that can group together.
# (Reverse rules do not need to be exact; they can back up too far,
# but must back up at least enough, and must stop on a boundary.)
#
# rule 3 and 4
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
# a word. (They may also be the first.) The reverse rule skips over these, until it
# reaches something that can only be the start (and probably only) char in a "word".
# A space or punctuation meets the test.
#
$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
$ALetterEx = $ALetter $Extend*;
$ABaseLetterEx = $ABaseLetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidNumLetEx = $MidNumLet $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
#!.*;
! ($NonStarters* | \n \r) .;
[^$Format $Hiragana $Ideographic] $Extend* [$Extend - $ALetter];
# letters should be left alone
[^$Format $ALetter $Numeric $Hiragana $Ideographic] $Extend* $ACMLetter / [^$Extend];
$NumericEx $ACMLetter / $MidLetter;
# rule 5
$ALetterEx ($Format* $ALetterEx)* {200};
# rule 6 and 7
$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $ABaseLetterEx {200};
$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $Format+ $ALetterEx {200};
# rule 8
$NumericEx ($Format* $NumericEx)* {100};
# rule 9
$ALetterEx $Format* $NumericEx {200};
# rule 10
$NumericEx $Format* $ALetterEx {200};
# rule 11 and 12
$NumericEx $Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx {100};
# rule 13
$KatakanaEx ($Format* $KatakanaEx)* {300};
$Hiragana $Extend* {300} / [^$Extend];
$Ideographic $Extend* {400} / [^$Extend];
# reverse rules!!
!!reverse;
$BackALetterEx = $Extend* $ALetter;
$BackABaseLetterEx = $Extend* $ABaseLetter;
$BackACMLetterEx = $Extend* $ACMLetter;
$BackNumericEx = $Extend* $Numeric;
$BackMidNumEx = $Extend* $MidNum;
$BackMidNumLetEx = $Extend* $MidNumLet;
$BackMidLetterEx = $Extend* $MidLetter;
$BackKatakanaEx = $Extend* $Katakana;
! $LF $CR;
! $Extend+ [^$Format];
# rule 5
$BackEndACMLetter = $Format+ $Extend* [^$ALetter $Numeric $MidLetter $MidNumLet];
! $BackALetterEx $Format* $BackABaseLetterEx;
! $BackALetterEx $Format* $BackACMLetterEx / $BackEndACMLetter;
# rule 6 and 7
! $BackABaseLetterEx ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackABaseLetterEx;
! $BackABaseLetterEx ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackACMLetterEx / $BackEndACMLetter;
! $BackALetterEx $Format+ ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackABaseLetterEx;
! $BackALetterEx $Format+ ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackACMLetterEx / $BackEndACMLetter;
# rule 8
! $BackNumericEx $Format* $BackNumericEx;
# rule 9
! $BackNumericEx $Format* (($BackNumericEx | $BackALetterEx) $Format*)* $BackABaseLetterEx;
! $BackNumericEx $Format* $BackACMLetterEx / $BackEndACMLetter; ## problem here
# rule 10
! ($BackALetterEx $Format*)+ $BackNumericEx;
# rule 11 and 12
! $BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
# rule 13
! $BackKatakanaEx $Format* $BackKatakanaEx;

View File

@ -283,25 +283,25 @@ $(BRK_FILES:.brk =.brk
BRKDEPS = "$(ICUBLD)\$(ICUDT)uprops.icu" "$(ICUBLD)\$(ICUDT)unames.icu" "$(ICUBLD)\$(ICUDT)pnames.icu" "$(ICUBLD)\$(ICUDT)unorm.icu"
$(ICUDT)char.brk : "$(ICUBRK)\char.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\char.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\char.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
$(ICUDT)word.brk : "$(ICUBRK)\word.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\word.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\word.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
$(ICUDT)line.brk : "$(ICUBRK)\line.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\line.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\line.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
$(ICUDT)sent.brk : "$(ICUBRK)\sent.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\sent.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\sent.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
$(ICUDT)title.brk : "$(ICUBRK)\title.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\title.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\title.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
$(ICUDT)word_th.brk : "$(ICUBRK)\word_th.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\word_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\word_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
$(ICUDT)line_th.brk : "$(ICUBRK)\line_th.txt" $(BRKDEPS)
genbrk -r "$(ICUBRK)\line_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
genbrk -c -r "$(ICUBRK)\line_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
# utility target to send us to the right dir

View File

@ -839,6 +839,23 @@ void RBBIAPITest::TestRoundtripRules() {
void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API ");
switch (index) {
// case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
case 0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
case 1: name = "TestgetRules"; if (exec) TestgetRules(); break;
case 2: name = "TestHashCode"; if (exec) TestHashCode(); break;
case 3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
case 4: name = "extra"; break; /* Extra */
case 5: name = "TestBuilder"; if (exec) TestBuilder(); break;
case 6: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
case 7: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
case 8: name = "TestBug2190"; if (exec) TestBug2190(); break;
case 9: name = "TestRegistration"; if (exec) TestRegistration(); break;
case 10: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
default: name = ""; break; /*needed to end loop*/
}
/*** TODO synwee
switch (index) {
// case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
case 0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
@ -846,8 +863,8 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
case 2: name = "TestHashCode"; if (exec) TestHashCode(); break;
case 3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
case 4: name = "TestIteration"; if (exec) TestIteration(); break;
case 5: name = "extra"; break; /* Extra */
case 6: name = "extra"; break; /* Extra */
case 5: name = "extra"; break; // Extra
case 6: name = "extra"; break; // Extra
case 7: name = "TestBuilder"; if (exec) TestBuilder(); break;
case 8: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
case 9: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
@ -856,8 +873,9 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
case 12: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
case 13: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;
default: name = ""; break; /*needed to end loop*/
default: name = ""; break; // needed to end loop
}
***/
}
//---------------------------------------------

View File

@ -22,6 +22,7 @@
#include "unicode/schriter.h"
#include "unicode/uniset.h"
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
#include "unicode/ustring.h"
#include "intltest.h"
#include "rbbitst.h"
@ -292,6 +293,41 @@ void RBBITest::TestStatusReturn() {
}
/**
 * Debugging aid: dumps ustr to stdout, one line per code point, showing the
 * code point value, a handful of character properties (alphabetic, grapheme
 * extend, alphanumeric, general category, line break class) and the Unicode
 * character name. A dashed separator line is printed in front of every code
 * point whose UTF-16 index appears in expected[].
 *
 * @param ustr          the text to dump
 * @param expected      UTF-16 indexes to mark; may be NULL when there are none
 * @param expectedcount number of entries in expected[]
 */
static void printStringBreaks(UnicodeString ustr, int expected[],
                              int expectedcount)
{
    char name[100];
    printf("code alpha extend alphanum type line name\n");
    for (int j = 0; j < ustr.length(); j ++) {
        // Mark the position if the caller listed it as a break.
        if (expected != NULL) {
            for (int k = 0; k < expectedcount; k ++) {
                if (j == expected[k]) {
                    printf("------------------------------------------------ %d\n",
                           j);
                }
            }
        }
        UChar32 c = ustr.char32At(j);
        if (c > 0xffff) {
            // Supplementary code point: skip its trail surrogate.
            j ++;
        }

        // Use a fresh status for every character. With a shared status a
        // single failure would make every later u_charName() call return
        // early and leave a stale (or never initialised) name in the buffer.
        UErrorCode status = U_ZERO_ERROR;
        name[0] = 0;
        u_charName(c, U_UNICODE_CHAR_NAME, name, (int32_t)sizeof(name), &status);
        if (U_FAILURE(status)) {
            name[0] = 0;
        }

        // The property name lookups can return NULL, which must not reach %s.
        const char *category = u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
                                                      u_charType(c),
                                                      U_SHORT_PROPERTY_NAME);
        const char *lineBreak = u_getPropertyValueName(UCHAR_LINE_BREAK,
                                                       u_getIntPropertyValue(c,
                                                           UCHAR_LINE_BREAK),
                                                       U_SHORT_PROPERTY_NAME);
        if (category == NULL) {
            category = "?";
        }
        if (lineBreak == NULL) {
            lineBreak = "?";
        }

        printf("%7x %5d %6d %8d %4s %4s %s\n", (unsigned int)c,
               u_isUAlphabetic(c),
               u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
               u_isalnum(c),
               category,
               lineBreak,
               name);
    }
}
void RBBITest::TestThaiLineBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
@ -517,36 +553,55 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
case 0: name = "TestJapaneseLineBreak";
if(exec) TestJapaneseLineBreak(); break;
case 1: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
case 2: name = "TestLineBreakData";
if(exec) TestLineBreakData(); break;
case 3: name = "TestEmptyString";
if(exec) TestEmptyString(); break;
case 4: name = "TestGetAvailableLocales";
if(exec) TestGetAvailableLocales(); break;
case 5: name = "TestGetDisplayName";
if(exec) TestGetDisplayName(); break;
case 6: name = "TestEndBehaviour";
if(exec) TestEndBehaviour(); break;
case 7: name = "TestBug4153072";
if(exec) TestBug4153072(); break;
case 8: name = "TestWordBoundary";
if(exec) TestWordBoundary(); break;
default: name = ""; break; //needed to end loop
}
/*** TODO synwee
switch (index) {
case 0: name = "TestExtended";
if(exec) TestExtended(); break;
case 1: name = "TestJapaneseLineBrea";
case 1: name = "TestJapaneseLineBreak";
if(exec) TestJapaneseLineBreak(); break;
case 2: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
case 3: name = "TestLineBreakData";
if(exec) TestLineBreakData(); break;
case 4: name = "TestSentenceInvariants";
if(exec) TestSentenceInvariants(); break;
case 5: name = "TestCharacterInvariants";
if(exec) TestCharacterInvariants(); break;
case 6: name = "TestWordInvariants";
if(exec) TestWordInvariants(); break;
case 7: name = "TestEmptyString";
case 4: name = "TestEmptyString";
if(exec) TestEmptyString(); break;
case 8: name = "TestGetAvailableLocales";
case 5: name = "TestGetAvailableLocales";
if(exec) TestGetAvailableLocales(); break;
case 9: name = "TestGetDisplayName";
case 6: name = "TestGetDisplayName";
if(exec) TestGetDisplayName(); break;
case 10: name = "TestEndBehaviour";
case 7: name = "TestEndBehaviour";
if(exec) TestEndBehaviour(); break;
case 11: name = "TestBug4153072";
case 8: name = "TestBug4153072";
if(exec) TestBug4153072(); break;
case 12: name = "TestMonkey";
case 9: name = "TestMonkey";
if(exec) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
TestMonkey(params);
@ -556,18 +611,25 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
}
break;
case 13: name = "TestThaiLineBreak";
case 10: name = "TestThaiLineBreak";
if(exec) TestThaiLineBreak(); break;
case 14: name = "TestMixedThaiLineBreak";
case 11: name = "TestMixedThaiLineBreak";
if(exec) TestMixedThaiLineBreak(); break;
case 15: name = "TestMaiyamok";
case 12: name = "TestMaiyamok";
if(exec) TestMaiyamok(); break;
case 16: name = "TestThaiWordBreak";
case 13: name = "TestThaiWordBreak";
if(exec) TestThaiWordBreak(); break;
case 14: name = "TestWordBreaks";
if(exec) TestWordBreaks(); break;
case 15: name = "TestLineBreaks";
if(exec) TestLineBreaks(); break;
case 16: name = "TestWordBoundary";
if(exec) TestWordBoundary(); break;
case 17: name = "TestSentBreaks";
if(exec) TestSentBreaks(); break;
default: name = ""; break; //needed to end loop
}
***/
}
@ -918,6 +980,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
}
}
if (!seen2) {
printStringBreaks(work, NULL, 0);
errln("No Break between \\U%04x and \\U%04x", c1, c2);
errCount++;
if (errCount >= 75)
@ -1201,14 +1264,18 @@ void RBBITest::executeTest(TestParams *t) {
// and this one.
for (i=prevBP+1; i<bp; i++) {
if (t->expectedBreaks->elementAti(i) != 0) {
errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
int expected[] = {0, i};
printStringBreaks(t->dataToBreak, expected, 2);
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
}
}
// Check that the break we did find was expected
if (t->expectedBreaks->elementAti(bp) == 0) {
errln("Forward Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
int expected[] = {0, bp};
printStringBreaks(t->dataToBreak, expected, 2);
errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
} else {
// The break was expected.
@ -1219,7 +1286,7 @@ void RBBITest::executeTest(TestParams *t) {
}
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for break. Pos=%4d File line,col= %4d,%4d.\n"
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
}
@ -1232,7 +1299,7 @@ void RBBITest::executeTest(TestParams *t) {
// Verify that there were no missed expected breaks after the last one found
for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
if (t->expectedBreaks->elementAti(i) != 0) {
errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
}
}
@ -1271,7 +1338,7 @@ void RBBITest::executeTest(TestParams *t) {
}
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for break. Pos=%4d File line,col= %4d,%4d.\n"
errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
}
@ -2601,6 +2668,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
int32_t nextCPPos; // Index of the code point following "pos."
// May point to a combining mark.
int32_t tPos; // temp value.
UChar32 c;
if (startPos >= fText->length()) {
return -1;
@ -2699,7 +2767,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
UChar32 c = fText->char32At(nextPos);
c = fText->char32At(nextPos);
rule67Adjust(pos, &thisChar, &nextPos, &c);
// If the loop is still warming up - if we haven't shifted the initial
@ -2742,8 +2810,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
goto fall_through_9;
}
}
/***
for (tPos=prevPos; ; tPos = fText->moveIndex32(tPos, -1)) {
if (fOP->contains(fText->char32At(tPos))) {
break;
}
if ((fSP->contains(fText->char32At(tPos)) ||
fCM->contains(fText->char32At(tPos))) == FALSE
|| tPos == 0) {
goto fall_through_9;
}
}
***/
// We match OP SP* x
// No break at this postion.
// Continue the outer loop.
@ -2932,6 +3012,277 @@ static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d
}
#endif
void RBBITest::TestWordBreaks(void)
{
// <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
UChar str[25];
char *strlist[] =
{"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
"\\u2027\\U000e0067\\u0a47\\u00b7",
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
"\\u0589\\U000e006e\\u0a42\\U000104a5",
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
"\\u0027\\u11af\\U000e0057\\u0602",
"\\U0001d7f2\\U000e007\\u0004\\u0589",
"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
"\\u003a\\u0e57\\u0fad\\u002e",
"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
"\\u003a\\u0664\\u00b7\\u1fba",
"\\u003b\\u0027\\u00b7\\u47a3",
"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
};
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
printf("looping %d\n", loop);
u_unescape(strlist[loop], str, 25);
UnicodeString ustr(str);
// RBBICharMonkey monkey;
RBBIWordMonkey monkey;
int expected[20];
int forward[20];
int expectedcount = 0;
monkey.setText(ustr);
for (int i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
expected[expectedcount ++] = i;
}
int count = 0;
bi->setText(ustr);
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count] = i;
if (count > 20 || expected[count] != i) {
errln("happy break forward test failed: expected %d but got %d",
expected[count], i);
}
count ++;
}
if (count != expectedcount) {
printStringBreaks(ustr, expected, expectedcount);
errln("happy break test failed: missed a match");
break;
}
for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
errln("happy break test reverse failed: expected %d but got %d",
forward[count], i);
break;
}
}
if (count != 0) {
errln("happy break test failed: missed a match");
}
}
}
void RBBITest::TestWordBoundary(void)
{
// <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
UChar str[20];
char *strlist[] =
{"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
"\\u2027\\U000e0067\\u0a47\\u00b7",
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
"\\u0589\\U000e006e\\u0a42\\U000104a5",
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
"\\u0027\\u11af\\U000e0057\\u0602",
"\\U0001d7f2\\U000e007\\u0004\\u0589",
"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
"\\u003a\\u0e57\\u0fad\\u002e",
"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
"\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
"\\u003a\\u0664\\u00b7\\u1fba",
"\\u003b\\u0027\\u00b7\\u47a3",
};
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
printf("looping %d\n", loop);
u_unescape(strlist[loop], str, 20);
UnicodeString ustr(str);
int forward[20];
int count = 0;
bi->setText(ustr);
int prev = 0;
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count ++] = i;
if (i > prev) {
for (int j = prev + 1; j < i; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d not a boundary",
j);
break;
}
}
}
if (!bi->isBoundary(i)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d a boundary",
i);
break;
}
prev = i;
}
}
}
void RBBITest::TestLineBreaks(void)
{
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
UChar str[20];
char *strlist[] =
{"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
"\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
"\\u2014\\u0020\\u000a\\u17c5\\u24fc",
"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
};
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
printf("looping %d\n", loop);
u_unescape(strlist[loop], str, 20);
UnicodeString ustr(str);
// RBBICharMonkey monkey;
RBBILineMonkey monkey;
int expected[20];
int forward[20];
int expectedcount = 0;
monkey.setText(ustr);
for (int i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
expected[expectedcount ++] = i;
}
int count = 0;
bi->setText(ustr);
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count] = i;
if (count < expectedcount && expected[count] != i) {
errln("happy break forward test failed: expected %d but got %d",
expected[count], i);
}
count ++;
}
if (count != expectedcount) {
printStringBreaks(ustr, expected, expectedcount);
errln("happy break test failed: missed %d match",
expectedcount - count);
}
for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
errln("happy break test reverse failed: expected %d but got %d",
forward[count], i);
break;
}
}
if (count != 0) {
errln("happy break test failed: missed a match");
}
}
}
void RBBITest::TestSentBreaks(void)
{
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
UChar str[100];
char *strlist[] =
{"This\n",
"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
"\"Sentence ending with a quote.\" Bye.",
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
"Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
"Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
"Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
"Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
};
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
printf("looping %d\n", loop);
u_unescape(strlist[loop], str, 100);
UnicodeString ustr(str);
int forward[20];
int count = 0;
bi->setText(ustr);
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count ++] = i;
}
int tempcount = count;
for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
tempcount --;
if (forward[tempcount] != i) {
printStringBreaks(ustr, forward, count);
errln("happy break test reverse failed: expected %d but got %d",
forward[tempcount], i);
break;
}
}
if (tempcount != 0) {
errln("happy break test failed: missed a match");
}
}
}
void RBBITest::TestMonkey(char *params) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
@ -3119,7 +3470,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
const char *errorType = NULL;
if (forwardBreaks[i] != expectedBreaks[i]) {
errorType = "next()";
} else if (reverseBreaks[i] != expectedBreaks[i]) {
} else if (reverseBreaks[i] != forwardBreaks[i]) {
errorType = "previous()";
} else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
errorType = "isBoundary()";
@ -3135,23 +3486,39 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
int startContext = i;
for (;;) {
if (startContext==0) { break; }
startContext--;
startContext --;
if (expectedBreaks[startContext] != 0) {break;}
}
// End of range is two expected breaks past the start position.
int endContext = i+1;
int endContext = i + 1;
int ci;
for (ci=0; ci<2; ci++) { // Number of items to include in error text.
for (;;) {
if (endContext >= testText.length()) {break;}
if (expectedBreaks[endContext-1] != 0) { break;}
endContext++;
endContext ++;
}
}
// Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
UnicodeString errorText = "<data>";
/***
if (strcmp(errorType, "previous()") == 0) {
startContext = 0;
int j = i;
while (true) {
if (reverseBreaks[j ++] != 0) {
printf("%d\n", j);
break;
}
if (j % 100 == 0) {
printf("continue %d\n", j);
}
}
endContext = j - 1;
}
***/
for (ci=startContext; ci<endContext;) {
UnicodeString hexChars("0123456789abcdef");
UChar32 c;
@ -3181,7 +3548,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
errorText.append("</data>\n");
// Output the error
char charErrorTxt[100];
char charErrorTxt[500];
UErrorCode status = U_ZERO_ERROR;
errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
charErrorTxt[sizeof(charErrorTxt)-1] = 0;

View File

@ -70,8 +70,10 @@ public:
UChar *ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status);
void executeTest(TestParams *);
void TestWordBreaks();
void TestWordBoundary();
void TestLineBreaks();
void TestSentBreaks();
/***********************/