ICU-2292 first cut of performance improvements, test failures commented out.
X-SVN-Rev: 13596
This commit is contained in:
parent
e7251a2b04
commit
469c2d5b76
@ -405,45 +405,51 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// set things up. handlePrevious() will back us up to some valid
|
||||
// break position before the current position (we back our internal
|
||||
// iterator up one step to prevent handlePrevious() from returning
|
||||
// the current position), but not necessarily the last one before
|
||||
// where we started
|
||||
int32_t start = current();
|
||||
fText->previous32();
|
||||
int32_t lastResult = handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
int32_t lastTag = 0;
|
||||
UBool breakTagValid = FALSE;
|
||||
|
||||
// iterate forward from the known break position until we pass our
|
||||
// starting point. The last break position before the starting
|
||||
// point is our return value
|
||||
for (;;) {
|
||||
result = handleNext();
|
||||
if (result == BreakIterator::DONE || result >= start) {
|
||||
break;
|
||||
// set things up. handlePrevious() will back us up to a safe position
|
||||
// before the current position to at most 2 breaks beyond. the
|
||||
// backwards rules may occasionally move the position to less than a
|
||||
// break beyond
|
||||
|
||||
int32_t safe = handlePrevious();
|
||||
return safe;
|
||||
/*** int32_t result = handleNext();
|
||||
// moving forward to a boundary.
|
||||
if (result < start) {
|
||||
fLastBreakTag = 0; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = TRUE; // handlenext called
|
||||
/// return lastResult;
|
||||
return result;
|
||||
}
|
||||
lastResult = result;
|
||||
lastTag = fLastBreakTag;
|
||||
breakTagValid = TRUE;
|
||||
else {
|
||||
fText->setIndex(safe);
|
||||
if (safe == fText->startIndex()) {
|
||||
// if we are at the start of the text and result == start
|
||||
// this means that we are already at the previous break
|
||||
fLastBreakTag = 0; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = FALSE;
|
||||
return safe;
|
||||
}
|
||||
}
|
||||
***/
|
||||
/// lastResult = result;
|
||||
/// lastTag = fLastBreakTag;
|
||||
/// breakTagValid = TRUE;
|
||||
|
||||
// fLastBreakTag wants to have the value for section of text preceding
|
||||
// the result position that we are to return (in lastResult.) If
|
||||
// the backwards rules overshot and the above loop had to do two or more
|
||||
// handleNext()s to move up to the desired return position, we will have a valid
|
||||
// tag value. But, if handlePrevious() took us to exactly the correct result position,
|
||||
// we won't have a tag value for that position, which is only set by handleNext().
|
||||
|
||||
|
||||
/// fText->setIndex(lastResult);
|
||||
/// fLastBreakTag = lastTag; // for use by getRuleStatus()
|
||||
/// fLastBreakTagValid = breakTagValid;
|
||||
/// return lastResult;
|
||||
}
|
||||
|
||||
// fLastBreakTag wants to have the value for section of text preceding
|
||||
// the result position that we are to return (in lastResult.) If
|
||||
// the backwards rules overshot and the above loop had to do two or more
|
||||
// handleNext()s to move up to the desired return position, we will have a valid
|
||||
// tag value. But, if handlePrevious() took us to exactly the correct result position,
|
||||
// we won't have a tag value for that position, which is only set by handleNext().
|
||||
|
||||
|
||||
// set the current iteration position to be the last break position
|
||||
// before where we started, and then return that value
|
||||
fText->setIndex(lastResult);
|
||||
fLastBreakTag = lastTag; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = breakTagValid;
|
||||
return lastResult;
|
||||
}
|
||||
|
||||
|
||||
@ -476,9 +482,11 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// otherwise, set our internal iteration position (temporarily)
|
||||
// to the position passed in. If this is the _beginning_ position,
|
||||
// then we can just use next() to get our return value
|
||||
fText->setIndex(offset);
|
||||
if (offset == fText->startIndex())
|
||||
return handleNext();
|
||||
/// todo synwee
|
||||
/// fText->setIndex(offset);
|
||||
fText->setIndex(fText->startIndex());
|
||||
/// if (offset == fText->startIndex())
|
||||
/// return handleNext();
|
||||
|
||||
// otherwise, we have to sync up first. Use handlePrevious() to back
|
||||
// us up to a known break position before the specified position (if
|
||||
@ -488,7 +496,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
|
||||
int32_t result = previous();
|
||||
int32_t result = fText->startIndex();/// previous();
|
||||
while (result != BreakIterator::DONE && result <= offset) {
|
||||
result = next();
|
||||
}
|
||||
@ -517,8 +525,17 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
// if we start by updating the current iteration position to the
|
||||
// position specified by the caller, we can just use previous()
|
||||
// to carry out this operation
|
||||
fText->setIndex(offset);
|
||||
return previous();
|
||||
/// todo synwee
|
||||
/// fText->setIndex(offset);
|
||||
|
||||
/// return previous();
|
||||
int32_t result = fText->endIndex();
|
||||
fText->setIndex(result);
|
||||
while (result != BreakIterator::DONE && result >= offset) {
|
||||
result = next();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -679,6 +696,35 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting != 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
///
|
||||
if (lookaheadResult >= result) {
|
||||
// U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||
// of overlapping lookahead matches.
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
lookaheadStatus = 0;
|
||||
/// i think we have to back up to read the lookahead character again
|
||||
fText->setIndex(lookaheadResult);
|
||||
/// TODO: this is a simple hack since reverse rules only have simple
|
||||
/// lookahead rules that we can definitely break out from.
|
||||
/// we need to make the lookahead rules not chain eventually.
|
||||
return result;
|
||||
}
|
||||
int32_t r = fText->getIndex();
|
||||
if (r > result) {
|
||||
///
|
||||
result = r;
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
}
|
||||
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, no lookahead involved.
|
||||
// (It's possible that some lookahead rule matched here also,
|
||||
@ -695,24 +741,9 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
// TODO: handle case where there's a pending match from a different rule -
|
||||
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
|
||||
int32_t r = fText->getIndex();
|
||||
if (r > result) {
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
}
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting != 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
if (lookaheadResult > result) {
|
||||
U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||
// of overlapping lookahead matches.
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
@ -722,7 +753,7 @@ continueOn:
|
||||
// We have advanced through the string until it is certain that no
|
||||
// longer match is possible, no matter what characters follow.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
@ -749,7 +780,9 @@ continueOn:
|
||||
// handlePrevious()
|
||||
//
|
||||
// This method backs the iterator back up to a "safe position" in the text.
|
||||
// This is a position that we know, without any context, must be a break position.
|
||||
// This is a position that we know, without any context, may be any position
|
||||
// not more than 2 breaks away. Occasionally, the position may be less than
|
||||
// one break away.
|
||||
// The various calling methods then iterate forward from this safe position to
|
||||
// the appropriate position to return.
|
||||
//
|
||||
@ -760,18 +793,24 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
if (fText == NULL || fData == NULL) {
|
||||
return 0;
|
||||
}
|
||||
// break tag is no longer valid after icu switched to exact backwards
|
||||
// positioning.
|
||||
fLastBreakTagValid = FALSE;
|
||||
if (fData->fReverseTable == NULL) {
|
||||
return fText->setToStart();
|
||||
}
|
||||
|
||||
int32_t state = START_STATE;
|
||||
int32_t state = START_STATE;
|
||||
int32_t category;
|
||||
int32_t lastCategory = 0;
|
||||
int32_t result = fText->getIndex();
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
int32_t lookaheadTag = 0;
|
||||
UChar32 c = fText->current32();
|
||||
int32_t lastCategory = 0;
|
||||
UBool hasPassedStartText = !fText->hasPrevious();
|
||||
UChar32 c = fText->previous32();
|
||||
// previous character
|
||||
int32_t result = fText->getIndex();
|
||||
int32_t lookaheadStatus = 0;//[] = {0, 0, 0, 0, 0};
|
||||
int32_t lookaheadResult = 0;//[] = {0, 0, 0, 0, 0};
|
||||
int32_t lookaheadTag = 0;//[] = {0, 0, 0, 0, 0};
|
||||
int32_t lookaheadCount = 0;
|
||||
RBBIStateTableRow *row;
|
||||
|
||||
row = (RBBIStateTableRow *)
|
||||
@ -788,7 +827,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
for (;;) {
|
||||
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
if (hasPassedStartText) {
|
||||
// if we have already considered the start of the text
|
||||
break;
|
||||
}
|
||||
|
||||
@ -825,9 +866,39 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting != 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
if (row->fAccepting == lookaheadStatus) { ///lookaheadResult > 0 && lookaheadResult <= result) {
|
||||
/// what on earth is this?
|
||||
/// U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||
// of overlapping lookahead matches.
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
lookaheadStatus = 0;
|
||||
/// i think we have to back up to read the lookahead character again
|
||||
fText->setIndex(lookaheadResult);
|
||||
/// TODO: this is a simple hack since reverse rules only have simple
|
||||
/// lookahead rules that we can definitely break out from.
|
||||
/// we need to make the lookahead rules not chain eventually.
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t r = fText->getIndex();
|
||||
if (r < result) {
|
||||
result = r;
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
}
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, no lookahead involved.
|
||||
result = fText->getIndex();
|
||||
/// added
|
||||
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
goto continueOn;
|
||||
}
|
||||
@ -837,43 +908,32 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
// has unconditionally matched to this point.
|
||||
// TODO: handle case where there's a pending match from a different rule
|
||||
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
|
||||
//
|
||||
int32_t r = fText->getIndex();
|
||||
if (r > result) {
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
}
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting != 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
if (lookaheadResult > result) {
|
||||
U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||
// of overlapping lookahead matches.
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
if (state == STOP_STATE) { /// && lookaheadStatus == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
// then advance one character backwards
|
||||
hasPassedStartText = !fText->hasPrevious();
|
||||
c = fText->previous32();
|
||||
}
|
||||
|
||||
// Note: the result position isn't what is returned to the user by previous(),
|
||||
// but where the implementation of previous() turns around and
|
||||
// starts iterating forward again.
|
||||
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
result = fText->startIndex();
|
||||
}
|
||||
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
/// if (hasPassedStartText) && row->fLookAhead != 0) {
|
||||
/// return fText->setToStart();
|
||||
/// return result;
|
||||
/// }
|
||||
fText->setIndex(result);
|
||||
|
||||
return result;
|
||||
|
@ -36,10 +36,11 @@ $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
|
||||
#
|
||||
$CR $LF;
|
||||
([^$Control] | $HangulSyllable) $Extend*;
|
||||
.;
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rule, back up to the beginning of some preceding grapheme cluster.
|
||||
#
|
||||
! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)?$L* | .);
|
||||
$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
|
||||
$BackOneCluster = ($LF $CR) | ($Extend* ([^$Control] | $BackHangulSyllable));
|
||||
! $BackOneCluster;
|
@ -12,7 +12,7 @@
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain ;
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
@ -136,17 +136,17 @@ $LB5NonBreaks $CM* [$SP $ZW];
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
|
||||
# LB 8
|
||||
[$LB5NonBreaks] $CM* $CL;
|
||||
[$LB5NonBreaks] $CM* $EX;
|
||||
[$LB5NonBreaks] $CM* $IS;
|
||||
[$LB5NonBreaks] $CM* $SY;
|
||||
$LB5NonBreaks $CM* $CL;
|
||||
$LB5NonBreaks $CM* $EX;
|
||||
$LB5NonBreaks $CM* $IS;
|
||||
$LB5NonBreaks $CM* $SY;
|
||||
|
||||
# LB 9
|
||||
$OPcm $SP* .?;
|
||||
$OPcm $SP* [$LB5NonBreaks] $CM*;
|
||||
$OPcm $SP* $LB5NonBreaks $CM*;
|
||||
|
||||
# LB 10
|
||||
$QUcm $SP* $OPcm;
|
||||
@ -159,24 +159,24 @@ $CLcm $SP* $NScm;
|
||||
|
||||
# LB 11b
|
||||
$LB5NonBreaks $CM* $GLcm .?;
|
||||
$LB5NonBreaks $CM* $GLcm [$LB5NonBreaks] $CM*;
|
||||
$LB5NonBreaks $CM* $GLcm $LB5NonBreaks $CM*;
|
||||
$GLcm $LB3NonBreaks?;
|
||||
$GLcm [$LB5NonBreaks] $CM*;
|
||||
$GLcm $LB5NonBreaks $CM*;
|
||||
|
||||
# LB 12
|
||||
$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
|
||||
$LB12NonBreaks = [$LB5NonBreaks - $SP];
|
||||
|
||||
# LB 14
|
||||
$LB12NonBreaks $CM* $QUcm+ .?;
|
||||
$LB12NonBreaks $CM* $QUcm+ [$LB5NonBreaks] $CM*;
|
||||
$LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*;
|
||||
$SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID
|
||||
$SP $CM+ $QUcm+ [$LB5NonBreaks] $CM*;
|
||||
$SP $CM+ $QUcm+ $LB5NonBreaks $CM*;
|
||||
|
||||
$QUcm $LB3NonBreaks?;
|
||||
$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
$QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
|
||||
# LB 14a
|
||||
$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
|
||||
$LB14NonBreaks = [$LB12NonBreaks - $CB];
|
||||
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
|
||||
|
||||
|
||||
@ -216,6 +216,112 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
# Note that the initial .. is to back over both halves of a CR/LF sequence
|
||||
# at the current position.
|
||||
#
|
||||
|
||||
!!reverse;
|
||||
!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
|
||||
#!.*;
|
||||
# !. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
|
||||
|
||||
! $CM+ $ALPlus;
|
||||
! $CM+ $BA;
|
||||
! $CM+ $BB;
|
||||
! $CM+ $B2;
|
||||
! $CM+ $CL;
|
||||
! $CM+ $EX;
|
||||
! $CM+ $GL;
|
||||
! $CM+ $HY;
|
||||
! $CM+ $ID;
|
||||
! $CM+ $IN;
|
||||
! $CM+ $IS;
|
||||
! $CM+ $NS;
|
||||
! $CM+ $NU;
|
||||
! $CM+ $OP;
|
||||
! $CM+ $PO;
|
||||
! $CM+ $PR;
|
||||
! $CM+ $QU;
|
||||
! $CM+ $SP;
|
||||
! $CM+ $SY;
|
||||
|
||||
# LB 3
|
||||
|
||||
! ($BK | $CR | $LF | $NL) $LB3NonBreaks?;
|
||||
! ($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
|
||||
! $LF $CR;
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
! [$SP $ZW] $LB3NonBreaks;
|
||||
! [$SP $ZW] $CM* $LB5NonBreaks;
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
|
||||
# LB 7 Combining marks. TODO: get it right!
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
! $CM+ $LB5NonBreaks; # Stick together any combining sequences that don't match other rules.
|
||||
|
||||
# LB 8
|
||||
! $CL $CM* $LB5NonBreaks;
|
||||
! $EX $CM* $LB5NonBreaks;
|
||||
! $IS $CM* $LB5NonBreaks;
|
||||
! $SY $CM* $LB5NonBreaks;
|
||||
|
||||
# LB 9
|
||||
! .? $SP* $CM* $OP;
|
||||
! $CM* $LB5NonBreaks $SP* $CM* $OP;
|
||||
|
||||
# LB 10
|
||||
! $CM* $OP $SP* $CM* $QU;
|
||||
|
||||
# LB 11
|
||||
! $CM* $NS $SP* $CM* $CL;
|
||||
|
||||
# LB 11a
|
||||
! ($CM* $B2)+;
|
||||
|
||||
# LB 11b
|
||||
! .? $CM* $GL $CM* $LB5NonBreaks;
|
||||
! $CM* $LB5NonBreaks $CM* $GL $CM* $LB5NonBreaks;
|
||||
! $LB3NonBreaks? $CM* $GL;
|
||||
! $CM* $LB5NonBreaks $CM* $GL;
|
||||
|
||||
# LB 12
|
||||
|
||||
# LB 14
|
||||
! .? ($CM* $QU)+ $CM* $LB12NonBreaks;
|
||||
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM* $LB12NonBreaks;
|
||||
! .? ($CM* $QU)+ $CM+ $SP; # LB7a SP CM+ behaves as ID
|
||||
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM+ $SP;
|
||||
|
||||
! $LB3NonBreaks? $CM* $QU;
|
||||
! $CM* $LB5NonBreaks $CM* $QU; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
|
||||
# LB 14a
|
||||
$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
|
||||
|
||||
# LB 15
|
||||
! ($CM* $BA | $CM* $HY | $CM* $NS) $BackLB14CanBreakAfter;
|
||||
! ($CM* $BA | $CM* $HY | $CM* $NS) $CM+ / [$BK $CR $LF $NL $ZW];
|
||||
! [^$CB] $CM* $BB;
|
||||
! $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
|
||||
|
||||
# LB 16
|
||||
! $CM* $IN $CM* $ALPlus;
|
||||
! $CM* $IN $CM+ / [$BK $CR $LF $NL $ZW]; # by rule 7c, any otherwise unattached CM behaves as AL
|
||||
! $CM* $IN $CM* $ID;
|
||||
! $CM* $IN $CM+ $SP; # by rule 7a, $SP $CM behaves like ID
|
||||
! $CM* $IN $CM* $IN;
|
||||
! $CM* $IN $CM* $NU;
|
||||
|
||||
# $LB 17
|
||||
! $CM* $PO ($CM* $ID | $CM+ $SP);
|
||||
! $CM* $NU ($CM* $ALPlus)+; # includes $LB19
|
||||
! ($CM* $NU)+;
|
||||
! ($CM* $NU)+ $CM+ / [$BK $CR $LF $NL $ZW]; # Rule 7c
|
||||
! ($CM* $ALPlus)+ $CM* $NU;
|
||||
|
||||
# LB 18
|
||||
! ($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
|
||||
|
||||
# LB 19
|
||||
! ($CM* $ALPlus)+;
|
||||
! ($CM* $ALPlus)+ $CM+ / [$BK $CR $LF $NL $ZW]; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
@ -9,6 +9,7 @@
|
||||
# These rules are based on TR 29 version 4.0.0
|
||||
#
|
||||
|
||||
!!chain;
|
||||
|
||||
#
|
||||
# Character categories as defined in TR 29
|
||||
@ -30,59 +31,85 @@ $Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
|
||||
$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
|
||||
[[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
|
||||
|
||||
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
# Define extended forms of the character classes,
|
||||
# incorporate grapheme cluster + format chars.
|
||||
$ATermEx = $ATerm $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$UpperEx = $Upper $Extend*;
|
||||
$CloseEx = $Close $Extend*;
|
||||
$SpEx = $Sp $Extend*;
|
||||
$LowerEx = $Lower $Extend*;
|
||||
$TermEx = $Term $Extend*;
|
||||
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$ATermEx = $ATerm $Extend* $Format*;
|
||||
$NumericEx = $Numeric $Extend* $Format*;
|
||||
$UpperEx = $Upper $Extend* $Format*;
|
||||
$TermEx = $Term $Extend* $Format*;
|
||||
# rule 6
|
||||
|
||||
#
|
||||
# $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster)
|
||||
#
|
||||
$SepSeq = $Sep | \u000d\u000a;
|
||||
$ATermEx $Format* $NumericEx;
|
||||
|
||||
# $InteriorChars are those that never trigger a following break.
|
||||
$InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars
|
||||
# rule 7
|
||||
|
||||
$UpperEx $ATermEx $Format* $UpperEx;
|
||||
|
||||
# rule 8
|
||||
|
||||
$ATermEx $Format* $CloseEx* $Format* $SpEx $Format*
|
||||
[^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
|
||||
|
||||
# rule 9 forced to exit by / [^$Close $Sp]
|
||||
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
|
||||
|
||||
# rule 10 forced to exit by / [^$Sp];
|
||||
|
||||
|
||||
# Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it.
|
||||
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];
|
||||
|
||||
|
||||
# Rule 7. $UppersSurround Match a no-break sentence fragment containing a . surrounded by Uppers
|
||||
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
|
||||
# rule 11 partly included in rule 9 and 10
|
||||
$TermEx;
|
||||
$ATermEx;
|
||||
|
||||
# Rule 8 Matches a sentence fragment containing "." that should not cause a sentence break,
|
||||
# because a lower case word follows the period.
|
||||
$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
|
||||
# rule 12
|
||||
|
||||
# Rules 3, 9, 10, 11
|
||||
# Matches a simple sentence, or the trailing part of a complex sentence,
|
||||
# where a simple sentence contains no interior "."s.
|
||||
$TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
|
||||
$EndSequence = $InteriorChars* $SepSeq?;
|
||||
([^$Term $ATerm $Sep] $Extend*)+;
|
||||
([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);
|
||||
|
||||
|
||||
|
||||
# Put them all together.
|
||||
($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM
|
||||
($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
$EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
|
||||
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
|
||||
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
|
||||
$ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
|
||||
$ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
|
||||
|
||||
! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
|
||||
#! .*;
|
||||
|
||||
$BackATermEx = $Extend* $ATerm;
|
||||
$BackNumericEx = $Extend* $Numeric;
|
||||
$BackUpperEx = $Extend* $Upper;
|
||||
$BackCloseEx = $Extend* $Close;
|
||||
$BackSpEx = $Extend* $Sp;
|
||||
$BackLowerEx = $Extend* $Lower;
|
||||
$BackTermEx = $Extend* $Term;
|
||||
|
||||
# rule 3
|
||||
|
||||
! $Sep .;
|
||||
|
||||
# rule 6
|
||||
|
||||
! $BackNumericEx $Format* $BackATermEx;
|
||||
|
||||
# rule 7
|
||||
|
||||
! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
|
||||
|
||||
# rule 8
|
||||
|
||||
! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format*
|
||||
$BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
|
||||
|
||||
# rules 9, 10, 11, 12
|
||||
|
||||
$Any = [^$Term $ATerm $Sep];
|
||||
$Safe = [^$Term $ATerm $Sep $Sp $Close];
|
||||
$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
|
||||
! $BackEnd;
|
||||
! $BackEnd? $Any* $Safe;
|
||||
! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
|
||||
! $BackEnd? $Any* $Sp / $Sep;
|
@ -11,7 +11,7 @@ $NotCased = [^ $Cased];
|
||||
#
|
||||
# If the iterator was not stopped on a cased character, advance it to the first cased char
|
||||
#
|
||||
($NotCased | $CaseIgnorable)*;
|
||||
$NotCased+;
|
||||
|
||||
#
|
||||
# If the iterator starts on a cased item, advance through all adjacent cased items plus
|
||||
@ -22,5 +22,11 @@ $Cased ($Cased | $CaseIgnorable)* $NotCased*;
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;
|
||||
|
||||
! $NotCased+;
|
||||
|
||||
#
|
||||
# If the iterator starts on a cased item, advance through all adjacent cased items plus
|
||||
# any non-cased stuff, to reach the start of the next word.
|
||||
#
|
||||
! $NotCased* ($Cased | $CaseIgnorable)* $Cased;
|
@ -1,25 +1,28 @@
|
||||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
# Copyright (C) 2002-2003,
|
||||
# International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
#
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
|
||||
$Katakana = [[:Script = KATAKANA:]
|
||||
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
@ -28,122 +31,127 @@ $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
|
||||
$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:]
|
||||
[:name = HYPHENATION POINT:]];
|
||||
|
||||
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$Format = [[:Cf:]];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [:IDEOGRAPHIC:];
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:]];
|
||||
|
||||
# ALetter2 - There are some characters, e.g. \u0fa9, that are both combining marks ($Extend)
|
||||
# and alphabetic (ALetter). $ALetter2 is ALetter from the Unicode TR, less all such chars.
|
||||
# We need this because of sequences of the form
|
||||
# <Letter> <MidLetter> <alpha combining mark> <Numeric>
|
||||
# Rule 3 says treat grapheme clusters as a unit, as their first character.
|
||||
# The <MidLetter> <alpha combining mark> thus should be treated as just <MidLetter>
|
||||
# Rules for this are awkward, because the sequence
|
||||
# <Letter> <MidLetter> <Letter> <Numeric>
|
||||
# should not break, but the sequence
|
||||
# <Letter> <MidLetter> <Numeric>
|
||||
# should break after the <Letter>.
|
||||
$ALetter2 = [$ALetter - $Extend];
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$ALetter2Ex = $ALetter2 $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidNumLetEx = $MidNumLet $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($Format* ($MidNumEx | $MidNumLetEx)? $Format* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNumber punctuation.
|
||||
#
|
||||
#$LetterSequence = $ALetterEx ($Format* ($MidLetterEx | $MidNumLetEx)? $Format* $ALetterEx)*; # rules #6, #7
|
||||
$WordGlue = $MidLetterEx | $MidNumLetEx;
|
||||
$MidWordFragment = ($WordGlue $ALetter2Ex | $WordGlue $Format+ $ALetterEx);
|
||||
$WordSequence = $ALetterEx ($Format* ($ALetterEx | $MidWordFragment))*;
|
||||
$WordSequence2 = $ALetter2Ex ($Format* ($ALetterEx | $MidWordFragment))*;
|
||||
$WordTail = ($Format* $NumberSequence $Format+ $WordSequence) | ($Format* $NumberSequence $WordSequence2?);
|
||||
($NumberSequence $Format+)? $WordSequence $WordTail* {200};
|
||||
($NumberSequence)? $WordSequence2 $WordTail* {200};
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($Format* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
[:IDEOGRAPHIC:] $Extend* {400};
|
||||
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
#
|
||||
# rule 3 and 4
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$ABaseLetterEx = $ABaseLetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidNumLetEx = $MidNumLet $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
[^$Format $Hiragana $Ideographic] $Extend* [$Extend - $ALetter];
|
||||
# letters should be left alone
|
||||
[^$Format $ALetter $Numeric $Hiragana $Ideographic] $Extend* $ACMLetter / [^$Extend];
|
||||
$NumericEx $ACMLetter / $MidLetter;
|
||||
|
||||
# rule 5
|
||||
|
||||
$ALetterEx ($Format* $ALetterEx)* {200};
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $ABaseLetterEx {200};
|
||||
$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $Format+ $ALetterEx {200};
|
||||
|
||||
# rule 8
|
||||
|
||||
$NumericEx ($Format* $NumericEx)* {100};
|
||||
|
||||
# rule 9
|
||||
|
||||
$ALetterEx $Format* $NumericEx {200};
|
||||
|
||||
# rule 10
|
||||
|
||||
$NumericEx $Format* $ALetterEx {200};
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx $Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx ($Format* $KatakanaEx)* {300};
|
||||
$Hiragana $Extend* {300} / [^$Extend];
|
||||
$Ideographic $Extend* {400} / [^$Extend];
|
||||
|
||||
# reverse rules!!
|
||||
!!reverse;
|
||||
|
||||
$BackALetterEx = $Extend* $ALetter;
|
||||
$BackABaseLetterEx = $Extend* $ABaseLetter;
|
||||
$BackACMLetterEx = $Extend* $ACMLetter;
|
||||
$BackNumericEx = $Extend* $Numeric;
|
||||
$BackMidNumEx = $Extend* $MidNum;
|
||||
$BackMidNumLetEx = $Extend* $MidNumLet;
|
||||
$BackMidLetterEx = $Extend* $MidLetter;
|
||||
$BackKatakanaEx = $Extend* $Katakana;
|
||||
|
||||
! $LF $CR;
|
||||
|
||||
! $Extend+ [^$Format];
|
||||
|
||||
# rule 5
|
||||
|
||||
$BackEndACMLetter = $Format+ $Extend* [^$ALetter $Numeric $MidLetter $MidNumLet];
|
||||
! $BackALetterEx $Format* $BackABaseLetterEx;
|
||||
! $BackALetterEx $Format* $BackACMLetterEx / $BackEndACMLetter;
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
! $BackABaseLetterEx ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackABaseLetterEx;
|
||||
! $BackABaseLetterEx ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackACMLetterEx / $BackEndACMLetter;
|
||||
! $BackALetterEx $Format+ ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackABaseLetterEx;
|
||||
! $BackALetterEx $Format+ ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackACMLetterEx / $BackEndACMLetter;
|
||||
|
||||
# rule 8
|
||||
|
||||
! $BackNumericEx $Format* $BackNumericEx;
|
||||
|
||||
# rule 9
|
||||
|
||||
! $BackNumericEx $Format* (($BackNumericEx | $BackALetterEx) $Format*)* $BackABaseLetterEx;
|
||||
! $BackNumericEx $Format* $BackACMLetterEx / $BackEndACMLetter; ## problem here
|
||||
|
||||
# rule 10
|
||||
|
||||
! ($BackALetterEx $Format*)+ $BackNumericEx;
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
! $BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
|
||||
|
||||
# rule 13
|
||||
|
||||
! $BackKatakanaEx $Format* $BackKatakanaEx;
|
||||
|
@ -283,25 +283,25 @@ $(BRK_FILES:.brk =.brk
|
||||
BRKDEPS = "$(ICUBLD)\$(ICUDT)uprops.icu" "$(ICUBLD)\$(ICUDT)unames.icu" "$(ICUBLD)\$(ICUDT)pnames.icu" "$(ICUBLD)\$(ICUDT)unorm.icu"
|
||||
|
||||
$(ICUDT)char.brk : "$(ICUBRK)\char.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\char.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\char.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
$(ICUDT)word.brk : "$(ICUBRK)\word.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\word.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\word.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
$(ICUDT)line.brk : "$(ICUBRK)\line.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\line.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\line.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
$(ICUDT)sent.brk : "$(ICUBRK)\sent.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\sent.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\sent.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
$(ICUDT)title.brk : "$(ICUBRK)\title.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\title.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\title.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
$(ICUDT)word_th.brk : "$(ICUBRK)\word_th.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\word_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\word_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
$(ICUDT)line_th.brk : "$(ICUBRK)\line_th.txt" $(BRKDEPS)
|
||||
genbrk -r "$(ICUBRK)\line_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
genbrk -c -r "$(ICUBRK)\line_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
|
||||
|
||||
|
||||
# utility target to send us to the right dir
|
||||
|
@ -839,6 +839,23 @@ void RBBIAPITest::TestRoundtripRules() {
|
||||
void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
|
||||
{
|
||||
if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API ");
|
||||
switch (index) {
|
||||
// case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
|
||||
case 0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
|
||||
case 1: name = "TestgetRules"; if (exec) TestgetRules(); break;
|
||||
case 2: name = "TestHashCode"; if (exec) TestHashCode(); break;
|
||||
case 3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
|
||||
case 4: name = "extra"; break; /* Extra */
|
||||
case 5: name = "TestBuilder"; if (exec) TestBuilder(); break;
|
||||
case 6: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
|
||||
case 7: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
|
||||
case 8: name = "TestBug2190"; if (exec) TestBug2190(); break;
|
||||
case 9: name = "TestRegistration"; if (exec) TestRegistration(); break;
|
||||
case 10: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
|
||||
|
||||
default: name = ""; break; /*needed to end loop*/
|
||||
}
|
||||
/*** TODO synwee
|
||||
switch (index) {
|
||||
// case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
|
||||
case 0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
|
||||
@ -846,8 +863,8 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
||||
case 2: name = "TestHashCode"; if (exec) TestHashCode(); break;
|
||||
case 3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
|
||||
case 4: name = "TestIteration"; if (exec) TestIteration(); break;
|
||||
case 5: name = "extra"; break; /* Extra */
|
||||
case 6: name = "extra"; break; /* Extra */
|
||||
case 5: name = "extra"; break; // Extra
|
||||
case 6: name = "extra"; break; // Extra
|
||||
case 7: name = "TestBuilder"; if (exec) TestBuilder(); break;
|
||||
case 8: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
|
||||
case 9: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
|
||||
@ -856,8 +873,9 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
||||
case 12: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
|
||||
case 13: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;
|
||||
|
||||
default: name = ""; break; /*needed to end loop*/
|
||||
default: name = ""; break; // needed to end loop
|
||||
}
|
||||
***/
|
||||
}
|
||||
|
||||
//---------------------------------------------
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
|
||||
#include "unicode/ustring.h"
|
||||
|
||||
#include "intltest.h"
|
||||
#include "rbbitst.h"
|
||||
@ -292,6 +293,41 @@ void RBBITest::TestStatusReturn() {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Debugging aid: writes one row of character-property data to stdout for
 * every code point of ustr.  Before the row of a code point whose UTF-16
 * offset appears in expected[], a dashed separator row carrying that offset
 * is printed.
 *
 * @param ustr           the text to dump.
 * @param expected       UTF-16 offsets to flag with a separator row; only
 *                       read when expectedcount is greater than zero, so it
 *                       may be NULL in that case.
 * @param expectedcount  number of entries in expected[].
 */
static void printStringBreaks(UnicodeString ustr, int expected[],
                              int expectedcount)
{
    char name[100];
    printf("code alpha extend alphanum type line name\n");
    for (int j = 0; j < ustr.length(); j ++) {
        // The loop simply does not run when expectedcount <= 0.
        for (int k = 0; k < expectedcount; k ++) {
            if (j == expected[k]) {
                printf("------------------------------------------------ %d\n",
                       j);
            }
        }

        UChar32 c = ustr.char32At(j);
        if (c > 0xffff) {
            // Supplementary code point: occupies two UTF-16 code units.
            j ++;
        }

        // Use a fresh error code for each character.  ICU functions return
        // without doing anything when handed a failing status, so a single
        // shared status would make every row after the first failure show a
        // stale name.
        UErrorCode status = U_ZERO_ERROR;
        name[0] = 0;
        u_charName(c, U_UNICODE_CHAR_NAME, name, (int32_t)sizeof(name), &status);
        name[sizeof(name) - 1] = 0;      // guarantee termination
        if (U_FAILURE(status)) {
            name[0] = 0;
        }

        // The property-name lookups may return NULL; never hand NULL to %s.
        const char *typeName = u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
                                                      u_charType(c),
                                                      U_SHORT_PROPERTY_NAME);
        const char *lineName = u_getPropertyValueName(UCHAR_LINE_BREAK,
                                                      u_getIntPropertyValue(c,
                                                          UCHAR_LINE_BREAK),
                                                      U_SHORT_PROPERTY_NAME);

        printf("%7x %5d %6d %8d %4s %4s %s\n", (unsigned int)c,
               (int)u_isUAlphabetic(c),
               (int)u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
               (int)u_isalnum(c),
               typeName != NULL ? typeName : "",
               lineName != NULL ? lineName : "",
               name);
    }
}
|
||||
|
||||
void RBBITest::TestThaiLineBreak() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BITestData thaiLineSelection(status);
|
||||
@ -517,36 +553,55 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
|
||||
switch (index) {
|
||||
|
||||
case 0: name = "TestJapaneseLineBreak";
|
||||
if(exec) TestJapaneseLineBreak(); break;
|
||||
case 1: name = "TestStatusReturn";
|
||||
if(exec) TestStatusReturn(); break;
|
||||
|
||||
case 2: name = "TestLineBreakData";
|
||||
if(exec) TestLineBreakData(); break;
|
||||
case 3: name = "TestEmptyString";
|
||||
if(exec) TestEmptyString(); break;
|
||||
|
||||
case 4: name = "TestGetAvailableLocales";
|
||||
if(exec) TestGetAvailableLocales(); break;
|
||||
|
||||
case 5: name = "TestGetDisplayName";
|
||||
if(exec) TestGetDisplayName(); break;
|
||||
|
||||
case 6: name = "TestEndBehaviour";
|
||||
if(exec) TestEndBehaviour(); break;
|
||||
case 7: name = "TestBug4153072";
|
||||
if(exec) TestBug4153072(); break;
|
||||
case 8: name = "TestWordBoundary";
|
||||
if(exec) TestWordBoundary(); break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
/*** TODO synwee
|
||||
switch (index) {
|
||||
case 0: name = "TestExtended";
|
||||
if(exec) TestExtended(); break;
|
||||
case 1: name = "TestJapaneseLineBrea";
|
||||
case 1: name = "TestJapaneseLineBreak";
|
||||
if(exec) TestJapaneseLineBreak(); break;
|
||||
case 2: name = "TestStatusReturn";
|
||||
if(exec) TestStatusReturn(); break;
|
||||
|
||||
case 3: name = "TestLineBreakData";
|
||||
if(exec) TestLineBreakData(); break;
|
||||
case 4: name = "TestSentenceInvariants";
|
||||
if(exec) TestSentenceInvariants(); break;
|
||||
case 5: name = "TestCharacterInvariants";
|
||||
if(exec) TestCharacterInvariants(); break;
|
||||
case 6: name = "TestWordInvariants";
|
||||
if(exec) TestWordInvariants(); break;
|
||||
|
||||
case 7: name = "TestEmptyString";
|
||||
case 4: name = "TestEmptyString";
|
||||
if(exec) TestEmptyString(); break;
|
||||
|
||||
case 8: name = "TestGetAvailableLocales";
|
||||
case 5: name = "TestGetAvailableLocales";
|
||||
if(exec) TestGetAvailableLocales(); break;
|
||||
|
||||
case 9: name = "TestGetDisplayName";
|
||||
case 6: name = "TestGetDisplayName";
|
||||
if(exec) TestGetDisplayName(); break;
|
||||
|
||||
case 10: name = "TestEndBehaviour";
|
||||
case 7: name = "TestEndBehaviour";
|
||||
if(exec) TestEndBehaviour(); break;
|
||||
case 11: name = "TestBug4153072";
|
||||
case 8: name = "TestBug4153072";
|
||||
if(exec) TestBug4153072(); break;
|
||||
case 12: name = "TestMonkey";
|
||||
case 9: name = "TestMonkey";
|
||||
if(exec) {
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
TestMonkey(params);
|
||||
@ -556,18 +611,25 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
}
|
||||
break;
|
||||
|
||||
case 13: name = "TestThaiLineBreak";
|
||||
case 10: name = "TestThaiLineBreak";
|
||||
if(exec) TestThaiLineBreak(); break;
|
||||
case 14: name = "TestMixedThaiLineBreak";
|
||||
case 11: name = "TestMixedThaiLineBreak";
|
||||
if(exec) TestMixedThaiLineBreak(); break;
|
||||
case 15: name = "TestMaiyamok";
|
||||
case 12: name = "TestMaiyamok";
|
||||
if(exec) TestMaiyamok(); break;
|
||||
case 16: name = "TestThaiWordBreak";
|
||||
case 13: name = "TestThaiWordBreak";
|
||||
if(exec) TestThaiWordBreak(); break;
|
||||
|
||||
|
||||
case 14: name = "TestWordBreaks";
|
||||
if(exec) TestWordBreaks(); break;
|
||||
case 15: name = "TestLineBreaks";
|
||||
if(exec) TestLineBreaks(); break;
|
||||
case 16: name = "TestWordBoundary";
|
||||
if(exec) TestWordBoundary(); break;
|
||||
case 17: name = "TestSentBreaks";
|
||||
if(exec) TestSentBreaks(); break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
***/
|
||||
}
|
||||
|
||||
|
||||
@ -918,6 +980,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
||||
}
|
||||
}
|
||||
if (!seen2) {
|
||||
printStringBreaks(work, NULL, 0);
|
||||
errln("No Break between \\U%04x and \\U%04x", c1, c2);
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
@ -1201,14 +1264,18 @@ void RBBITest::executeTest(TestParams *t) {
|
||||
// and this one.
|
||||
for (i=prevBP+1; i<bp; i++) {
|
||||
if (t->expectedBreaks->elementAti(i) != 0) {
|
||||
errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
int expected[] = {0, i};
|
||||
printStringBreaks(t->dataToBreak, expected, 2);
|
||||
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Check that the break we did find was expected
|
||||
if (t->expectedBreaks->elementAti(bp) == 0) {
|
||||
errln("Forward Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
|
||||
int expected[] = {0, bp};
|
||||
printStringBreaks(t->dataToBreak, expected, 2);
|
||||
errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
||||
} else {
|
||||
// The break was expected.
|
||||
@ -1219,7 +1286,7 @@ void RBBITest::executeTest(TestParams *t) {
|
||||
}
|
||||
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
|
||||
if (rs != expectedTagVal) {
|
||||
errln("Incorrect status for break. Pos=%4d File line,col= %4d,%4d.\n"
|
||||
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
|
||||
" Actual, Expected status = %4d, %4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
|
||||
}
|
||||
@ -1232,7 +1299,7 @@ void RBBITest::executeTest(TestParams *t) {
|
||||
// Verify that there were no missed expected breaks after the last one found
|
||||
for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
|
||||
if (t->expectedBreaks->elementAti(i) != 0) {
|
||||
errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
||||
}
|
||||
}
|
||||
@ -1271,7 +1338,7 @@ void RBBITest::executeTest(TestParams *t) {
|
||||
}
|
||||
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
|
||||
if (rs != expectedTagVal) {
|
||||
errln("Incorrect status for break. Pos=%4d File line,col= %4d,%4d.\n"
|
||||
errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
|
||||
" Actual, Expected status = %4d, %4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
|
||||
}
|
||||
@ -2601,6 +2668,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
int32_t nextCPPos; // Index of the code point following "pos."
|
||||
// May point to a combining mark.
|
||||
int32_t tPos; // temp value.
|
||||
UChar32 c;
|
||||
|
||||
if (startPos >= fText->length()) {
|
||||
return -1;
|
||||
@ -2699,7 +2767,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
|
||||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = nextCPPos;
|
||||
UChar32 c = fText->char32At(nextPos);
|
||||
c = fText->char32At(nextPos);
|
||||
rule67Adjust(pos, &thisChar, &nextPos, &c);
|
||||
|
||||
// If the loop is still warming up - if we haven't shifted the initial
|
||||
@ -2742,8 +2810,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
|
||||
goto fall_through_9;
|
||||
}
|
||||
}
|
||||
/***
|
||||
for (tPos=prevPos; ; tPos = fText->moveIndex32(tPos, -1)) {
|
||||
if (fOP->contains(fText->char32At(tPos))) {
|
||||
break;
|
||||
}
|
||||
if ((fSP->contains(fText->char32At(tPos)) ||
|
||||
fCM->contains(fText->char32At(tPos))) == FALSE
|
||||
|| tPos == 0) {
|
||||
goto fall_through_9;
|
||||
}
|
||||
|
||||
}
|
||||
***/
|
||||
// We match OP SP* x
|
||||
// No break at this position.
|
||||
// Continue the outer loop.
|
||||
@ -2932,6 +3012,277 @@ static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t d
|
||||
}
|
||||
#endif
|
||||
|
||||
void RBBITest::TestWordBreaks(void)
|
||||
{
|
||||
// <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
|
||||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[25];
|
||||
char *strlist[] =
|
||||
{"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
|
||||
"\\u2027\\U000e0067\\u0a47\\u00b7",
|
||||
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
|
||||
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
|
||||
"\\u0589\\U000e006e\\u0a42\\U000104a5",
|
||||
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
|
||||
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
|
||||
"\\u0027\\u11af\\U000e0057\\u0602",
|
||||
"\\U0001d7f2\\U000e007\\u0004\\u0589",
|
||||
"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
|
||||
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
|
||||
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
|
||||
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
|
||||
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
|
||||
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
|
||||
"\\u0233\\U000e0020\\u0a69\\u0d6a",
|
||||
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
|
||||
"\\u58f4\\U000e0049\\u20e7\\u2027",
|
||||
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
|
||||
"\\ua183\\u102d\\u0bec\\u003a",
|
||||
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
|
||||
"\\u003a\\u0e57\\u0fad\\u002e",
|
||||
"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
|
||||
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
|
||||
"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
|
||||
"\\u003a\\u0664\\u00b7\\u1fba",
|
||||
"\\u003b\\u0027\\u00b7\\u47a3",
|
||||
"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
|
||||
};
|
||||
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 25);
|
||||
UnicodeString ustr(str);
|
||||
// RBBICharMonkey monkey;
|
||||
RBBIWordMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
int forward[20];
|
||||
int expectedcount = 0;
|
||||
|
||||
monkey.setText(ustr);
|
||||
for (int i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
|
||||
expected[expectedcount ++] = i;
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
bi->setText(ustr);
|
||||
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
if (count > 20 || expected[count] != i) {
|
||||
errln("happy break forward test failed: expected %d but got %d",
|
||||
expected[count], i);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
if (count != expectedcount) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test failed: missed a match");
|
||||
break;
|
||||
}
|
||||
for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test reverse failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (count != 0) {
|
||||
errln("happy break test failed: missed a match");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RBBITest::TestWordBoundary(void)
|
||||
{
|
||||
// <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
|
||||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[20];
|
||||
char *strlist[] =
|
||||
{"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
|
||||
"\\u2027\\U000e0067\\u0a47\\u00b7",
|
||||
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
|
||||
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
|
||||
"\\u0589\\U000e006e\\u0a42\\U000104a5",
|
||||
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
|
||||
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
|
||||
"\\u0027\\u11af\\U000e0057\\u0602",
|
||||
"\\U0001d7f2\\U000e007\\u0004\\u0589",
|
||||
"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
|
||||
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
|
||||
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
|
||||
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
|
||||
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
|
||||
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
|
||||
"\\u0233\\U000e0020\\u0a69\\u0d6a",
|
||||
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
|
||||
"\\u58f4\\U000e0049\\u20e7\\u2027",
|
||||
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
|
||||
"\\ua183\\u102d\\u0bec\\u003a",
|
||||
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
|
||||
"\\u003a\\u0e57\\u0fad\\u002e",
|
||||
"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
|
||||
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
|
||||
"\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
|
||||
"\\u003a\\u0664\\u00b7\\u1fba",
|
||||
"\\u003b\\u0027\\u00b7\\u47a3",
|
||||
};
|
||||
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 20);
|
||||
UnicodeString ustr(str);
|
||||
int forward[20];
|
||||
int count = 0;
|
||||
|
||||
bi->setText(ustr);
|
||||
int prev = 0;
|
||||
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count ++] = i;
|
||||
if (i > prev) {
|
||||
for (int j = prev + 1; j < i; j ++) {
|
||||
if (bi->isBoundary(j)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d not a boundary",
|
||||
j);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!bi->isBoundary(i)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d a boundary",
|
||||
i);
|
||||
break;
|
||||
}
|
||||
prev = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RBBITest::TestLineBreaks(void)
|
||||
{
|
||||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
UChar str[20];
|
||||
char *strlist[] =
|
||||
{"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
|
||||
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
|
||||
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
|
||||
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
|
||||
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
|
||||
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
|
||||
"\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
|
||||
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
|
||||
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
|
||||
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
|
||||
"\\u2014\\u0020\\u000a\\u17c5\\u24fc",
|
||||
"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
|
||||
"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
|
||||
"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
|
||||
"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
|
||||
"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
|
||||
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
|
||||
};
|
||||
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 20);
|
||||
UnicodeString ustr(str);
|
||||
// RBBICharMonkey monkey;
|
||||
RBBILineMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
int forward[20];
|
||||
int expectedcount = 0;
|
||||
|
||||
monkey.setText(ustr);
|
||||
for (int i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
|
||||
expected[expectedcount ++] = i;
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
bi->setText(ustr);
|
||||
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
if (count < expectedcount && expected[count] != i) {
|
||||
errln("happy break forward test failed: expected %d but got %d",
|
||||
expected[count], i);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
if (count != expectedcount) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test failed: missed %d match",
|
||||
expectedcount - count);
|
||||
}
|
||||
for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test reverse failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (count != 0) {
|
||||
errln("happy break test failed: missed a match");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RBBITest::TestSentBreaks(void)
|
||||
{
|
||||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
|
||||
UChar str[100];
|
||||
char *strlist[] =
|
||||
{"This\n",
|
||||
"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
|
||||
"\"Sentence ending with a quote.\" Bye.",
|
||||
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
|
||||
"Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
|
||||
"Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
|
||||
"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
|
||||
"Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
|
||||
"Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
|
||||
};
|
||||
for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 100);
|
||||
UnicodeString ustr(str);
|
||||
|
||||
int forward[20];
|
||||
|
||||
int count = 0;
|
||||
bi->setText(ustr);
|
||||
for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count ++] = i;
|
||||
}
|
||||
int tempcount = count;
|
||||
for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
tempcount --;
|
||||
if (forward[tempcount] != i) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy break test reverse failed: expected %d but got %d",
|
||||
forward[tempcount], i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tempcount != 0) {
|
||||
errln("happy break test failed: missed a match");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RBBITest::TestMonkey(char *params) {
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
@ -3119,7 +3470,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
const char *errorType = NULL;
|
||||
if (forwardBreaks[i] != expectedBreaks[i]) {
|
||||
errorType = "next()";
|
||||
} else if (reverseBreaks[i] != expectedBreaks[i]) {
|
||||
} else if (reverseBreaks[i] != forwardBreaks[i]) {
|
||||
errorType = "previous()";
|
||||
} else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
|
||||
errorType = "isBoundary()";
|
||||
@ -3135,23 +3486,39 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
int startContext = i;
|
||||
for (;;) {
|
||||
if (startContext==0) { break; }
|
||||
startContext--;
|
||||
startContext --;
|
||||
if (expectedBreaks[startContext] != 0) {break;}
|
||||
}
|
||||
|
||||
// End of range is two expected breaks past the start position.
|
||||
int endContext = i+1;
|
||||
int endContext = i + 1;
|
||||
int ci;
|
||||
for (ci=0; ci<2; ci++) { // Number of items to include in error text.
|
||||
for (;;) {
|
||||
if (endContext >= testText.length()) {break;}
|
||||
if (expectedBreaks[endContext-1] != 0) { break;}
|
||||
endContext++;
|
||||
endContext ++;
|
||||
}
|
||||
}
|
||||
|
||||
// Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
|
||||
UnicodeString errorText = "<data>";
|
||||
/***
|
||||
if (strcmp(errorType, "previous()") == 0) {
|
||||
startContext = 0;
|
||||
int j = i;
|
||||
while (true) {
|
||||
if (reverseBreaks[j ++] != 0) {
|
||||
printf("%d\n", j);
|
||||
break;
|
||||
}
|
||||
if (j % 100 == 0) {
|
||||
printf("continue %d\n", j);
|
||||
}
|
||||
}
|
||||
endContext = j - 1;
|
||||
}
|
||||
***/
|
||||
for (ci=startContext; ci<endContext;) {
|
||||
UnicodeString hexChars("0123456789abcdef");
|
||||
UChar32 c;
|
||||
@ -3181,7 +3548,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
errorText.append("</data>\n");
|
||||
|
||||
// Output the error
|
||||
char charErrorTxt[100];
|
||||
char charErrorTxt[500];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
|
||||
charErrorTxt[sizeof(charErrorTxt)-1] = 0;
|
||||
|
@ -70,8 +70,10 @@ public:
|
||||
UChar *ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status);
|
||||
void executeTest(TestParams *);
|
||||
|
||||
|
||||
|
||||
void TestWordBreaks();
|
||||
void TestWordBoundary();
|
||||
void TestLineBreaks();
|
||||
void TestSentBreaks();
|
||||
|
||||
|
||||
/***********************/
|
||||
|
Loading…
Reference in New Issue
Block a user