ICU-2292 added support for old data rules
X-SVN-Rev: 13614
This commit is contained in:
parent
1ef0ff982e
commit
ab056703bd
@ -405,51 +405,49 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// old rule syntax
|
||||
// set things up. handlePrevious() will back us up to some valid
|
||||
// break position before the current position (we back our internal
|
||||
// iterator up one step to prevent handlePrevious() from returning
|
||||
// the current position), but not necessarily the last one before
|
||||
// where we started
|
||||
|
||||
int32_t start = current();
|
||||
|
||||
fText->previous32();
|
||||
int32_t lastResult = (fData->fSafeRevTable != NULL) ?
|
||||
handleNewPrevious(): handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
int32_t lastTag = 0;
|
||||
UBool breakTagValid = FALSE;
|
||||
|
||||
// iterate forward from the known break position until we pass our
|
||||
// starting point. The last break position before the starting
|
||||
// point is our return value
|
||||
|
||||
for (;;) {
|
||||
// set things up. handlePrevious() will back us up to a safe position
|
||||
// before the current position to at most 2 breaks beyond. the
|
||||
// backwards rules may occasionally move the position to less than a
|
||||
// break beyond
|
||||
|
||||
int32_t safe = handlePrevious();
|
||||
return safe;
|
||||
/*** int32_t result = handleNext();
|
||||
// moving forward to a boundary.
|
||||
if (result < start) {
|
||||
fLastBreakTag = 0; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = TRUE; // handlenext called
|
||||
/// return lastResult;
|
||||
return result;
|
||||
result = handleNext();
|
||||
if (result == BreakIterator::DONE || result >= start) {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
fText->setIndex(safe);
|
||||
if (safe == fText->startIndex()) {
|
||||
// if we are at the start of the text and result == start
|
||||
// this means that we are already at the previous break
|
||||
fLastBreakTag = 0; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = FALSE;
|
||||
return safe;
|
||||
}
|
||||
}
|
||||
***/
|
||||
/// lastResult = result;
|
||||
/// lastTag = fLastBreakTag;
|
||||
/// breakTagValid = TRUE;
|
||||
|
||||
// fLastBreakTag wants to have the value for section of text preceding
|
||||
// the result position that we are to return (in lastResult.) If
|
||||
// the backwards rules overshot and the above loop had to do two or more
|
||||
// handleNext()s to move up to the desired return position, we will have a valid
|
||||
// tag value. But, if handlePrevious() took us to exactly the correct result positon,
|
||||
// we wont have a tag value for that position, which is only set by handleNext().
|
||||
|
||||
|
||||
/// fText->setIndex(lastResult);
|
||||
/// fLastBreakTag = lastTag; // for use by getRuleStatus()
|
||||
/// fLastBreakTagValid = breakTagValid;
|
||||
/// return lastResult;
|
||||
lastResult = result;
|
||||
lastTag = fLastBreakTag;
|
||||
breakTagValid = TRUE;
|
||||
}
|
||||
|
||||
// fLastBreakTag wants to have the value for section of text preceding
|
||||
// the result position that we are to return (in lastResult.) If
|
||||
// the backwards rules overshot and the above loop had to do two or more
|
||||
// handleNext()s to move up to the desired return position, we will have a valid
|
||||
// tag value. But, if handlePrevious() took us to exactly the correct result positon,
|
||||
// we wont have a tag value for that position, which is only set by handleNext().
|
||||
|
||||
// set the current iteration position to be the last break position
|
||||
// before where we started, and then return that value
|
||||
fText->setIndex(lastResult);
|
||||
fLastBreakTag = lastTag; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = breakTagValid;
|
||||
return lastResult;
|
||||
}
|
||||
|
||||
|
||||
@ -482,21 +480,34 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// otherwise, set our internal iteration position (temporarily)
|
||||
// to the position passed in. If this is the _beginning_ position,
|
||||
// then we can just use next() to get our return value
|
||||
/// todo synwee
|
||||
/// fText->setIndex(offset);
|
||||
fText->setIndex(fText->startIndex());
|
||||
/// if (offset == fText->startIndex())
|
||||
/// return handleNext();
|
||||
|
||||
int32_t result = 0;
|
||||
|
||||
// otherwise, we have to sync up first. Use handlePrevious() to back
|
||||
// us up to a known break position before the specified position (if
|
||||
// we can determine that the specified position is a break position,
|
||||
// we don't back up at all). This may or may not be the last break
|
||||
// position at or before our starting position. Advance forward
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
if (fData->fSafeRevTable != NULL) {
|
||||
// new rule syntax
|
||||
/// todo synwee
|
||||
/// fText->setIndex(offset);
|
||||
fText->setIndex(fText->startIndex());
|
||||
|
||||
result = fText->startIndex();
|
||||
}
|
||||
else {
|
||||
// otherwise, we have to sync up first. Use handlePrevious() to back
|
||||
// us up to a known break position before the specified position (if
|
||||
// we can determine that the specified position is a break position,
|
||||
// we don't back up at all). This may or may not be the last break
|
||||
// position at or before our starting position. Advance forward
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
// old rule syntax
|
||||
|
||||
fText->setIndex(offset);
|
||||
if (offset == fText->startIndex()) {
|
||||
return handleNext();
|
||||
}
|
||||
result = previous();
|
||||
}
|
||||
|
||||
int32_t result = fText->startIndex();/// previous();
|
||||
while (result != BreakIterator::DONE && result <= offset) {
|
||||
result = next();
|
||||
}
|
||||
@ -525,17 +536,22 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
// if we start by updating the current iteration position to the
|
||||
// position specified by the caller, we can just use previous()
|
||||
// to carry out this operation
|
||||
/// todo synwee
|
||||
/// fText->setIndex(offset);
|
||||
|
||||
/// return previous();
|
||||
int32_t result = fText->endIndex();
|
||||
fText->setIndex(result);
|
||||
while (result != BreakIterator::DONE && result >= offset) {
|
||||
result = next();
|
||||
if (fData->fSafeRevTable != NULL) {
|
||||
/// todo synwee
|
||||
// new rule syntax
|
||||
int32_t result = fText->endIndex();
|
||||
fText->setIndex(result);
|
||||
while (result != BreakIterator::DONE && result >= offset) {
|
||||
result = previous();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
return result;
|
||||
// old rule syntax
|
||||
fText->setIndex(offset);
|
||||
return previous();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -759,6 +775,130 @@ continueOn:
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
if (fText == NULL || fData == NULL) {
|
||||
return 0;
|
||||
}
|
||||
if (fData->fReverseTable == NULL) {
|
||||
return fText->setToStart();
|
||||
}
|
||||
|
||||
int32_t state = START_STATE;
|
||||
int32_t category;
|
||||
int32_t lastCategory = 0;
|
||||
int32_t result = fText->getIndex();
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
int32_t lookaheadTag = 0;
|
||||
UChar32 c = fText->current32();
|
||||
RBBIStateTableRow *row;
|
||||
|
||||
row = (RBBIStateTableRow *)
|
||||
(this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
if (fTrace) {
|
||||
RBBIDebugPrintf("Handle Prev pos char state category \n");
|
||||
}
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
for (;;) {
|
||||
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
break;
|
||||
}
|
||||
|
||||
// save the last character's category and look up the current
|
||||
// character's category
|
||||
lastCategory = category;
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
if (fTrace) {
|
||||
RBBIDebugPrintf(" %4d ", fText->getIndex());
|
||||
if (0x20<=c && c<0x7f) {
|
||||
RBBIDebugPrintf("\"%c\" ", c);
|
||||
} else {
|
||||
RBBIDebugPrintf("%5x ", c);
|
||||
}
|
||||
RBBIDebugPrintf("%3d %3d\n", state, category);
|
||||
}
|
||||
|
||||
// look up a state transition in the backwards state table
|
||||
state = row->fNextState[category];
|
||||
row = (RBBIStateTableRow *)
|
||||
(this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
|
||||
|
||||
if (row->fAccepting == 0 && row->fLookAhead == 0) {
|
||||
// No match, nothing of interest happening, common case.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, no lookahead involved.
|
||||
result = fText->getIndex();
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting == 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match point. Remember it, but only if no other rule
|
||||
// has unconditionally matched to this point.
|
||||
// TODO: handle case where there's a pending match from a different rule
|
||||
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
|
||||
int32_t r = fText->getIndex();
|
||||
if (r > result) {
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTag = row->fTag;
|
||||
}
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
if (row->fAccepting != 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
if (lookaheadResult > result) {
|
||||
U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||
// of overlapping lookahead matches.
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
break;
|
||||
}
|
||||
|
||||
// then advance one character backwards
|
||||
c = fText->previous32();
|
||||
}
|
||||
|
||||
// Note: the result postion isn't what is returned to the user by previous(),
|
||||
// but where the implementation of previous() turns around and
|
||||
// starts iterating forward again.
|
||||
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
result = fText->startIndex();
|
||||
}
|
||||
fText->setIndex(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handlePrevious()
|
||||
@ -773,7 +913,7 @@ continueOn:
|
||||
// The logic of this function is very similar to handleNext(), above.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
||||
if (fText == NULL || fData == NULL) {
|
||||
return 0;
|
||||
}
|
||||
@ -886,7 +1026,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) { /// && lookaheadStatus == 0) {
|
||||
if (state == STOP_STATE) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -898,10 +1038,6 @@ continueOn:
|
||||
// Note: the result postion isn't what is returned to the user by previous(),
|
||||
// but where the implementation of previous() turns around and
|
||||
// starts iterating forward again.
|
||||
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
/*** if (hasPassedStartText && row->fLookAhead) {
|
||||
return fText->setToStart();
|
||||
}***/
|
||||
fText->setIndex(result);
|
||||
|
||||
return result;
|
||||
|
@ -472,6 +472,17 @@ protected:
|
||||
*/
|
||||
void init();
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* This method backs the iterator back up to a "safe position" in the text.
|
||||
* This is a position that we know, without any context, must be a break position.
|
||||
* The various calling methods then iterate forward from this safe position to
|
||||
* the appropriate position to return. (For more information, see the description
|
||||
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
|
||||
* @internal
|
||||
*/
|
||||
int32_t handleNewPrevious(void);
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -31,16 +31,23 @@ $LVT = [:Hangul_Syllable_Type = LVT:];
|
||||
|
||||
$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
|
||||
|
||||
#
|
||||
# Forward Break Rules
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
([^$Control] | $HangulSyllable) $Extend*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
#
|
||||
# Reverse Rule, back up to the beginning of some preceding grapheme cluster.
|
||||
#
|
||||
$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
|
||||
$BackOneCluster = ($LF $CR) | ($Extend* ([^$Control] | $BackHangulSyllable));
|
||||
! $BackOneCluster;
|
||||
$BackOneCluster;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
$BackOneCluster;
|
@ -553,38 +553,10 @@ void RBBITest::TestThaiWordBreak() {
|
||||
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
|
||||
{
|
||||
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
|
||||
|
||||
switch (index) {
|
||||
case 0: name = "TestJapaneseLineBreak";
|
||||
if(exec) TestJapaneseLineBreak(); break;
|
||||
case 1: name = "TestStatusReturn";
|
||||
if(exec) TestStatusReturn(); break;
|
||||
|
||||
case 2: name = "TestLineBreakData";
|
||||
if(exec) TestLineBreakData(); break;
|
||||
case 3: name = "TestEmptyString";
|
||||
if(exec) TestEmptyString(); break;
|
||||
|
||||
case 4: name = "TestGetAvailableLocales";
|
||||
if(exec) TestGetAvailableLocales(); break;
|
||||
|
||||
case 5: name = "TestGetDisplayName";
|
||||
if(exec) TestGetDisplayName(); break;
|
||||
|
||||
case 6: name = "TestEndBehaviour";
|
||||
if(exec) TestEndBehaviour(); break;
|
||||
case 7: name = "TestBug4153072";
|
||||
case 0: name = "TestBug4153072";
|
||||
if(exec) TestBug4153072(); break;
|
||||
case 8: name = "TestWordBreaks";
|
||||
if(exec) TestWordBreaks(); break;
|
||||
case 9: name = "TestWordBoundary";
|
||||
if(exec) TestWordBoundary(); break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
|
||||
/***
|
||||
switch (index) {
|
||||
case 0: name = "TestExtended";
|
||||
if(exec) TestExtended(); break;
|
||||
case 1: name = "TestJapaneseLineBreak";
|
||||
if(exec) TestJapaneseLineBreak(); break;
|
||||
case 2: name = "TestStatusReturn";
|
||||
@ -603,9 +575,26 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
|
||||
case 7: name = "TestEndBehaviour";
|
||||
if(exec) TestEndBehaviour(); break;
|
||||
case 8: name = "TestBug4153072";
|
||||
if(exec) TestBug4153072(); break;
|
||||
case 9: name = "TestMonkey";
|
||||
case 8: name = "TestMixedThaiLineBreak";
|
||||
if(exec) TestMixedThaiLineBreak(); break;
|
||||
case 9: name = "TestThaiWordBreak";
|
||||
if(exec) TestThaiWordBreak(); break;
|
||||
case 10: name = "TestThaiLineBreak";
|
||||
if(exec) TestThaiLineBreak(); break;
|
||||
case 11: name = "TestMaiyamok";
|
||||
if(exec) TestMaiyamok(); break;
|
||||
case 12: name = "TestWordBreaks";
|
||||
if(exec) TestWordBreaks(); break;
|
||||
case 13: name = "TestWordBoundary";
|
||||
if(exec) TestWordBoundary(); break;
|
||||
/***
|
||||
case 14: name = "TestLineBreaks";
|
||||
if(exec) TestLineBreaks(); break;
|
||||
case 15: name = "TestSentBreaks";
|
||||
if(exec) TestSentBreaks(); break;
|
||||
case 16: name = "TestExtended";
|
||||
if(exec) TestExtended(); break;
|
||||
case 17: name = "TestMonkey";
|
||||
if(exec) {
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
TestMonkey(params);
|
||||
@ -613,26 +602,10 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
|
||||
#endif
|
||||
}
|
||||
***/
|
||||
break;
|
||||
|
||||
case 10: name = "TestThaiLineBreak";
|
||||
if(exec) TestThaiLineBreak(); break;
|
||||
case 11: name = "TestMixedThaiLineBreak";
|
||||
if(exec) TestMixedThaiLineBreak(); break;
|
||||
case 12: name = "TestMaiyamok";
|
||||
if(exec) TestMaiyamok(); break;
|
||||
case 13: name = "TestThaiWordBreak";
|
||||
if(exec) TestThaiWordBreak(); break;
|
||||
case 14: name = "TestWordBreaks";
|
||||
if(exec) TestWordBreaks(); break;
|
||||
case 15: name = "TestLineBreaks";
|
||||
if(exec) TestLineBreaks(); break;
|
||||
case 16: name = "TestWordBoundary";
|
||||
if(exec) TestWordBoundary(); break;
|
||||
case 17: name = "TestSentBreaks";
|
||||
if(exec) TestSentBreaks(); break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
} ***/
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user