ICU-1117 add getRuleStatus() to RBBI

X-SVN-Rev: 8956
This commit is contained in:
Andy Heninger 2002-06-27 01:50:22 +00:00
parent 37792a8277
commit 878c84b1d2
11 changed files with 116 additions and 28 deletions

View File

@ -155,7 +155,7 @@ void RuleBasedBreakIterator::init() {
fText = NULL; fText = NULL;
fData = NULL; fData = NULL;
fCharMappings = NULL; fCharMappings = NULL;
fLastBreakStatus = 0; fLastBreakTag = 0;
fDictionaryCharCount = 0; fDictionaryCharCount = 0;
if (debugInitDone == FALSE) { if (debugInitDone == FALSE) {
@ -494,6 +494,9 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
UChar32 c = fText->current32(); UChar32 c = fText->current32();
RBBIStateTableRow *row; RBBIStateTableRow *row;
int32_t lookaheadStatus = 0; int32_t lookaheadStatus = 0;
int32_t lookaheadTag = 0;
fLastBreakTag = 0;
row = (RBBIStateTableRow *) row = (RBBIStateTableRow *)
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state)); (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
@ -550,10 +553,13 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
goto continueOn; goto continueOn;
} }
if (row->fAccepting != 0 && row->fLookAhead == 0) { if (row->fAccepting == -1) {
// Match found, common case, no lookahead involved. // Match found, common case, no lookahead involved.
// (It's possible that some lookahead rule matched here also,
// but since there's an unconditional match, we'll favor that.)
result = fText->getIndex(); result = fText->getIndex();
lookaheadStatus = 0; // clear out any pending look-ahead matches. lookaheadStatus = 0; // clear out any pending look-ahead matches.
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
goto continueOn; goto continueOn;
} }
@ -566,6 +572,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
if (r > result) { if (r > result) {
lookaheadResult = r; lookaheadResult = r;
lookaheadStatus = row->fLookAhead; lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
} }
goto continueOn; goto continueOn;
} }
@ -577,6 +584,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
assert(row->fAccepting == lookaheadStatus); // TODO: handle this case assert(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches. // of overlapping lookahead matches.
result = lookaheadResult; result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0; lookaheadStatus = 0;
} }
goto continueOn; goto continueOn;
@ -631,6 +639,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
int32_t result = fText->getIndex(); int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0; int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0; int32_t lookaheadResult = 0;
int32_t lookaheadTag = 0;
UChar32 c = fText->current32(); UChar32 c = fText->current32();
RBBIStateTableRow *row; RBBIStateTableRow *row;
@ -685,7 +694,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
goto continueOn; goto continueOn;
} }
if (row->fAccepting != 0 && row->fLookAhead == 0) { if (row->fAccepting == -1) {
// Match found, common case, no lookahead involved. // Match found, common case, no lookahead involved.
result = fText->getIndex(); result = fText->getIndex();
lookaheadStatus = 0; // clear out any pending look-ahead matches. lookaheadStatus = 0; // clear out any pending look-ahead matches.
@ -694,13 +703,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
if (row->fAccepting == 0 && row->fLookAhead != 0) { if (row->fAccepting == 0 && row->fLookAhead != 0) {
// Lookahead match point. Remember it, but only if no other rule // Lookahead match point. Remember it, but only if no other rule
// has unconditinally matched to this point. // has unconditionally matched to this point.
// TODO: handle case where there's a pending match from a different rule // TODO: handle case where there's a pending match from a different rule
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead. // where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
int32_t r = fText->getIndex(); int32_t r = fText->getIndex();
if (r > result) { if (r > result) {
lookaheadResult = r; lookaheadResult = r;
lookaheadStatus = row->fLookAhead; lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
} }
goto continueOn; goto continueOn;
} }
@ -712,6 +722,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
assert(row->fAccepting == lookaheadStatus); // TODO: handle this case assert(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches. // of overlapping lookahead matches.
result = lookaheadResult; result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0; lookaheadStatus = 0;
} }
goto continueOn; goto continueOn;
@ -752,8 +763,8 @@ RuleBasedBreakIterator::reset()
// getRuleStatus() // getRuleStatus()
// //
//------------------------------------------------------------------------------- //-------------------------------------------------------------------------------
int16_t RuleBasedBreakIterator::getRuleStatus() const { int32_t RuleBasedBreakIterator::getRuleStatus() const {
return fLastBreakStatus; return fLastBreakTag;
} }
@ -764,13 +775,13 @@ int16_t RuleBasedBreakIterator::getRuleStatus() const {
// for standard iterator types. // for standard iterator types.
// //
//------------------------------------------------------------------------------- //-------------------------------------------------------------------------------
const uint8_t *RuleBasedBreakIterator::getFlattenedData(uint32_t *length) { const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
const uint8_t *retPtr = NULL; const uint8_t *retPtr = NULL;
*length = 0; length = 0;
if (fData != NULL) { if (fData != NULL) {
retPtr = (const uint8_t *)fData->fHeader; retPtr = (const uint8_t *)fData->fHeader;
*length = fData->fHeader->fLength; length = fData->fHeader->fLength;
} }
return retPtr; return retPtr;
} }

View File

@ -164,10 +164,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
//----------------------------------------------------------------------------------------
// //
// RulesBasedBreakIterator, construct from source rules that are passed in // createRuleBasedBreakIterator construct from source rules that are passed in
// in a UnicodeString // in a UnicodeString
// //
//----------------------------------------------------------------------------------------
BreakIterator * BreakIterator *
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError, UParseError &parseError,

View File

@ -101,7 +101,7 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30 , {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31 , {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32 , {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32
, {doExprCatOperator, 123 /*{*/, 49,0, FALSE} // 33 , {doExprCatOperator, 123 /*{*/, 49,0, TRUE} // 33
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34 , {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35 , {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35
, {doExprFinished, 255, 255,0, FALSE} // 36 , {doExprFinished, 255, 255,0, FALSE} // 36

View File

@ -129,7 +129,7 @@ expr-cont:
'$' term doExprCatOperator '$' term doExprCatOperator
'.' term doExprCatOperator '.' term doExprCatOperator
'/' look-ahead doExprCatOperator '/' look-ahead doExprCatOperator
'{' tag-open doExprCatOperator '{' n tag-open doExprCatOperator
'|' n term doExprOrOperator '|' n term doExprOrOperator
')' n pop doExprRParen ')' n pop doExprRParen
default pop doExprFinished default pop doExprFinished

View File

@ -451,13 +451,15 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action,
case doTagDigit: case doTagDigit:
// Just scanned a decimal digit that's part of a tag value // Just scanned a decimal digit that's part of a tag value
{ {
n = fNodeStack[fNodeStackPtr];
uint32_t v = u_charDigitValue(fC.fChar); uint32_t v = u_charDigitValue(fC.fChar);
assert(v >= 0); assert(v >= 0);
n->fVal *= v; n->fVal = n->fVal*10 + v;
break; break;
} }
case doTagValue: case doTagValue:
n = fNodeStack[fNodeStackPtr];
n->fLastPos = fNextIndex; n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
break; break;
@ -952,6 +954,19 @@ void RBBIRuleScanner::parse() {
} }
//
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
//
if (fRB->fReverseTree == NULL) {
fRB->fReverseTree = pushNewNode(RBBINode::opStar);
RBBINode *operand = pushNewNode(RBBINode::setRef);
findSetFor(kAny, operand);
fRB->fReverseTree->fLeftChild = operand;
operand->fParent = fRB->fReverseTree;
fNodeStackPtr -= 2;
}
// //
// Parsing of the input RBBI rules is complete. // Parsing of the input RBBI rules is complete.
// We now have a parse tree for the rule expressions // We now have a parse tree for the rule expressions

View File

@ -111,6 +111,7 @@ void RBBITableBuilder::build() {
buildStateTable(); buildStateTable();
flagAcceptingStates(); flagAcceptingStates();
flagLookAheadStates(); flagLookAheadStates();
flagTaggedStates();
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();}; if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();};
} }

View File

@ -201,7 +201,7 @@ protected:
// //
RBBIDataWrapper *fData; RBBIDataWrapper *fData;
UTrie *fCharMappings; UTrie *fCharMappings;
int16_t fLastBreakStatus; int32_t fLastBreakTag; // Rule {tag} value for the most recent match.
// //
// Counter for the number of characters encountered with the "dictionary" // Counter for the number of characters encountered with the "dictionary"
@ -414,7 +414,7 @@ protected:
* within brackets, {123}, for example. For rules that do not specify a * within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned. * status, a default value of 0 is returned.
*/ */
virtual int16_t getRuleStatus() const; virtual int32_t getRuleStatus() const;
/** /**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override. * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
@ -446,17 +446,20 @@ protected:
/** /**
* Return the flattened form of compiled break rules, * Return the binary form of compiled break rules,
* which can then be used to create a new break iterator at some * which can then be used to create a new break iterator at some
* time in the future. Creating a break iterator in this way * time in the future. Creating a break iterator in this way
* is much faster than building one from the source form of the * is much faster than building one from the source form of the
* break rules. * break rules.
* *
* @return A pointer to the flattened rule data. The storage * The binary data is can only be used with the same version of ICU
* and on the same platform type (processor endian-ness)
*
* @return A pointer to the binary (compiled) rule data. The storage
* belongs to the RulesBasedBreakIterator object, not the * belongs to the RulesBasedBreakIterator object, not the
* caller, and must not be modified or deleted. * caller, and must not be modified or deleted.
*/ */
virtual const uint8_t *getFlattenedData(uint32_t *length); virtual const uint8_t *getBinaryRules(uint32_t &length);
#ifdef RBBI_DEBUG #ifdef RBBI_DEBUG

View File

@ -610,6 +610,7 @@ void RBBIAPITest::TestBuilder() {
bi->setText(testString1); bi->setText(testString1);
doBoundaryTest(*bi, testString1, bounds1); doBoundaryTest(*bi, testString1, bounds1);
} }
delete bi;
} }

View File

@ -348,7 +348,7 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
// delete rbbi; // delete rbbi;
} }
//-------------------------------------------------------------------- //--------------------------------------------------------------------
//tests default rules based word iteration //tests default rules based sentence iteration
//-------------------------------------------------------------------- //--------------------------------------------------------------------
static const UChar kParagraphSeparator[] = {0x2029, 0}; static const UChar kParagraphSeparator[] = {0x2029, 0};
static const UChar kLineSeparator[] = {0x2028, 0}; static const UChar kLineSeparator[] = {0x2028, 0};
@ -766,6 +766,53 @@ void RBBITest::TestTitleBreak()
delete titleData; delete titleData;
} }
//-----------------------------------------------------------------------------------
//
// Test for status {tag} return value from break rules.
// TODO: a more thorough test.
//
//-----------------------------------------------------------------------------------
void RBBITest::TestStatusReturn() {
UnicodeString rulesString1 = "$Letters = [:L:];\n"
"$Numbers = [:N:];\n"
"$Letters+{1};\n"
"$Numbers+{2};\n"
"Help\\ {4}/me\\!;\n"
"[^$Letters $Numbers];\n"
"!.*;\n";
UnicodeString testString1 = "abc123..abc Help me Help me!";
// 01234567890123456789012345678
int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
if(U_FAILURE(status)) {
errln("FAIL : in construction");
} else {
int32_t pos;
int32_t i = 0;
bi->setText(testString1);
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
if (pos != bounds1[i]) {
errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
break;
}
int tag = bi->getRuleStatus();
if (tag != brkStatus[i]) {
errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
break;
}
i++;
}
}
delete bi;
}
/* /*
//Bug: if there is no word break before and after danda when it is followed by a space //Bug: if there is no word break before and after danda when it is followed by a space
void RBBITest::TestDanda() void RBBITest::TestDanda()
@ -1039,6 +1086,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestHindiWordBreak(); break; if(exec) TestHindiWordBreak(); break;
case 6: name = "TestTitleBreak"; case 6: name = "TestTitleBreak";
if(exec) TestTitleBreak(); break; if(exec) TestTitleBreak(); break;
case 7: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
// case 6: name = "TestDanda()"; // case 6: name = "TestDanda()";
// if(exec) TestDanda(); break; // if(exec) TestDanda(); break;

View File

@ -55,6 +55,12 @@ public:
* Tests Title Case break iteration * Tests Title Case break iteration
**/ **/
void TestTitleBreak(void); void TestTitleBreak(void);
/**
* Tests rule status return values
**/
void TestStatusReturn();
/** /**
* Test Hindi Danda i.e make sure we have a break point before and after danda * Test Hindi Danda i.e make sure we have a break point before and after danda
**/ **/

View File

@ -191,7 +191,7 @@ int main(int argc, char **argv) {
// //
uint32_t outDataSize; uint32_t outDataSize;
const uint8_t *outData; const uint8_t *outData;
outData = bi->getFlattenedData(&outDataSize); outData = bi->getBinaryRules(outDataSize);
// //