ICU-1117 add getRuleStatus() to RBBI
X-SVN-Rev: 8956
This commit is contained in:
parent
37792a8277
commit
878c84b1d2
@ -155,7 +155,7 @@ void RuleBasedBreakIterator::init() {
|
|||||||
fText = NULL;
|
fText = NULL;
|
||||||
fData = NULL;
|
fData = NULL;
|
||||||
fCharMappings = NULL;
|
fCharMappings = NULL;
|
||||||
fLastBreakStatus = 0;
|
fLastBreakTag = 0;
|
||||||
fDictionaryCharCount = 0;
|
fDictionaryCharCount = 0;
|
||||||
|
|
||||||
if (debugInitDone == FALSE) {
|
if (debugInitDone == FALSE) {
|
||||||
@ -494,6 +494,9 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||||||
UChar32 c = fText->current32();
|
UChar32 c = fText->current32();
|
||||||
RBBIStateTableRow *row;
|
RBBIStateTableRow *row;
|
||||||
int32_t lookaheadStatus = 0;
|
int32_t lookaheadStatus = 0;
|
||||||
|
int32_t lookaheadTag = 0;
|
||||||
|
|
||||||
|
fLastBreakTag = 0;
|
||||||
|
|
||||||
row = (RBBIStateTableRow *)
|
row = (RBBIStateTableRow *)
|
||||||
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
|
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
|
||||||
@ -550,10 +553,13 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||||||
goto continueOn;
|
goto continueOn;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (row->fAccepting != 0 && row->fLookAhead == 0) {
|
if (row->fAccepting == -1) {
|
||||||
// Match found, common case, no lookahead involved.
|
// Match found, common case, no lookahead involved.
|
||||||
|
// (It's possible that some lookahead rule matched here also,
|
||||||
|
// but since there's an unconditional match, we'll favor that.)
|
||||||
result = fText->getIndex();
|
result = fText->getIndex();
|
||||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||||
|
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
|
||||||
goto continueOn;
|
goto continueOn;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -566,6 +572,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||||||
if (r > result) {
|
if (r > result) {
|
||||||
lookaheadResult = r;
|
lookaheadResult = r;
|
||||||
lookaheadStatus = row->fLookAhead;
|
lookaheadStatus = row->fLookAhead;
|
||||||
|
lookaheadTag = row->fTag;
|
||||||
}
|
}
|
||||||
goto continueOn;
|
goto continueOn;
|
||||||
}
|
}
|
||||||
@ -577,6 +584,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||||||
assert(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
assert(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||||
// of overlapping lookahead matches.
|
// of overlapping lookahead matches.
|
||||||
result = lookaheadResult;
|
result = lookaheadResult;
|
||||||
|
fLastBreakTag = lookaheadTag;
|
||||||
lookaheadStatus = 0;
|
lookaheadStatus = 0;
|
||||||
}
|
}
|
||||||
goto continueOn;
|
goto continueOn;
|
||||||
@ -631,6 +639,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
|||||||
int32_t result = fText->getIndex();
|
int32_t result = fText->getIndex();
|
||||||
int32_t lookaheadStatus = 0;
|
int32_t lookaheadStatus = 0;
|
||||||
int32_t lookaheadResult = 0;
|
int32_t lookaheadResult = 0;
|
||||||
|
int32_t lookaheadTag = 0;
|
||||||
UChar32 c = fText->current32();
|
UChar32 c = fText->current32();
|
||||||
RBBIStateTableRow *row;
|
RBBIStateTableRow *row;
|
||||||
|
|
||||||
@ -685,7 +694,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
|||||||
goto continueOn;
|
goto continueOn;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (row->fAccepting != 0 && row->fLookAhead == 0) {
|
if (row->fAccepting == -1) {
|
||||||
// Match found, common case, no lookahead involved.
|
// Match found, common case, no lookahead involved.
|
||||||
result = fText->getIndex();
|
result = fText->getIndex();
|
||||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||||
@ -694,13 +703,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
|||||||
|
|
||||||
if (row->fAccepting == 0 && row->fLookAhead != 0) {
|
if (row->fAccepting == 0 && row->fLookAhead != 0) {
|
||||||
// Lookahead match point. Remember it, but only if no other rule
|
// Lookahead match point. Remember it, but only if no other rule
|
||||||
// has unconditinally matched to this point.
|
// has unconditionally matched to this point.
|
||||||
// TODO: handle case where there's a pending match from a different rule
|
// TODO: handle case where there's a pending match from a different rule
|
||||||
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
|
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
|
||||||
int32_t r = fText->getIndex();
|
int32_t r = fText->getIndex();
|
||||||
if (r > result) {
|
if (r > result) {
|
||||||
lookaheadResult = r;
|
lookaheadResult = r;
|
||||||
lookaheadStatus = row->fLookAhead;
|
lookaheadStatus = row->fLookAhead;
|
||||||
|
lookaheadTag = row->fTag;
|
||||||
}
|
}
|
||||||
goto continueOn;
|
goto continueOn;
|
||||||
}
|
}
|
||||||
@ -712,6 +722,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
|||||||
assert(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
assert(row->fAccepting == lookaheadStatus); // TODO: handle this case
|
||||||
// of overlapping lookahead matches.
|
// of overlapping lookahead matches.
|
||||||
result = lookaheadResult;
|
result = lookaheadResult;
|
||||||
|
fLastBreakTag = lookaheadTag;
|
||||||
lookaheadStatus = 0;
|
lookaheadStatus = 0;
|
||||||
}
|
}
|
||||||
goto continueOn;
|
goto continueOn;
|
||||||
@ -752,8 +763,8 @@ RuleBasedBreakIterator::reset()
|
|||||||
// getRuleStatus()
|
// getRuleStatus()
|
||||||
//
|
//
|
||||||
//-------------------------------------------------------------------------------
|
//-------------------------------------------------------------------------------
|
||||||
int16_t RuleBasedBreakIterator::getRuleStatus() const {
|
int32_t RuleBasedBreakIterator::getRuleStatus() const {
|
||||||
return fLastBreakStatus;
|
return fLastBreakTag;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -764,13 +775,13 @@ int16_t RuleBasedBreakIterator::getRuleStatus() const {
|
|||||||
// for standard iterator types.
|
// for standard iterator types.
|
||||||
//
|
//
|
||||||
//-------------------------------------------------------------------------------
|
//-------------------------------------------------------------------------------
|
||||||
const uint8_t *RuleBasedBreakIterator::getFlattenedData(uint32_t *length) {
|
const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
|
||||||
const uint8_t *retPtr = NULL;
|
const uint8_t *retPtr = NULL;
|
||||||
*length = 0;
|
length = 0;
|
||||||
|
|
||||||
if (fData != NULL) {
|
if (fData != NULL) {
|
||||||
retPtr = (const uint8_t *)fData->fHeader;
|
retPtr = (const uint8_t *)fData->fHeader;
|
||||||
*length = fData->fHeader->fLength;
|
length = fData->fHeader->fLength;
|
||||||
}
|
}
|
||||||
return retPtr;
|
return retPtr;
|
||||||
}
|
}
|
||||||
|
@ -164,10 +164,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//----------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// RulesBasedBreakIterator, construct from source rules that are passed in
|
// createRuleBasedBreakIterator: construct from source rules that are passed in
|
||||||
// in a UnicodeString
|
// in a UnicodeString
|
||||||
//
|
//
|
||||||
|
//----------------------------------------------------------------------------------------
|
||||||
BreakIterator *
|
BreakIterator *
|
||||||
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
||||||
UParseError &parseError,
|
UParseError &parseError,
|
||||||
|
@ -101,7 +101,7 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
|
|||||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30
|
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30
|
||||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31
|
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31
|
||||||
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32
|
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32
|
||||||
, {doExprCatOperator, 123 /*{*/, 49,0, FALSE} // 33
|
, {doExprCatOperator, 123 /*{*/, 49,0, TRUE} // 33
|
||||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34
|
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34
|
||||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35
|
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35
|
||||||
, {doExprFinished, 255, 255,0, FALSE} // 36
|
, {doExprFinished, 255, 255,0, FALSE} // 36
|
||||||
|
@ -129,7 +129,7 @@ expr-cont:
|
|||||||
'$' term doExprCatOperator
|
'$' term doExprCatOperator
|
||||||
'.' term doExprCatOperator
|
'.' term doExprCatOperator
|
||||||
'/' look-ahead doExprCatOperator
|
'/' look-ahead doExprCatOperator
|
||||||
'{' tag-open doExprCatOperator
|
'{' n tag-open doExprCatOperator
|
||||||
'|' n term doExprOrOperator
|
'|' n term doExprOrOperator
|
||||||
')' n pop doExprRParen
|
')' n pop doExprRParen
|
||||||
default pop doExprFinished
|
default pop doExprFinished
|
||||||
|
@ -451,13 +451,15 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action,
|
|||||||
case doTagDigit:
|
case doTagDigit:
|
||||||
// Just scanned a decimal digit that's part of a tag value
|
// Just scanned a decimal digit that's part of a tag value
|
||||||
{
|
{
|
||||||
|
n = fNodeStack[fNodeStackPtr];
|
||||||
uint32_t v = u_charDigitValue(fC.fChar);
|
uint32_t v = u_charDigitValue(fC.fChar);
|
||||||
assert(v >= 0);
|
assert(v >= 0);
|
||||||
n->fVal *= v;
|
n->fVal = n->fVal*10 + v;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case doTagValue:
|
case doTagValue:
|
||||||
|
n = fNodeStack[fNodeStackPtr];
|
||||||
n->fLastPos = fNextIndex;
|
n->fLastPos = fNextIndex;
|
||||||
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
|
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
|
||||||
break;
|
break;
|
||||||
@ -952,6 +954,19 @@ void RBBIRuleScanner::parse() {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
|
||||||
|
//
|
||||||
|
if (fRB->fReverseTree == NULL) {
|
||||||
|
fRB->fReverseTree = pushNewNode(RBBINode::opStar);
|
||||||
|
RBBINode *operand = pushNewNode(RBBINode::setRef);
|
||||||
|
findSetFor(kAny, operand);
|
||||||
|
fRB->fReverseTree->fLeftChild = operand;
|
||||||
|
operand->fParent = fRB->fReverseTree;
|
||||||
|
fNodeStackPtr -= 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Parsing of the input RBBI rules is complete.
|
// Parsing of the input RBBI rules is complete.
|
||||||
// We now have a parse tree for the rule expressions
|
// We now have a parse tree for the rule expressions
|
||||||
|
@ -111,6 +111,7 @@ void RBBITableBuilder::build() {
|
|||||||
buildStateTable();
|
buildStateTable();
|
||||||
flagAcceptingStates();
|
flagAcceptingStates();
|
||||||
flagLookAheadStates();
|
flagLookAheadStates();
|
||||||
|
flagTaggedStates();
|
||||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();};
|
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -201,7 +201,7 @@ protected:
|
|||||||
//
|
//
|
||||||
RBBIDataWrapper *fData;
|
RBBIDataWrapper *fData;
|
||||||
UTrie *fCharMappings;
|
UTrie *fCharMappings;
|
||||||
int16_t fLastBreakStatus;
|
int32_t fLastBreakTag; // Rule {tag} value for the most recent match.
|
||||||
|
|
||||||
//
|
//
|
||||||
// Counter for the number of characters encountered with the "dictionary"
|
// Counter for the number of characters encountered with the "dictionary"
|
||||||
@ -414,7 +414,7 @@ protected:
|
|||||||
* within brackets, {123}, for example. For rules that do not specify a
|
* within brackets, {123}, for example. For rules that do not specify a
|
||||||
* status, a default value of 0 is returned.
|
* status, a default value of 0 is returned.
|
||||||
*/
|
*/
|
||||||
virtual int16_t getRuleStatus() const;
|
virtual int32_t getRuleStatus() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
||||||
@ -446,17 +446,20 @@ protected:
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the flattened form of compiled break rules,
|
* Return the binary form of compiled break rules,
|
||||||
* which can then be used to create a new break iterator at some
|
* which can then be used to create a new break iterator at some
|
||||||
* time in the future. Creating a break iterator in this way
|
* time in the future. Creating a break iterator in this way
|
||||||
* is much faster than building one from the source form of the
|
* is much faster than building one from the source form of the
|
||||||
* break rules.
|
* break rules.
|
||||||
*
|
*
|
||||||
* @return A pointer to the flattened rule data. The storage
|
* The binary data can only be used with the same version of ICU
|
||||||
|
* and on the same platform type (processor endian-ness)
|
||||||
|
*
|
||||||
|
* @return A pointer to the binary (compiled) rule data. The storage
|
||||||
* belongs to the RuleBasedBreakIterator object, not the
|
* belongs to the RuleBasedBreakIterator object, not the
|
||||||
* caller, and must not be modified or deleted.
|
* caller, and must not be modified or deleted.
|
||||||
*/
|
*/
|
||||||
virtual const uint8_t *getFlattenedData(uint32_t *length);
|
virtual const uint8_t *getBinaryRules(uint32_t &length);
|
||||||
|
|
||||||
|
|
||||||
#ifdef RBBI_DEBUG
|
#ifdef RBBI_DEBUG
|
||||||
|
@ -610,6 +610,7 @@ void RBBIAPITest::TestBuilder() {
|
|||||||
bi->setText(testString1);
|
bi->setText(testString1);
|
||||||
doBoundaryTest(*bi, testString1, bounds1);
|
doBoundaryTest(*bi, testString1, bounds1);
|
||||||
}
|
}
|
||||||
|
delete bi;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -348,7 +348,7 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
|
|||||||
// delete rbbi;
|
// delete rbbi;
|
||||||
}
|
}
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
//tests default rules based word iteration
|
//tests default rules based sentence iteration
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
static const UChar kParagraphSeparator[] = {0x2029, 0};
|
static const UChar kParagraphSeparator[] = {0x2029, 0};
|
||||||
static const UChar kLineSeparator[] = {0x2028, 0};
|
static const UChar kLineSeparator[] = {0x2028, 0};
|
||||||
@ -766,6 +766,53 @@ void RBBITest::TestTitleBreak()
|
|||||||
delete titleData;
|
delete titleData;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Test for status {tag} return value from break rules.
|
||||||
|
// TODO: a more thorough test.
|
||||||
|
//
|
||||||
|
//-----------------------------------------------------------------------------------
|
||||||
|
void RBBITest::TestStatusReturn() {
|
||||||
|
UnicodeString rulesString1 = "$Letters = [:L:];\n"
|
||||||
|
"$Numbers = [:N:];\n"
|
||||||
|
"$Letters+{1};\n"
|
||||||
|
"$Numbers+{2};\n"
|
||||||
|
"Help\\ {4}/me\\!;\n"
|
||||||
|
"[^$Letters $Numbers];\n"
|
||||||
|
"!.*;\n";
|
||||||
|
UnicodeString testString1 = "abc123..abc Help me Help me!";
|
||||||
|
// 01234567890123456789012345678
|
||||||
|
int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
|
||||||
|
int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
|
||||||
|
|
||||||
|
UErrorCode status=U_ZERO_ERROR;
|
||||||
|
UParseError parseError;
|
||||||
|
|
||||||
|
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
|
||||||
|
if(U_FAILURE(status)) {
|
||||||
|
errln("FAIL : in construction");
|
||||||
|
} else {
|
||||||
|
int32_t pos;
|
||||||
|
int32_t i = 0;
|
||||||
|
bi->setText(testString1);
|
||||||
|
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
|
||||||
|
if (pos != bounds1[i]) {
|
||||||
|
errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int tag = bi->getRuleStatus();
|
||||||
|
if (tag != brkStatus[i]) {
|
||||||
|
errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete bi;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
//Bug: if there is no word break before and after danda when it is followed by a space
|
//Bug: if there is no word break before and after danda when it is followed by a space
|
||||||
void RBBITest::TestDanda()
|
void RBBITest::TestDanda()
|
||||||
@ -1039,6 +1086,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||||||
if(exec) TestHindiWordBreak(); break;
|
if(exec) TestHindiWordBreak(); break;
|
||||||
case 6: name = "TestTitleBreak";
|
case 6: name = "TestTitleBreak";
|
||||||
if(exec) TestTitleBreak(); break;
|
if(exec) TestTitleBreak(); break;
|
||||||
|
case 7: name = "TestStatusReturn";
|
||||||
|
if(exec) TestStatusReturn(); break;
|
||||||
|
|
||||||
// case 6: name = "TestDanda()";
|
// case 6: name = "TestDanda()";
|
||||||
// if(exec) TestDanda(); break;
|
// if(exec) TestDanda(); break;
|
||||||
|
@ -55,6 +55,12 @@ public:
|
|||||||
* Tests Title Case break iteration
|
* Tests Title Case break iteration
|
||||||
**/
|
**/
|
||||||
void TestTitleBreak(void);
|
void TestTitleBreak(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests rule status return values
|
||||||
|
**/
|
||||||
|
void TestStatusReturn();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test Hindi Danda, i.e., make sure we have a break point before and after danda
|
* Test Hindi Danda, i.e., make sure we have a break point before and after danda
|
||||||
**/
|
**/
|
||||||
|
@ -191,7 +191,7 @@ int main(int argc, char **argv) {
|
|||||||
//
|
//
|
||||||
uint32_t outDataSize;
|
uint32_t outDataSize;
|
||||||
const uint8_t *outData;
|
const uint8_t *outData;
|
||||||
outData = bi->getFlattenedData(&outDataSize);
|
outData = bi->getBinaryRules(outDataSize);
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
|
Loading…
Reference in New Issue
Block a user