From acae049ee11700513ba60fd5bfaf7e8c705f4bf9 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 13 Feb 2018 01:08:29 +0000 Subject: [PATCH] ICU-13569 rbbi table, remove duplicated states, working for C++. X-SVN-Rev: 40902 --- icu4c/source/common/rbbirb.cpp | 5 ++- icu4c/source/common/rbbitblb.cpp | 21 ++++++++----- icu4c/source/test/intltest/rbbitst.cpp | 42 ++++++++------------------ 3 files changed, 31 insertions(+), 37 deletions(-) diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 99c8e5dd5a..61e596d6ed 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -358,7 +358,7 @@ void RBBIRuleBuilder::optimizeTables() { int32_t rightClass; leftClass = 3; - rightClass = 4; + rightClass = 0; while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) { fSetBuilder->mergeCategories(leftClass, rightClass); fForwardTables->removeColumn(rightClass); @@ -368,6 +368,9 @@ void RBBIRuleBuilder::optimizeTables() { } fForwardTables->removeDuplicateStates(); + fReverseTables->removeDuplicateStates(); + fSafeFwdTables->removeDuplicateStates(); + fSafeRevTables->removeDuplicateStates(); diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 68e9ffb666..58168922d4 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -762,7 +762,7 @@ void RBBITableBuilder::flagAcceptingStates() { // if sd->fAccepting already had a value other than 0 or -1, leave it be. // If the end marker node is from a look-ahead rule, set - // the fLookAhead field or this state also. + // the fLookAhead field for this state also. if (endMarker->fLookAheadEnd) { // TODO: don't change value if already set? // TODO: allow for more than one active look-ahead rule in engine. @@ -1085,8 +1085,6 @@ bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &dup int32_t numStates = fDStates->size(); int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); - U_ASSERT(baseCategory < duplCategory); - uint16_t table_base; uint16_t table_dupl; for (; baseCategory < numCols-1; ++baseCategory) { @@ -1171,12 +1169,22 @@ void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) { int32_t existingVal = sd->fDtran->elementAti(col); int32_t newVal = existingVal; if (existingVal == duplState) { - existingVal = keepState; + newVal = keepState; } else if (existingVal > duplState) { newVal = existingVal - 1; } sd->fDtran->setElementAt(newVal, col); } + if (sd->fAccepting == duplState) { + sd->fAccepting = keepState; + } else if (sd->fAccepting > duplState) { + sd->fAccepting--; + } + if (sd->fLookAhead == duplState) { + sd->fLookAhead = keepState; + } else if (sd->fLookAhead > duplState) { + sd->fLookAhead--; + } } } @@ -1185,13 +1193,12 @@ void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) { * RemoveDuplicateStates */ void RBBITableBuilder::removeDuplicateStates() { - int32_t firstState = 0; + int32_t firstState = 3; int32_t duplicateState = 0; while (findDuplicateState(firstState, duplicateState)) { - printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState); + // printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState); removeState(firstState, duplicateState); } - } //----------------------------------------------------------------------------- diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index fd150617a7..1e0901c427 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -4462,32 +4462,17 @@ void RBBITest::TestBug12677() { void RBBITest::TestTableRedundancies() { UErrorCode status = U_ZERO_ERROR; - UnicodeString rules {u"$s0=[;,*]; \n" - "$s1=[a-z]; \n" - "$s2=[i-n]; \n" - "$s3=[x-z]; \n" - "!!forward; \n" - "($s0 | '?')*; \n" - "($s1 | $s2 | $s3)*; \n" }; - - RuleBasedBreakIterator *lbi = - (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); - //lbi->dumpTables(); - UnicodeString lbRules = lbi->getRules(); - delete lbi; - - UParseError pe {}; - RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(lbRules, pe, status); + LocalPointer bi ( + (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status)); assertSuccess(WHERE, status); if (U_FAILURE(status)) return; - bi->dumpTables(); RBBIDataWrapper *dw = bi->fData; const RBBIStateTable *fwtbl = dw->fForwardTable; int32_t numCharClasses = dw->fHeader->fCatCount; - printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); + // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); - // Check for duplicate columns + // Check for duplicate columns (character categories) std::vector columns; for (int32_t column = 0; column < numCharClasses; column++) { @@ -4498,23 +4483,23 @@ void RBBITest::TestTableRedundancies() { } columns.push_back(s); } - for (int c1=0; c1 rows; for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) { UnicodeString s; RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); - if (row->fAccepting < -1) { - printf("row %d accepting = %d\n", r, row->fAccepting); - } + assertTrue(WHERE, row->fAccepting >= -1); s.append(row->fAccepting + 1); // values of -1 are expected. s.append(row->fLookAhead); s.append(row->fTagIdx); @@ -4523,15 +4508,14 @@ void RBBITest::TestTableRedundancies() { } rows.push_back(s); } - for (int r1=0; r1<(int32_t)fwtbl->fNumStates; r1++) { + for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) { for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) { if (rows.at(r1) == rows.at(r2)) { - printf("Duplicate rows (%d, %d)\n", r1, r2); - break; + errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2); + return; } } } - delete bi; }