ICU-13194 RBBI safe tables, improve code consistency between C++ and Java.

X-SVN-Rev: 41203
This commit is contained in:
Andy Heninger 2018-04-06 00:19:32 +00:00
parent b6a2b3fddb
commit 7adb68f6f8
6 changed files with 37 additions and 39 deletions

View File

@ -303,16 +303,14 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
}
void RBBIRuleBuilder::optimizeTables() {
int32_t leftClass;
int32_t rightClass;
leftClass = 3;
rightClass = 0;
// Begin looking for duplicates with char class 3.
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
// and should not have other categories merged into them.
IntPair duplPair = {3, 0};
while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
fSetBuilder->mergeCategories(duplPair.first, duplPair.second);
fSetBuilder->mergeCategories(duplPair);
fForwardTable->removeColumn(duplPair.second);
}
fForwardTable->removeDuplicateStates();

View File

@ -270,15 +270,15 @@ void RBBISetBuilder::buildTrie() {
}
void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
U_ASSERT(left >= 1);
U_ASSERT(right > left);
void RBBISetBuilder::mergeCategories(IntPair categories) {
U_ASSERT(categories.first >= 1);
U_ASSERT(categories.second > categories.first);
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
int32_t rangeNum = rd->fNum & ~DICT_BIT;
int32_t rangeDict = rd->fNum & DICT_BIT;
if (rangeNum == right) {
rd->fNum = left | rangeDict;
} else if (rangeNum > right) {
if (rangeNum == categories.second) {
rd->fNum = categories.first | rangeDict;
} else if (rangeNum > categories.second) {
rd->fNum--;
}
}

View File

@ -94,10 +94,12 @@ public:
UChar32 getFirstChar(int32_t val) const;
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
// character were encountered.
/** merge two character categories that have been identified as having equivalent behavior.
* The ranges belonging to the right category (table column) will be added to the left.
/**
* Merge two character categories that have been identified as having equivalent behavior.
* The ranges belonging to the second category (table column) will be added to the first.
* @param categories the pair of categories to be merged.
*/
void mergeCategories(int32_t left, int32_t right);
void mergeCategories(IntPair categories);
static constexpr int32_t DICT_BIT = 0x4000;

View File

@ -1179,7 +1179,9 @@ bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
}
void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
void RBBITableBuilder::removeState(IntPair duplStates) {
const int32_t keepState = duplStates.first;
const int32_t duplState = duplStates.second;
U_ASSERT(keepState < duplState);
U_ASSERT(duplState < fDStates->size());
@ -1214,7 +1216,9 @@ void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
}
}
void RBBITableBuilder::removeSafeState(int32_t keepState, int32_t duplState) {
void RBBITableBuilder::removeSafeState(IntPair duplStates) {
const int32_t keepState = duplStates.first;
const int32_t duplState = duplStates.second;
U_ASSERT(keepState < duplState);
U_ASSERT(duplState < fSafeTable->size());
@ -1245,7 +1249,7 @@ void RBBITableBuilder::removeDuplicateStates() {
IntPair dupls = {3, 0};
while (findDuplicateState(&dupls)) {
// printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
removeState(dupls.first, dupls.second);
removeState(dupls);
}
}
@ -1430,7 +1434,7 @@ void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
IntPair states = {1, 0};
while (findDuplicateSafeState(&states)) {
// printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
removeSafeState(states.first, states.second);
removeSafeState(states);
}
}

View File

@ -51,14 +51,15 @@ public:
void exportTable(void *where);
/**
* Find duplicate (redundant) character classes, beginning at the specified
* pair, within this state table. This is an iterator-like function, used to
* identify character classes (state table columns) that can be eliminated.
* Find duplicate (redundant) character classes. Begin looking with categories.first.
* Duplicates, if found, are returned in the categories parameter.
* This is an iterator-like function, used to identify character classes
* (state table columns) that can be eliminated.
* @param categories in/out parameter, specifies where to start looking for duplicates,
* and returns the first pair of duplicates found, if any.
* @return true if duplicate char classes were found, false otherwise.
*/
bool findDuplCharClassFrom(IntPair *statePair);
bool findDuplCharClassFrom(IntPair *categories);
/** Remove a column from the state table. Used when two character categories
* have been found equivalent, and merged together, to eliminate the unneeded table column.
@ -110,12 +111,12 @@ private:
*/
bool findDuplicateState(IntPair *states);
/** Remove a duplicate state/
* @param keepState First of the duplicate pair. Keep it.
* @param duplState Duplicate state. Remove it. Redirect all references to the duplicate state
* to refer to keepState instead.
/** Remove a duplicate state.
* @param duplStates The duplicate states. The first is kept, the second is removed.
* All references to the second in the state table are retargeted
* to the first.
*/
void removeState(int32_t keepState, int32_t duplState);
void removeState(IntPair duplStates);
/** Find the next duplicate state in the safe reverse table. An iterator function.
* @param states in/out parameter, specifies where to start looking for duplicates,
@ -125,11 +126,11 @@ private:
bool findDuplicateSafeState(IntPair *states);
/** Remove a duplicate state from the safe table.
* @param keepState First of the duplicate pair. Keep it.
* @param duplState Duplicate state. Remove it. Redirect all table references to the duplicate state
* to refer to keepState instead.
* @param duplStates The duplicate states. The first is kept, the second is removed.
* All references to the second in the state table are retargeted
* to the first.
*/
void removeSafeState(int32_t keepState, int32_t duplState);
void removeSafeState(IntPair duplStates);
// Set functions for UVector.
// TODO: make a USet subclass of UVector

View File

@ -70,13 +70,6 @@ public:
RBBIDataWrapper *fData;
private:
/**
* The iteration state - current position, rule status for the current position,
* and whether the iterator ran off the end, yielding UBRK_DONE.
* Current position is pinned to be 0 < position <= text.length.
* Current position is always set to a boundary.
* @internal
*/
/**
* The current position of the iterator. Pinned, 0 < fPosition <= text.length.
* Never has the value UBRK_DONE (-1).