ICU-13194 RBBI safe rules, work in progress.

X-SVN-Rev: 41135
This commit is contained in:
Andy Heninger 2018-03-22 17:31:00 +00:00
parent 660d38bc7f
commit 62dd66a13d
6 changed files with 201 additions and 42 deletions

View File

@ -356,12 +356,16 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
int32_t ruleStatusIndex = 0;
// TODO: check for position == length of text. Although may still need to back up to get rule status.
if (position > 20) {
int32_t backupPos = fBI->handlePrevious(position);
int32_t backupPos = fBI->handleSafePrevious(position);
fBI->fPosition = backupPos;
aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals.
// Safe rules work on pairs. +1 from start pos may be a false match.
aBoundary = fBI->handleNext();
}
ruleStatusIndex = fBI->fRuleStatusIndex;
}
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
// Fill in boundaries between existing cache content and the new requested position.
@ -485,14 +489,17 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
if (backupPosition <= 0) {
backupPosition = 0;
} else {
backupPosition = fBI->handlePrevious(backupPosition);
backupPosition = fBI->handleSafePrevious(backupPosition);
}
if (backupPosition == UBRK_DONE || backupPosition == 0) {
position = 0;
positionStatusIdx = 0;
} else {
fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
position = fBI->handleNext();
position = fBI->handleNext(); // TODO: supplementals don't work with the +1.
if (position == backupPosition + 1) {
position = fBI->handleNext(); // Safe rules identify safe pairs.
};
positionStatusIdx = fBI->fRuleStatusIndex;
}

View File

@ -154,13 +154,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
//
int32_t headerSize = align8(sizeof(RBBIDataHeader));
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
int32_t reverseTableSize = align8(fForwardTables->getSafeTableSize());
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); // TODO: remove hand-written rules.
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
int32_t totalSize = headerSize
+ forwardTableSize
+ forwardTableSize
+ reverseTableSize
+ safeRevTableSize
+ statusTableSize + trieSize + rulesSize;
@ -180,27 +182,18 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fLength = totalSize;
data->fCatCount = fSetBuilder->getNumCharCategories();
// Only save the forward table and the safe reverse table,
// because these are the only ones used at run-time.
//
// For the moment, we still build the other tables if they are present in the rule source files,
// for backwards compatibility. Old rule files need to work, and this is the simplest approach.
//
// Additional backwards compatibility consideration: if no safe rules are provided, consider the
// reverse rules to actually be the safe reverse rules.
data->fFTable = headerSize;
data->fFTableLen = forwardTableSize;
// Do not save Reverse Table.
data->fRTable = data->fFTable + forwardTableSize;
data->fRTableLen = 0;
data->fRTable = data->fFTable + data->fFTableLen;
data->fRTableLen = reverseTableSize;
// Do not save the Safe Forward table.
data->fSFTable = data->fRTable + 0;
data->fSFTable = data->fRTable + data->fRTableLen;
data->fSFTableLen = 0;
data->fSRTable = data->fSFTable + 0;
// Hand written reverse rules. TODO: remove, once synthesized ones are working.
data->fSRTable = data->fSFTable + data->fSFTableLen;
data->fSRTableLen = safeRevTableSize;
U_ASSERT(safeRevTableSize > 0);
@ -214,6 +207,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
@ -297,22 +291,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
if (fForwardTables == nullptr || fSafeRevTables == nullptr)
{
status = U_MEMORY_ALLOCATION_ERROR;
delete fForwardTables; fForwardTables = nullptr;
delete fSafeRevTables; fSafeRevTables = nullptr;
return nullptr;
}
fForwardTables->build();
fForwardTables->buildSafe(status);
fSafeRevTables->build();
optimizeTables();
fForwardTables->buildSafe(status);
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
fForwardTables
fForwardTables->printRuleStatusTable();
fForwardTables->printSafeTable();
}
#endif
optimizeTables();
fSetBuilder->buildTrie();
//

View File

@ -38,10 +38,8 @@ RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UEr
}
// fDStates is UVector<RBBIStateDescriptor *>
fDStates = new UVector(status);
// SafeTable is UVector<UnicodeString *>. Contents owned by the UVector.
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
status = U_MEMORY_ALLOCATION_ERROR;;
if (U_SUCCESS(status) && fDStates == nullptr ) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
@ -190,8 +188,6 @@ void RBBITableBuilder::build() {
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
@ -1316,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
if (wantedEndState == endState) {
int32_t pair = c1 << 16 | c2;
safePairs.addElement(pair, status);
// printf("(%d, %d) ", c1, c2);
printf("(%d, %d) ", c1, c2);
}
}
//printf("\n");
printf("\n");
}
// Populate the initial safe table.
@ -1329,21 +1325,25 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
// Row 1 is the start sate.
// Row 2 and beyond are other states, initially one per char class, but
// after initial construction, many of the states will be combined, compacting the table.)
// The String holds the nextState data only. The four leading fields of a row, fAccepting,
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
U_ASSERT(fSafeTable == nullptr);
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
for (int32_t row=0; row<numCharClasses + 2; ++row) {
fSafeTable->addElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status);
fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
}
// From the start state, each input char class transitions to the state for that input.
UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1);
for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
// Note: +2 for the start & stop state; +4 for header columns in state table.
startState.setCharAt(charClass+4, charClass+2);
// Note: +2 for the start & stop state.
startState.setCharAt(charClass, charClass+2);
}
// Initially make every other state table row look like the start state row,
for (int32_t row=2; row<numCharClasses+2; ++row) {
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(1);
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(row);
rowState = startState; // UnicodeString assignment, copies contents.
}
@ -1355,13 +1355,85 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
int32_t c2 = pair & 0x0000ffff;
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2);
rowState.setCharAt(c1 + 4, 0);
rowState.setCharAt(c1, 0);
}
// Merge similar states.
}
//-----------------------------------------------------------------------------
//
// getSafeTableSize() Calculate the size of the runtime form of this
// safe state table.
//
//-----------------------------------------------------------------------------
int32_t RBBITableBuilder::getSafeTableSize() const {
int32_t size = 0;
int32_t numRows;
int32_t numCols;
int32_t rowSize;
if (fSafeTable == nullptr) {
return 0;
}
size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
numRows = fSafeTable->size();
numCols = fRB->fSetBuilder->getNumCharCategories();
rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
size += numRows * rowSize;
return size;
}
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
// by the runtime engine. getTableSize() bytes of memory
// must be available at the output address "where".
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::exportSafeTable(void *where) {
RBBIStateTable *table = (RBBIStateTable *)where;
uint32_t state;
int col;
if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
return;
}
int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
if (catCount > 0x7fff ||
fSafeTable->size() > 0x7fff) {
*fStatus = U_BRK_INTERNAL_ERROR;
return;
}
table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
table->fNumStates = fSafeTable->size();
table->fFlags = 0;
table->fReserved = 0;
for (state=0; state<table->fNumStates; state++) {
UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
row->fAccepting = 0;
row->fLookAhead = 0;
row->fTagIdx = 0;
row->fReserved = 0;
for (col=0; col<catCount; col++) {
row->fNextState[col] = rowString->charAt(col);
}
}
}
//-----------------------------------------------------------------------------
//
// printSet Debug function. Print the contents of a UVector
@ -1415,6 +1487,47 @@ void RBBITableBuilder::printStates() {
#endif
//-----------------------------------------------------------------------------
//
// printSafeTable Debug Function. Dump the fully constructed safe table.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printSafeTable() {
int c; // input "character"
int n; // state number
RBBIDebugPrintf(" Safe Reverse Table \n");
if (fSafeTable == nullptr) {
RBBIDebugPrintf(" --- nullptr ---\n");
return;
}
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
RBBIDebugPrintf(" | Acc LA Tag");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", c);
}
RBBIDebugPrintf("\n");
RBBIDebugPrintf(" |---------------");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf("---");
}
RBBIDebugPrintf("\n");
for (n=0; n<fSafeTable->size(); n++) {
UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
RBBIDebugPrintf(" %3d | " , n);
RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", rowString->charAt(c));
}
RBBIDebugPrintf("\n");
}
RBBIDebugPrintf("\n\n");
}
#endif
//-----------------------------------------------------------------------------
//

View File

@ -123,11 +123,13 @@ public:
void printPosSets(RBBINode *n /* = NULL*/);
void printStates();
void printRuleStatusTable();
void printSafeTable();
#else
#define printSet(s)
#define printPosSets(n)
#define printStates()
#define printRuleStatusTable()
#define printSafeTable()
#endif
private:

View File

@ -4544,6 +4544,47 @@ void RBBITest::TestBug13447() {
// for tracing without a lot of unwanted extra stuff happening.
//
void RBBITest::TestDebug(void) {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
const UnicodeString &rules = bi->getRules();
UParseError pe;
LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
assertSuccess(WHERE, status);
#if 0
bi->dumpTables();
RBBIDataWrapper *dw = bi->fData;
const RBBIStateTable *fwtbl = dw->fForwardTable;
int32_t numCharClasses = dw->fHeader->fCatCount;
printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
for (int32_t c1=0; c1<numCharClasses; ++c1) {
for (int32_t c2=0; c2 < numCharClasses; ++c2) {
int32_t wantedEndState = -1;
int32_t endState = 0;
for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
int32_t s2 = row->fNextState[c1];
row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
endState = row->fNextState[c2];
if (wantedEndState < 0) {
wantedEndState = endState;
} else {
if (wantedEndState != endState) {
break;
}
}
}
if (wantedEndState == endState) {
printf("(%d, %d) ", c1, c2);
}
}
printf("\n");
}
printf("\n");
#endif
}
void RBBITest::TestProperties() {

View File

@ -1370,7 +1370,7 @@ Bangkok)•</data>
<rules>
!!forward;
Hello\ World;
!!reverse;
!!safe_reverse;
.*;
</rules>
<data>•Hello World•</data>
@ -1379,7 +1379,7 @@ Bangkok)•</data>
!!quoted_literals_only;
!!forward;
Hello\ World;
!!reverse;
!!safe_reverse;
.*;
</badrules>
@ -1387,7 +1387,7 @@ Bangkok)•</data>
!!quoted_literals_only;
!!forward;
'Hello World';
!!reverse;
!!safe_reverse;
.*;
</rules>
<data>•Hello World•</data>
@ -1399,7 +1399,7 @@ Bangkok)•</data>
<rules>
!!forward;
.;
!!reverse;
!!safe_reverse;
.*;
</rules>
<data>•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•</data>