ICU-13194 RBBI safe rules, work in progress.
X-SVN-Rev: 41135
This commit is contained in:
parent
660d38bc7f
commit
62dd66a13d
@ -356,9 +356,13 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
|
||||
int32_t ruleStatusIndex = 0;
|
||||
// TODO: check for position == length of text. Although may still need to back up to get rule status.
|
||||
if (position > 20) {
|
||||
int32_t backupPos = fBI->handlePrevious(position);
|
||||
int32_t backupPos = fBI->handleSafePrevious(position);
|
||||
fBI->fPosition = backupPos;
|
||||
aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
|
||||
if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals.
|
||||
// Safe rules work on pairs. +1 from start pos may be a false match.
|
||||
aBoundary = fBI->handleNext();
|
||||
}
|
||||
ruleStatusIndex = fBI->fRuleStatusIndex;
|
||||
}
|
||||
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
|
||||
@ -485,14 +489,17 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
|
||||
if (backupPosition <= 0) {
|
||||
backupPosition = 0;
|
||||
} else {
|
||||
backupPosition = fBI->handlePrevious(backupPosition);
|
||||
backupPosition = fBI->handleSafePrevious(backupPosition);
|
||||
}
|
||||
if (backupPosition == UBRK_DONE || backupPosition == 0) {
|
||||
position = 0;
|
||||
positionStatusIdx = 0;
|
||||
} else {
|
||||
fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
|
||||
position = fBI->handleNext();
|
||||
position = fBI->handleNext(); // TODO: supplementals don't work with the +1.
|
||||
if (position == backupPosition + 1) {
|
||||
position = fBI->handleNext(); // Safe rules identify safe pairs.
|
||||
};
|
||||
positionStatusIdx = fBI->fRuleStatusIndex;
|
||||
|
||||
}
|
||||
|
@ -154,13 +154,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
//
|
||||
int32_t headerSize = align8(sizeof(RBBIDataHeader));
|
||||
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
|
||||
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
|
||||
int32_t reverseTableSize = align8(fForwardTables->getSafeTableSize());
|
||||
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); // TODO: remove hand-written rules.
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
|
||||
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
int32_t totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
+ reverseTableSize
|
||||
+ safeRevTableSize
|
||||
+ statusTableSize + trieSize + rulesSize;
|
||||
|
||||
@ -180,27 +182,18 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
data->fLength = totalSize;
|
||||
data->fCatCount = fSetBuilder->getNumCharCategories();
|
||||
|
||||
// Only save the forward table and the safe reverse table,
|
||||
// because these are the only ones used at run-time.
|
||||
//
|
||||
// For the moment, we still build the other tables if they are present in the rule source files,
|
||||
// for backwards compatibility. Old rule files need to work, and this is the simplest approach.
|
||||
//
|
||||
// Additional backwards compatibility consideration: if no safe rules are provided, consider the
|
||||
// reverse rules to actually be the safe reverse rules.
|
||||
|
||||
data->fFTable = headerSize;
|
||||
data->fFTableLen = forwardTableSize;
|
||||
|
||||
// Do not save Reverse Table.
|
||||
data->fRTable = data->fFTable + forwardTableSize;
|
||||
data->fRTableLen = 0;
|
||||
data->fRTable = data->fFTable + data->fFTableLen;
|
||||
data->fRTableLen = reverseTableSize;
|
||||
|
||||
// Do not save the Safe Forward table.
|
||||
data->fSFTable = data->fRTable + 0;
|
||||
data->fSFTable = data->fRTable + data->fRTableLen;
|
||||
data->fSFTableLen = 0;
|
||||
|
||||
data->fSRTable = data->fSFTable + 0;
|
||||
// Hand written reverse rules. TODO: remove, once synthesized ones are working.
|
||||
data->fSRTable = data->fSFTable + data->fSFTableLen;
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
U_ASSERT(safeRevTableSize > 0);
|
||||
|
||||
@ -214,6 +207,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
@ -297,22 +291,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
||||
if (fForwardTables == nullptr || fSafeRevTables == nullptr)
|
||||
{
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete fForwardTables; fForwardTables = nullptr;
|
||||
delete fSafeRevTables; fSafeRevTables = nullptr;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fForwardTables->build();
|
||||
fForwardTables->buildSafe(status);
|
||||
fSafeRevTables->build();
|
||||
optimizeTables();
|
||||
fForwardTables->buildSafe(status);
|
||||
|
||||
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
|
||||
fForwardTables
|
||||
fForwardTables->printRuleStatusTable();
|
||||
fForwardTables->printSafeTable();
|
||||
}
|
||||
#endif
|
||||
|
||||
optimizeTables();
|
||||
fSetBuilder->buildTrie();
|
||||
|
||||
//
|
||||
|
@ -38,10 +38,8 @@ RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UEr
|
||||
}
|
||||
// fDStates is UVector<RBBIStateDescriptor *>
|
||||
fDStates = new UVector(status);
|
||||
// SafeTable is UVector<UnicodeString *>. Contents owned by the UVector.
|
||||
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
|
||||
if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;;
|
||||
if (U_SUCCESS(status) && fDStates == nullptr ) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
@ -190,8 +188,6 @@ void RBBITableBuilder::build() {
|
||||
// for all tables. Merge the ones from this table into the global set.
|
||||
//
|
||||
mergeRuleStatusVals();
|
||||
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
|
||||
}
|
||||
|
||||
|
||||
@ -1316,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
||||
if (wantedEndState == endState) {
|
||||
int32_t pair = c1 << 16 | c2;
|
||||
safePairs.addElement(pair, status);
|
||||
// printf("(%d, %d) ", c1, c2);
|
||||
printf("(%d, %d) ", c1, c2);
|
||||
}
|
||||
}
|
||||
//printf("\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Populate the initial safe table.
|
||||
@ -1329,21 +1325,25 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
||||
// Row 1 is the start sate.
|
||||
// Row 2 and beyond are other states, initially one per char class, but
|
||||
// after initial construction, many of the states will be combined, compacting the table.)
|
||||
// The String holds the nextState data only. The four leading fields of a row, fAccepting,
|
||||
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
|
||||
|
||||
U_ASSERT(fSafeTable == nullptr);
|
||||
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
|
||||
for (int32_t row=0; row<numCharClasses + 2; ++row) {
|
||||
fSafeTable->addElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status);
|
||||
fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
|
||||
}
|
||||
|
||||
// From the start state, each input char class transitions to the state for that input.
|
||||
UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1);
|
||||
for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
|
||||
// Note: +2 for the start & stop state; +4 for header columns in state table.
|
||||
startState.setCharAt(charClass+4, charClass+2);
|
||||
// Note: +2 for the start & stop state.
|
||||
startState.setCharAt(charClass, charClass+2);
|
||||
}
|
||||
|
||||
// Initially make every other state table row look like the start state row,
|
||||
for (int32_t row=2; row<numCharClasses+2; ++row) {
|
||||
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(1);
|
||||
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(row);
|
||||
rowState = startState; // UnicodeString assignment, copies contents.
|
||||
}
|
||||
|
||||
@ -1355,13 +1355,85 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
||||
int32_t c2 = pair & 0x0000ffff;
|
||||
|
||||
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2);
|
||||
rowState.setCharAt(c1 + 4, 0);
|
||||
rowState.setCharAt(c1, 0);
|
||||
}
|
||||
|
||||
// Merge similar states.
|
||||
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// getSafeTableSize() Calculate the size of the runtime form of this
|
||||
// safe state table.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
int32_t RBBITableBuilder::getSafeTableSize() const {
|
||||
int32_t size = 0;
|
||||
int32_t numRows;
|
||||
int32_t numCols;
|
||||
int32_t rowSize;
|
||||
|
||||
if (fSafeTable == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
|
||||
|
||||
numRows = fSafeTable->size();
|
||||
numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
|
||||
size += numRows * rowSize;
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// exportTable() export the state transition table in the format required
|
||||
// by the runtime engine. getTableSize() bytes of memory
|
||||
// must be available at the output address "where".
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::exportSafeTable(void *where) {
|
||||
RBBIStateTable *table = (RBBIStateTable *)where;
|
||||
uint32_t state;
|
||||
int col;
|
||||
|
||||
if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
|
||||
if (catCount > 0x7fff ||
|
||||
fSafeTable->size() > 0x7fff) {
|
||||
*fStatus = U_BRK_INTERNAL_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
|
||||
table->fNumStates = fSafeTable->size();
|
||||
table->fFlags = 0;
|
||||
table->fReserved = 0;
|
||||
|
||||
for (state=0; state<table->fNumStates; state++) {
|
||||
UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
|
||||
row->fAccepting = 0;
|
||||
row->fLookAhead = 0;
|
||||
row->fTagIdx = 0;
|
||||
row->fReserved = 0;
|
||||
for (col=0; col<catCount; col++) {
|
||||
row->fNextState[col] = rowString->charAt(col);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// printSet Debug function. Print the contents of a UVector
|
||||
@ -1415,6 +1487,47 @@ void RBBITableBuilder::printStates() {
|
||||
#endif
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// printSafeTable Debug Function. Dump the fully constructed safe table.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
#ifdef RBBI_DEBUG
|
||||
void RBBITableBuilder::printSafeTable() {
|
||||
int c; // input "character"
|
||||
int n; // state number
|
||||
|
||||
RBBIDebugPrintf(" Safe Reverse Table \n");
|
||||
if (fSafeTable == nullptr) {
|
||||
RBBIDebugPrintf(" --- nullptr ---\n");
|
||||
return;
|
||||
}
|
||||
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
|
||||
RBBIDebugPrintf(" | Acc LA Tag");
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
RBBIDebugPrintf(" %2d", c);
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
RBBIDebugPrintf(" |---------------");
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
RBBIDebugPrintf("---");
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
|
||||
for (n=0; n<fSafeTable->size(); n++) {
|
||||
UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
|
||||
RBBIDebugPrintf(" %3d | " , n);
|
||||
RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
RBBIDebugPrintf(" %2d", rowString->charAt(c));
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
RBBIDebugPrintf("\n\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -123,11 +123,13 @@ public:
|
||||
void printPosSets(RBBINode *n /* = NULL*/);
|
||||
void printStates();
|
||||
void printRuleStatusTable();
|
||||
void printSafeTable();
|
||||
#else
|
||||
#define printSet(s)
|
||||
#define printPosSets(n)
|
||||
#define printStates()
|
||||
#define printRuleStatusTable()
|
||||
#define printSafeTable()
|
||||
#endif
|
||||
|
||||
private:
|
||||
|
@ -4544,6 +4544,47 @@ void RBBITest::TestBug13447() {
|
||||
// for tracing without a lot of unwanted extra stuff happening.
|
||||
//
|
||||
void RBBITest::TestDebug(void) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
|
||||
BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
|
||||
const UnicodeString &rules = bi->getRules();
|
||||
UParseError pe;
|
||||
LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
|
||||
assertSuccess(WHERE, status);
|
||||
|
||||
#if 0
|
||||
bi->dumpTables();
|
||||
|
||||
RBBIDataWrapper *dw = bi->fData;
|
||||
const RBBIStateTable *fwtbl = dw->fForwardTable;
|
||||
int32_t numCharClasses = dw->fHeader->fCatCount;
|
||||
printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
|
||||
|
||||
for (int32_t c1=0; c1<numCharClasses; ++c1) {
|
||||
for (int32_t c2=0; c2 < numCharClasses; ++c2) {
|
||||
int32_t wantedEndState = -1;
|
||||
int32_t endState = 0;
|
||||
for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
|
||||
int32_t s2 = row->fNextState[c1];
|
||||
row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
|
||||
endState = row->fNextState[c2];
|
||||
if (wantedEndState < 0) {
|
||||
wantedEndState = endState;
|
||||
} else {
|
||||
if (wantedEndState != endState) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (wantedEndState == endState) {
|
||||
printf("(%d, %d) ", c1, c2);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
void RBBITest::TestProperties() {
|
||||
|
8
icu4c/source/test/testdata/rbbitst.txt
vendored
8
icu4c/source/test/testdata/rbbitst.txt
vendored
@ -1370,7 +1370,7 @@ Bangkok)•</data>
|
||||
<rules>
|
||||
!!forward;
|
||||
Hello\ World;
|
||||
!!reverse;
|
||||
!!safe_reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•Hello World•</data>
|
||||
@ -1379,7 +1379,7 @@ Bangkok)•</data>
|
||||
!!quoted_literals_only;
|
||||
!!forward;
|
||||
Hello\ World;
|
||||
!!reverse;
|
||||
!!safe_reverse;
|
||||
.*;
|
||||
</badrules>
|
||||
|
||||
@ -1387,7 +1387,7 @@ Bangkok)•</data>
|
||||
!!quoted_literals_only;
|
||||
!!forward;
|
||||
'Hello World';
|
||||
!!reverse;
|
||||
!!safe_reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•Hello World•</data>
|
||||
@ -1399,7 +1399,7 @@ Bangkok)•</data>
|
||||
<rules>
|
||||
!!forward;
|
||||
.;
|
||||
!!reverse;
|
||||
!!safe_reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•</data>
|
||||
|
Loading…
Reference in New Issue
Block a user