ICU-13590 RBBI, improve handling of concurrent look-ahead rules.
Change the mapping from rule number to boundary position to use a simple array instead of a linear search lookup map. Look-ahead rules have a preceding context, a boundary position, and following context. In the implementation, when the preceding context matches, the potential boundary position is saved. Then, if the following context proves to match, the saved boundary is returned as an actual boundary. Look-ahead rules are numbered, and the implementation maintains a map from rule number to the tentative saved boundary position. In an earlier improvement to the rule builder, the rule numbering was changed to be a contiguous sequence, from the original sparse numbering. In anticipation of changing the mapping from number to position to use a simple array.
This commit is contained in:
parent
2de2585f1b
commit
003b431540
@ -68,10 +68,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
|
||||
init(status);
|
||||
fData = new RBBIDataWrapper(data, status); // status checked in constructor
|
||||
if (U_FAILURE(status)) {return;}
|
||||
if(fData == 0) {
|
||||
if(fData == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if (fData->fForwardTable->fLookAheadResultsSize > 0) {
|
||||
fLookAheadMatches = static_cast<int32_t *>(
|
||||
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
|
||||
if (fLookAheadMatches == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
@ -98,10 +106,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
|
||||
}
|
||||
fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
|
||||
if (U_FAILURE(status)) {return;}
|
||||
if(fData == 0) {
|
||||
if(fData == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if (fData->fForwardTable->fLookAheadResultsSize > 0) {
|
||||
fLookAheadMatches = static_cast<int32_t *>(
|
||||
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
|
||||
if (fLookAheadMatches == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -117,10 +133,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
|
||||
init(status);
|
||||
fData = new RBBIDataWrapper(udm, status); // status checked in constructor
|
||||
if (U_FAILURE(status)) {return;}
|
||||
if(fData == 0) {
|
||||
if(fData == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if (fData->fForwardTable->fLookAheadResultsSize > 0) {
|
||||
fLookAheadMatches = static_cast<int32_t *>(
|
||||
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
|
||||
if (fLookAheadMatches == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -188,30 +212,34 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
|
||||
// fCharIter was adopted from the outside.
|
||||
delete fCharIter;
|
||||
}
|
||||
fCharIter = NULL;
|
||||
fCharIter = nullptr;
|
||||
|
||||
utext_close(&fText);
|
||||
|
||||
if (fData != NULL) {
|
||||
if (fData != nullptr) {
|
||||
fData->removeReference();
|
||||
fData = NULL;
|
||||
fData = nullptr;
|
||||
}
|
||||
delete fBreakCache;
|
||||
fBreakCache = NULL;
|
||||
fBreakCache = nullptr;
|
||||
|
||||
delete fDictionaryCache;
|
||||
fDictionaryCache = NULL;
|
||||
fDictionaryCache = nullptr;
|
||||
|
||||
delete fLanguageBreakEngines;
|
||||
fLanguageBreakEngines = NULL;
|
||||
fLanguageBreakEngines = nullptr;
|
||||
|
||||
delete fUnhandledBreakEngine;
|
||||
fUnhandledBreakEngine = NULL;
|
||||
fUnhandledBreakEngine = nullptr;
|
||||
|
||||
uprv_free(fLookAheadMatches);
|
||||
fLookAheadMatches = nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
* TODO: needs better handling of memory allocation errors.
|
||||
*/
|
||||
RuleBasedBreakIterator&
|
||||
RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
||||
@ -252,6 +280,14 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
||||
fData = that.fData->addReference();
|
||||
}
|
||||
|
||||
uprv_free(fLookAheadMatches);
|
||||
fLookAheadMatches = nullptr;
|
||||
if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) {
|
||||
fLookAheadMatches = static_cast<int32_t *>(
|
||||
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
|
||||
}
|
||||
|
||||
|
||||
fPosition = that.fPosition;
|
||||
fRuleStatusIndex = that.fRuleStatusIndex;
|
||||
fDone = that.fDone;
|
||||
@ -275,16 +311,17 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RuleBasedBreakIterator::init(UErrorCode &status) {
|
||||
fCharIter = NULL;
|
||||
fData = NULL;
|
||||
fCharIter = nullptr;
|
||||
fData = nullptr;
|
||||
fPosition = 0;
|
||||
fRuleStatusIndex = 0;
|
||||
fDone = false;
|
||||
fDictionaryCharCount = 0;
|
||||
fLanguageBreakEngines = NULL;
|
||||
fUnhandledBreakEngine = NULL;
|
||||
fBreakCache = NULL;
|
||||
fDictionaryCache = NULL;
|
||||
fLanguageBreakEngines = nullptr;
|
||||
fUnhandledBreakEngine = nullptr;
|
||||
fBreakCache = nullptr;
|
||||
fDictionaryCache = nullptr;
|
||||
fLookAheadMatches = nullptr;
|
||||
|
||||
// Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
|
||||
// fText = UTEXT_INITIALIZER;
|
||||
@ -700,52 +737,6 @@ enum RBBIRunMode {
|
||||
};
|
||||
|
||||
|
||||
// Map from look-ahead break states (corresponds to rules) to boundary positions.
|
||||
// Allows multiple lookahead break rules to be in flight at the same time.
|
||||
//
|
||||
// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
|
||||
// in the state table be sequential, then we can just index an array. And the
|
||||
// table could also tell us in advance how big that array needs to be.
|
||||
//
|
||||
// Before ICU 57 there was just a single simple variable for a look-ahead match that
|
||||
// was in progress. Two rules at once did not work.
|
||||
|
||||
static const int32_t kMaxLookaheads = 8;
|
||||
struct LookAheadResults {
|
||||
int32_t fUsedSlotLimit;
|
||||
int32_t fPositions[8];
|
||||
uint16_t fKeys[8];
|
||||
|
||||
LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
|
||||
|
||||
int32_t getPosition(uint16_t key) {
|
||||
for (int32_t i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
return fPositions[i];
|
||||
}
|
||||
}
|
||||
UPRV_UNREACHABLE;
|
||||
}
|
||||
|
||||
void setPosition(uint16_t key, int32_t position) {
|
||||
int32_t i;
|
||||
for (i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
fPositions[i] = position;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (i >= kMaxLookaheads) {
|
||||
UPRV_UNREACHABLE;
|
||||
}
|
||||
fKeys[i] = key;
|
||||
fPositions[i] = position;
|
||||
U_ASSERT(fUsedSlotLimit == i);
|
||||
fUsedSlotLimit = i + 1;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
|
||||
// instantiation, based on whether an 8 or 16 bit table is required.
|
||||
//
|
||||
@ -809,7 +800,6 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
|
||||
RowType *row;
|
||||
UChar32 c;
|
||||
LookAheadResults lookAheadMatches;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
const RBBIStateTable *statetable = fData->fForwardTable;
|
||||
@ -912,7 +902,8 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
|
||||
} else if (accepting > ACCEPTING_UNCONDITIONAL) {
|
||||
// Lookahead match is completed.
|
||||
int32_t lookaheadResult = lookAheadMatches.getPosition(accepting);
|
||||
U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
|
||||
int32_t lookaheadResult = fLookAheadMatches[accepting];
|
||||
if (lookaheadResult >= 0) {
|
||||
fRuleStatusIndex = row->fTagsIdx;
|
||||
fPosition = lookaheadResult;
|
||||
@ -927,9 +918,11 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
// But there are line break test failures when trying this. Investigate.
|
||||
// Issue ICU-20837
|
||||
uint16_t rule = row->fLookAhead;
|
||||
if (rule != 0) {
|
||||
U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL);
|
||||
U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize);
|
||||
if (rule > ACCEPTING_UNCONDITIONAL) {
|
||||
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
lookAheadMatches.setPosition(rule, pos);
|
||||
fLookAheadMatches[rule] = pos;
|
||||
}
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
|
@ -230,14 +230,16 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
|
||||
uint32_t c;
|
||||
uint32_t s;
|
||||
|
||||
RBBIDebugPrintf(" %s\n", heading);
|
||||
RBBIDebugPrintf("%s\n", heading);
|
||||
|
||||
RBBIDebugPrintf("Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
|
||||
RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
|
||||
RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
|
||||
RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
|
||||
table->fFlags,
|
||||
table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
|
||||
table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
|
||||
table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
|
||||
RBBIDebugPrintf("State | Acc LA TagIx");
|
||||
RBBIDebugPrintf("\nState | Acc LA TagIx");
|
||||
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
|
||||
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("----");
|
||||
|
@ -137,6 +137,8 @@ struct RBBIStateTable {
|
||||
uint32_t fDictCategoriesStart; // Char category number of the first dictionary
|
||||
// char class, or the the largest category number + 1
|
||||
// if there are no dictionary categories.
|
||||
uint32_t fLookAheadResultsSize; // Size of run-time array required for holding
|
||||
// look-ahead results. Indexed by row.fLookAhead.
|
||||
uint32_t fFlags; // Option Flags for this state table.
|
||||
char fTableData[1]; // First RBBIStateTableRow begins here.
|
||||
// Variable-length array declared with length 1
|
||||
|
@ -715,12 +715,6 @@ void RBBITableBuilder::mapLookAheadRules() {
|
||||
}
|
||||
fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
|
||||
|
||||
// Counter used when assigning lookahead rule numbers.
|
||||
// Contains the last look-ahead number already in use.
|
||||
// The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
|
||||
// for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.
|
||||
int32_t laSlotsInUse = ACCEPTING_UNCONDITIONAL;
|
||||
|
||||
for (int32_t n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
int32_t laSlotForState = 0;
|
||||
@ -756,7 +750,7 @@ void RBBITableBuilder::mapLookAheadRules() {
|
||||
}
|
||||
|
||||
if (laSlotForState == 0) {
|
||||
laSlotForState = ++laSlotsInUse;
|
||||
laSlotForState = ++fLASlotsInUse;
|
||||
}
|
||||
|
||||
// For each look ahead node covered by this state,
|
||||
@ -1386,6 +1380,7 @@ void RBBITableBuilder::exportTable(void *where) {
|
||||
|
||||
table->fNumStates = fDStates->size();
|
||||
table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
|
||||
table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
|
||||
table->fFlags = 0;
|
||||
if (use8BitsForTable()) {
|
||||
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "rbbidata.h"
|
||||
#include "rbbirb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
@ -184,9 +185,15 @@ private:
|
||||
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
|
||||
UVector32 *fLookAheadRuleMap = nullptr;
|
||||
|
||||
/* Counter used when assigning lookahead rule numbers.
|
||||
* Contains the last look-ahead number already in use.
|
||||
* The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
|
||||
* for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
|
||||
int32_t fLASlotsInUse = ACCEPTING_UNCONDITIONAL;
|
||||
|
||||
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
|
||||
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
|
||||
|
||||
RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
|
||||
RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
|
||||
};
|
||||
|
||||
//
|
||||
|
@ -142,6 +142,11 @@ private:
|
||||
*/
|
||||
UBool fDone;
|
||||
|
||||
/**
|
||||
* Array of look-ahead tentative results.
|
||||
*/
|
||||
int32_t *fLookAheadMatches;
|
||||
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
@ -133,6 +133,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
|
||||
TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
|
||||
TESTCASE_AUTO(TestTable_8_16_Bits);
|
||||
TESTCASE_AUTO(TestBug13590);
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
TESTCASE_AUTO(TestTraceCreateCharacter);
|
||||
@ -5078,6 +5079,63 @@ void RBBITest::TestTable_8_16_Bits() {
|
||||
}
|
||||
}
|
||||
|
||||
/* Test handling of a large number of look-ahead rules.
|
||||
* The number of rules in the test exceeds the implementation limits prior to the
|
||||
* improvements introduced with #13590.
|
||||
*
|
||||
* The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
|
||||
* The text being matched is sequential, "ABCDEFGHI..."
|
||||
*
|
||||
* The upshot is that the look-ahead rules all match on their preceding context,
|
||||
* and consequently must save a potential result, but then fail to match on their
|
||||
* trailing context, so that they don't actually cause a boundary.
|
||||
*
|
||||
* Additionally, add a ".*" rule, so there are no boundaries unless a
|
||||
* look-ahead hard-break rule forces one.
|
||||
*/
|
||||
void RBBITest::TestBug13590() {
|
||||
UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
|
||||
|
||||
const int NUM_LOOKAHEAD_RULES = 50;
|
||||
const char16_t STARTING_CHAR = u'\u5000';
|
||||
char16_t firstChar;
|
||||
for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
|
||||
firstChar = STARTING_CHAR + ruleNum*2;
|
||||
rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
|
||||
.append(u' ') .append(u'/') .append(u' ')
|
||||
.append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
|
||||
.append(u';') .append(u'\n');
|
||||
}
|
||||
|
||||
// Change the last rule added from the form "UV / WY" to "UV / WX".
|
||||
// Changes the rule so that it will match - all 4 chars are in ascending sequence.
|
||||
rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
RuleBasedBreakIterator bi(rules, parseError, status);
|
||||
if (!assertSuccess(WHERE, status)) {
|
||||
errln(rules);
|
||||
return;
|
||||
}
|
||||
// bi.dumpTables();
|
||||
|
||||
UnicodeString testString;
|
||||
for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
|
||||
testString.append(c);
|
||||
}
|
||||
bi.setText(testString);
|
||||
|
||||
int breaksFound = 0;
|
||||
while (bi.next() != UBRK_DONE) {
|
||||
++breaksFound;
|
||||
}
|
||||
|
||||
// Two matches are expected, one from the last rule that was explicitly modified,
|
||||
// and one at the end of the text.
|
||||
assertEquals(WHERE, 2, breaksFound);
|
||||
}
|
||||
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
static std::vector<std::string> gData;
|
||||
|
@ -91,6 +91,7 @@ public:
|
||||
void Test16BitsTrieWith8BitStateTable();
|
||||
void Test16BitsTrieWith16BitStateTable();
|
||||
void TestTable_8_16_Bits();
|
||||
void TestBug13590();
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
void TestTraceCreateCharacter();
|
||||
|
@ -46,6 +46,11 @@ public final class RBBIDataWrapper {
|
||||
* or the the largest category number + 1 if there are no dictionary categories.
|
||||
*/
|
||||
public int fDictCategoriesStart;
|
||||
/**
|
||||
* Size of run-time array required for holding
|
||||
* look-ahead results. Indexed by row.fLookAhead.
|
||||
*/
|
||||
public int fLookAheadResultsSize;
|
||||
/**
|
||||
* Option Flags for this state table.
|
||||
*/
|
||||
@ -54,7 +59,7 @@ public final class RBBIDataWrapper {
|
||||
* Length in bytes of the state table header, of all the int32 fields
|
||||
* preceding fTable in the serialized form.
|
||||
*/
|
||||
public static int fHeaderSize = 16;
|
||||
public static int fHeaderSize = 20;
|
||||
/**
|
||||
* Linear array of next state values, accessed as short[state, char_class]
|
||||
*/
|
||||
@ -74,6 +79,7 @@ public final class RBBIDataWrapper {
|
||||
This.fNumStates = bytes.getInt();
|
||||
This.fRowLen = bytes.getInt();
|
||||
This.fDictCategoriesStart = bytes.getInt();
|
||||
This.fLookAheadResultsSize = bytes.getInt();
|
||||
This.fFlags = bytes.getInt();
|
||||
int lengthOfTable = length - fHeaderSize; // length in bytes.
|
||||
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||||
@ -94,6 +100,7 @@ public final class RBBIDataWrapper {
|
||||
bytes.writeInt(fNumStates);
|
||||
bytes.writeInt(fRowLen);
|
||||
bytes.writeInt(fDictCategoriesStart);
|
||||
bytes.writeInt(fLookAheadResultsSize);
|
||||
bytes.writeInt(fFlags);
|
||||
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
|
||||
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
|
||||
@ -131,6 +138,7 @@ public final class RBBIDataWrapper {
|
||||
if (fNumStates != otherST.fNumStates) return false;
|
||||
if (fRowLen != otherST.fRowLen) return false;
|
||||
if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false;
|
||||
if (fLookAheadResultsSize != otherST.fLookAheadResultsSize) return false;
|
||||
if (fFlags != otherST.fFlags) return false;
|
||||
return Arrays.equals(fTable, otherST.fTable);
|
||||
}
|
||||
|
@ -79,6 +79,12 @@ class RBBITableBuilder {
|
||||
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
|
||||
int[] fLookAheadRuleMap;
|
||||
|
||||
/** Counter used when assigning lookahead rule numbers.
|
||||
* Contains the last look-ahead number already in use.
|
||||
* The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
|
||||
* for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
|
||||
int fLASlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor for RBBITableBuilder.
|
||||
@ -617,12 +623,6 @@ class RBBITableBuilder {
|
||||
void mapLookAheadRules() {
|
||||
fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
|
||||
|
||||
// Counter used when assigning lookahead rule numbers.
|
||||
// Contains the last look-ahead number already in use.
|
||||
// The first look-ahead number is 2;
|
||||
// Number 1 is reserved for non-lookahead accepting states.
|
||||
int laSlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
|
||||
|
||||
for (RBBIStateDescriptor sd: fDStates) {
|
||||
int laSlotForState = 0;
|
||||
|
||||
@ -656,7 +656,7 @@ class RBBITableBuilder {
|
||||
}
|
||||
|
||||
if (laSlotForState == 0) {
|
||||
laSlotForState = ++laSlotsInUse;
|
||||
laSlotForState = ++fLASlotsInUse;
|
||||
}
|
||||
|
||||
// For each look ahead node covered by this state,
|
||||
@ -1139,6 +1139,8 @@ class RBBITableBuilder {
|
||||
fDStates.size() < 0x7fff);
|
||||
table.fNumStates = fDStates.size();
|
||||
table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
|
||||
table.fLookAheadResultsSize =
|
||||
fLASlotsInUse == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
|
||||
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
|
||||
// Size of table size in shorts.
|
||||
|
@ -70,6 +70,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
|
||||
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
|
||||
This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
|
||||
This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
|
||||
return This;
|
||||
}
|
||||
|
||||
@ -93,6 +94,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
|
||||
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
|
||||
This.fRData = RBBIDataWrapper.get(bytes);
|
||||
This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
|
||||
return This;
|
||||
}
|
||||
|
||||
@ -107,6 +109,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
|
||||
compileRules(rules, ruleOS);
|
||||
fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
|
||||
fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
|
||||
} catch (IOException e) {
|
||||
///CLOVER:OFF
|
||||
// An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
|
||||
@ -138,7 +141,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
synchronized (gAllBreakEngines) {
|
||||
result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
|
||||
}
|
||||
result.fLookAheadMatches = new LookAheadResults();
|
||||
result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
|
||||
result.fBreakCache = result.new BreakCache(fBreakCache);
|
||||
result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
|
||||
return result;
|
||||
@ -251,6 +254,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
*/
|
||||
private boolean fDone;
|
||||
|
||||
/**
|
||||
* Array of look-ahead tentative results.
|
||||
*/
|
||||
private int[] fLookAheadMatches;
|
||||
|
||||
/**
|
||||
* Cache of previously determined boundary positions.
|
||||
*/
|
||||
@ -740,53 +748,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
} // end synchronized(gAllBreakEngines)
|
||||
}
|
||||
|
||||
private static final int kMaxLookaheads = 8;
|
||||
private static class LookAheadResults {
|
||||
int fUsedSlotLimit;
|
||||
int[] fPositions;
|
||||
int[] fKeys;
|
||||
|
||||
LookAheadResults() {
|
||||
fUsedSlotLimit= 0;
|
||||
fPositions = new int[kMaxLookaheads];
|
||||
fKeys = new int[kMaxLookaheads];
|
||||
}
|
||||
|
||||
int getPosition(int key) {
|
||||
for (int i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
return fPositions[i];
|
||||
}
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
}
|
||||
|
||||
void setPosition(int key, int position) {
|
||||
int i;
|
||||
for (i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
fPositions[i] = position;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (i >= kMaxLookaheads) {
|
||||
assert(false);
|
||||
i = kMaxLookaheads - 1;
|
||||
}
|
||||
fKeys[i] = key;
|
||||
fPositions[i] = position;
|
||||
assert(fUsedSlotLimit == i);
|
||||
fUsedSlotLimit = i + 1;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
fUsedSlotLimit = 0;
|
||||
}
|
||||
};
|
||||
private LookAheadResults fLookAheadMatches = new LookAheadResults();
|
||||
|
||||
|
||||
/**
|
||||
* The State Machine Engine for moving forward is here.
|
||||
* This function is the heart of the RBBI run time engine.
|
||||
@ -854,7 +815,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||||
}
|
||||
}
|
||||
fLookAheadMatches.reset();
|
||||
|
||||
// loop until we reach the end of the text or transition to state 0
|
||||
while (state != STOP_STATE) {
|
||||
@ -921,7 +881,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
|
||||
} else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
|
||||
// Lookahead match is completed
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(accepting);
|
||||
int lookaheadResult = fLookAheadMatches[accepting];
|
||||
if (lookaheadResult >= 0) {
|
||||
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
|
||||
fPosition = lookaheadResult;
|
||||
@ -944,7 +904,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// We want the beginning of it.
|
||||
pos--;
|
||||
}
|
||||
fLookAheadMatches.setPosition(rule, pos);
|
||||
fLookAheadMatches[rule] = pos;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c
|
||||
size 13142859
|
||||
oid sha256:73ba0e9a3520637ead2b6a0579902b3622e6611c3b9a6b22b835576a990982bf
|
||||
size 13147665
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7
|
||||
oid sha256:2f0c964909d165506be2914219f625245cf9e9e26ebfad0c78843f23b9740375
|
||||
size 94529
|
||||
|
@ -852,4 +852,57 @@ public class RBBITest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Test handling of a large number of look-ahead rules.
|
||||
* The number of rules in the test exceeds the implementation limits prior to the
|
||||
* improvements introduced with #13590.
|
||||
*
|
||||
* The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
|
||||
* The text being matched is sequential, "ABCDEFGHI..."
|
||||
*
|
||||
* The upshot is that the look-ahead rules all match on their preceding context,
|
||||
* and consequently must save a potential result, but then fail to match on their
|
||||
* trailing context, so that they don't actually cause a boundary.
|
||||
*
|
||||
* Additionally, add a ".*" rule, so there are no boundaries unless a
|
||||
* look-ahead hard-break rule forces one.
|
||||
*/
|
||||
@Test
|
||||
public void TestBug13590() {
|
||||
StringBuilder rules = new StringBuilder("!!quoted_literals_only; !!chain; .*;\n");
|
||||
|
||||
int NUM_LOOKAHEAD_RULES = 50;
|
||||
char STARTING_CHAR = '\u5000';
|
||||
char firstChar = 0;
|
||||
for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
|
||||
firstChar = (char) (STARTING_CHAR + ruleNum*2);
|
||||
rules.append('\'') .append(firstChar) .append((char)(firstChar+1)) .append('\'')
|
||||
.append(' ') .append('/') .append(' ')
|
||||
.append('\'') .append((char)(firstChar+2)) .append((char)(firstChar+4)) .append('\'')
|
||||
.append(';') .append('\n');
|
||||
}
|
||||
|
||||
// Change the last rule added from the form "UV / WY" to "UV / WX".
|
||||
// Changes the rule so that it will match - all 4 chars are in ascending sequence.
|
||||
String rulesStr = rules.toString().replace((char)(firstChar+4), (char)(firstChar+3));
|
||||
|
||||
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rulesStr);
|
||||
// bi.dump(System.out);
|
||||
|
||||
StringBuilder testString = new StringBuilder();
|
||||
for (char c = (char) (STARTING_CHAR-200); c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
|
||||
testString.append(c);
|
||||
}
|
||||
bi.setText(testString);
|
||||
|
||||
int breaksFound = 0;
|
||||
while (bi.next() != BreakIterator.DONE) {
|
||||
++breaksFound;
|
||||
}
|
||||
|
||||
// Two matches are expected, one from the last rule that was explicitly modified,
|
||||
// and one at the end of the text.
|
||||
assertEquals("Wrong number of breaks found", 2, breaksFound);
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user