ICU-13590 RBBI, improve handling of concurrent look-ahead rules.

Change the mapping from rule number to boundary position to use a simple array
instead of a linear search lookup map.

Look-ahead rules have a preceding context, a boundary position, and following context.
In the implementation, when the preceding context matches, the potential boundary
position is saved. Then, if the following context proves to match, the saved boundary is
returned as an actual boundary.

Look-ahead rules are numbered, and the implementation maintains a map from
rule number to the tentative saved boundary position.

In an earlier improvement to the rule builder, the rule numbering was changed to be a
contiguous sequence, from the original sparse numbering. In anticipation of
changing the mapping from number to position to use a simple array.
This commit is contained in:
Andy Heninger 2020-06-26 17:52:40 -07:00
parent 2de2585f1b
commit 003b431540
14 changed files with 226 additions and 140 deletions

View File

@ -68,10 +68,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
init(status);
fData = new RBBIDataWrapper(data, status); // status checked in constructor
if (U_FAILURE(status)) {return;}
if(fData == 0) {
if(fData == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
if (fData->fForwardTable->fLookAheadResultsSize > 0) {
fLookAheadMatches = static_cast<int32_t *>(
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
if (fLookAheadMatches == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
}
//
@ -98,10 +106,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
}
fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
if (U_FAILURE(status)) {return;}
if(fData == 0) {
if(fData == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
if (fData->fForwardTable->fLookAheadResultsSize > 0) {
fLookAheadMatches = static_cast<int32_t *>(
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
if (fLookAheadMatches == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
}
@ -117,10 +133,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
init(status);
fData = new RBBIDataWrapper(udm, status); // status checked in constructor
if (U_FAILURE(status)) {return;}
if(fData == 0) {
if(fData == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
if (fData->fForwardTable->fLookAheadResultsSize > 0) {
fLookAheadMatches = static_cast<int32_t *>(
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
if (fLookAheadMatches == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
}
@ -188,30 +212,34 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
// fCharIter was adopted from the outside.
delete fCharIter;
}
fCharIter = NULL;
fCharIter = nullptr;
utext_close(&fText);
if (fData != NULL) {
if (fData != nullptr) {
fData->removeReference();
fData = NULL;
fData = nullptr;
}
delete fBreakCache;
fBreakCache = NULL;
fBreakCache = nullptr;
delete fDictionaryCache;
fDictionaryCache = NULL;
fDictionaryCache = nullptr;
delete fLanguageBreakEngines;
fLanguageBreakEngines = NULL;
fLanguageBreakEngines = nullptr;
delete fUnhandledBreakEngine;
fUnhandledBreakEngine = NULL;
fUnhandledBreakEngine = nullptr;
uprv_free(fLookAheadMatches);
fLookAheadMatches = nullptr;
}
/**
* Assignment operator. Sets this iterator to have the same behavior,
* and iterate over the same text, as the one passed in.
* TODO: needs better handling of memory allocation errors.
*/
RuleBasedBreakIterator&
RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
@ -252,6 +280,14 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
fData = that.fData->addReference();
}
uprv_free(fLookAheadMatches);
fLookAheadMatches = nullptr;
if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) {
fLookAheadMatches = static_cast<int32_t *>(
uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
}
fPosition = that.fPosition;
fRuleStatusIndex = that.fRuleStatusIndex;
fDone = that.fDone;
@ -275,16 +311,17 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
//
//-----------------------------------------------------------------------------
void RuleBasedBreakIterator::init(UErrorCode &status) {
fCharIter = NULL;
fData = NULL;
fCharIter = nullptr;
fData = nullptr;
fPosition = 0;
fRuleStatusIndex = 0;
fDone = false;
fDictionaryCharCount = 0;
fLanguageBreakEngines = NULL;
fUnhandledBreakEngine = NULL;
fBreakCache = NULL;
fDictionaryCache = NULL;
fLanguageBreakEngines = nullptr;
fUnhandledBreakEngine = nullptr;
fBreakCache = nullptr;
fDictionaryCache = nullptr;
fLookAheadMatches = nullptr;
// Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
// fText = UTEXT_INITIALIZER;
@ -700,52 +737,6 @@ enum RBBIRunMode {
};
// Map from look-ahead break states (corresponds to rules) to boundary positions.
// Allows multiple lookahead break rules to be in flight at the same time.
//
// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
// in the state table be sequential, then we can just index an array. And the
// table could also tell us in advance how big that array needs to be.
//
// Before ICU 57 there was just a single simple variable for a look-ahead match that
// was in progress. Two rules at once did not work.
static const int32_t kMaxLookaheads = 8;
struct LookAheadResults {
int32_t fUsedSlotLimit;
int32_t fPositions[8];
uint16_t fKeys[8];
LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
int32_t getPosition(uint16_t key) {
for (int32_t i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
return fPositions[i];
}
}
UPRV_UNREACHABLE;
}
void setPosition(uint16_t key, int32_t position) {
int32_t i;
for (i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
fPositions[i] = position;
return;
}
}
if (i >= kMaxLookaheads) {
UPRV_UNREACHABLE;
}
fKeys[i] = key;
fPositions[i] = position;
U_ASSERT(fUsedSlotLimit == i);
fUsedSlotLimit = i + 1;
}
};
// Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
// instantiation, based on whether an 8 or 16 bit table is required.
//
@ -809,7 +800,6 @@ int32_t RuleBasedBreakIterator::handleNext() {
RowType *row;
UChar32 c;
LookAheadResults lookAheadMatches;
int32_t result = 0;
int32_t initialPosition = 0;
const RBBIStateTable *statetable = fData->fForwardTable;
@ -912,7 +902,8 @@ int32_t RuleBasedBreakIterator::handleNext() {
fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
} else if (accepting > ACCEPTING_UNCONDITIONAL) {
// Lookahead match is completed.
int32_t lookaheadResult = lookAheadMatches.getPosition(accepting);
U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
int32_t lookaheadResult = fLookAheadMatches[accepting];
if (lookaheadResult >= 0) {
fRuleStatusIndex = row->fTagsIdx;
fPosition = lookaheadResult;
@ -927,9 +918,11 @@ int32_t RuleBasedBreakIterator::handleNext() {
// But there are line break test failures when trying this. Investigate.
// Issue ICU-20837
uint16_t rule = row->fLookAhead;
if (rule != 0) {
U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL);
U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize);
if (rule > ACCEPTING_UNCONDITIONAL) {
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
lookAheadMatches.setPosition(rule, pos);
fLookAheadMatches[rule] = pos;
}
if (state == STOP_STATE) {

View File

@ -230,14 +230,16 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
uint32_t c;
uint32_t s;
RBBIDebugPrintf(" %s\n", heading);
RBBIDebugPrintf("%s\n", heading);
RBBIDebugPrintf("Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
table->fFlags,
table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
RBBIDebugPrintf("State | Acc LA TagIx");
RBBIDebugPrintf("\nState | Acc LA TagIx");
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("----");

View File

@ -137,6 +137,8 @@ struct RBBIStateTable {
uint32_t fDictCategoriesStart; // Char category number of the first dictionary
// char class, or the the largest category number + 1
// if there are no dictionary categories.
uint32_t fLookAheadResultsSize; // Size of run-time array required for holding
// look-ahead results. Indexed by row.fLookAhead.
uint32_t fFlags; // Option Flags for this state table.
char fTableData[1]; // First RBBIStateTableRow begins here.
// Variable-length array declared with length 1

View File

@ -715,12 +715,6 @@ void RBBITableBuilder::mapLookAheadRules() {
}
fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
// Counter used when assigning lookahead rule numbers.
// Contains the last look-ahead number already in use.
// The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
// for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.
int32_t laSlotsInUse = ACCEPTING_UNCONDITIONAL;
for (int32_t n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
int32_t laSlotForState = 0;
@ -756,7 +750,7 @@ void RBBITableBuilder::mapLookAheadRules() {
}
if (laSlotForState == 0) {
laSlotForState = ++laSlotsInUse;
laSlotForState = ++fLASlotsInUse;
}
// For each look ahead node covered by this state,
@ -1386,6 +1380,7 @@ void RBBITableBuilder::exportTable(void *where) {
table->fNumStates = fDStates->size();
table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
table->fFlags = 0;
if (use8BitsForTable()) {
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;

View File

@ -20,6 +20,7 @@
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "rbbinode.h"
@ -184,9 +185,15 @@ private:
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
UVector32 *fLookAheadRuleMap = nullptr;
/* Counter used when assigning lookahead rule numbers.
* Contains the last look-ahead number already in use.
* The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
* for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
int32_t fLASlotsInUse = ACCEPTING_UNCONDITIONAL;
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
};
//

View File

@ -142,6 +142,11 @@ private:
*/
UBool fDone;
/**
* Array of look-ahead tentative results.
*/
int32_t *fLookAheadMatches;
//=======================================================================
// constructors
//=======================================================================

View File

@ -133,6 +133,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
TESTCASE_AUTO(TestTable_8_16_Bits);
TESTCASE_AUTO(TestBug13590);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -5078,6 +5079,63 @@ void RBBITest::TestTable_8_16_Bits() {
}
}
/* Test handling of a large number of look-ahead rules.
* The number of rules in the test exceeds the implementation limits prior to the
* improvements introduced with #13590.
*
* The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
* The text being matched is sequential, "ABCDEFGHI..."
*
* The upshot is that the look-ahead rules all match on their preceding context,
* and consequently must save a potential result, but then fail to match on their
* trailing context, so that they don't actually cause a boundary.
*
* Additionally, add a ".*" rule, so there are no boundaries unless a
* look-ahead hard-break rule forces one.
*/
void RBBITest::TestBug13590() {
UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
const int NUM_LOOKAHEAD_RULES = 50;
const char16_t STARTING_CHAR = u'\u5000';
char16_t firstChar;
for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
firstChar = STARTING_CHAR + ruleNum*2;
rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
.append(u' ') .append(u'/') .append(u' ')
.append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
.append(u';') .append(u'\n');
}
// Change the last rule added from the form "UV / WY" to "UV / WX".
// Changes the rule so that it will match - all 4 chars are in ascending sequence.
rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
UErrorCode status = U_ZERO_ERROR;
UParseError parseError;
RuleBasedBreakIterator bi(rules, parseError, status);
if (!assertSuccess(WHERE, status)) {
errln(rules);
return;
}
// bi.dumpTables();
UnicodeString testString;
for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
testString.append(c);
}
bi.setText(testString);
int breaksFound = 0;
while (bi.next() != UBRK_DONE) {
++breaksFound;
}
// Two matches are expected, one from the last rule that was explicitly modified,
// and one at the end of the text.
assertEquals(WHERE, 2, breaksFound);
}
#if U_ENABLE_TRACING
static std::vector<std::string> gData;

View File

@ -91,6 +91,7 @@ public:
void Test16BitsTrieWith8BitStateTable();
void Test16BitsTrieWith16BitStateTable();
void TestTable_8_16_Bits();
void TestBug13590();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();

View File

@ -46,6 +46,11 @@ public final class RBBIDataWrapper {
* or the the largest category number + 1 if there are no dictionary categories.
*/
public int fDictCategoriesStart;
/**
* Size of run-time array required for holding
* look-ahead results. Indexed by row.fLookAhead.
*/
public int fLookAheadResultsSize;
/**
* Option Flags for this state table.
*/
@ -54,7 +59,7 @@ public final class RBBIDataWrapper {
* Length in bytes of the state table header, of all the int32 fields
* preceding fTable in the serialized form.
*/
public static int fHeaderSize = 16;
public static int fHeaderSize = 20;
/**
* Linear array of next state values, accessed as short[state, char_class]
*/
@ -74,6 +79,7 @@ public final class RBBIDataWrapper {
This.fNumStates = bytes.getInt();
This.fRowLen = bytes.getInt();
This.fDictCategoriesStart = bytes.getInt();
This.fLookAheadResultsSize = bytes.getInt();
This.fFlags = bytes.getInt();
int lengthOfTable = length - fHeaderSize; // length in bytes.
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
@ -94,6 +100,7 @@ public final class RBBIDataWrapper {
bytes.writeInt(fNumStates);
bytes.writeInt(fRowLen);
bytes.writeInt(fDictCategoriesStart);
bytes.writeInt(fLookAheadResultsSize);
bytes.writeInt(fFlags);
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
@ -131,6 +138,7 @@ public final class RBBIDataWrapper {
if (fNumStates != otherST.fNumStates) return false;
if (fRowLen != otherST.fRowLen) return false;
if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false;
if (fLookAheadResultsSize != otherST.fLookAheadResultsSize) return false;
if (fFlags != otherST.fFlags) return false;
return Arrays.equals(fTable, otherST.fTable);
}

View File

@ -79,6 +79,12 @@ class RBBITableBuilder {
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
int[] fLookAheadRuleMap;
/** Counter used when assigning lookahead rule numbers.
* Contains the last look-ahead number already in use.
* The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
* for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
int fLASlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
//-----------------------------------------------------------------------------
//
// Constructor for RBBITableBuilder.
@ -617,12 +623,6 @@ class RBBITableBuilder {
void mapLookAheadRules() {
fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
// Counter used when assigning lookahead rule numbers.
// Contains the last look-ahead number already in use.
// The first look-ahead number is 2;
// Number 1 is reserved for non-lookahead accepting states.
int laSlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
for (RBBIStateDescriptor sd: fDStates) {
int laSlotForState = 0;
@ -656,7 +656,7 @@ class RBBITableBuilder {
}
if (laSlotForState == 0) {
laSlotForState = ++laSlotsInUse;
laSlotForState = ++fLASlotsInUse;
}
// For each look ahead node covered by this state,
@ -1139,6 +1139,8 @@ class RBBITableBuilder {
fDStates.size() < 0x7fff);
table.fNumStates = fDStates.size();
table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
table.fLookAheadResultsSize =
fLASlotsInUse == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
// Size of table size in shorts.

View File

@ -70,6 +70,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
return This;
}
@ -93,6 +94,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
This.fRData = RBBIDataWrapper.get(bytes);
This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
return This;
}
@ -107,6 +109,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
compileRules(rules, ruleOS);
fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
} catch (IOException e) {
///CLOVER:OFF
// An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
@ -138,7 +141,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
synchronized (gAllBreakEngines) {
result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
}
result.fLookAheadMatches = new LookAheadResults();
result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
result.fBreakCache = result.new BreakCache(fBreakCache);
result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
return result;
@ -251,6 +254,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
private boolean fDone;
/**
* Array of look-ahead tentative results.
*/
private int[] fLookAheadMatches;
/**
* Cache of previously determined boundary positions.
*/
@ -740,53 +748,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
} // end synchronized(gAllBreakEngines)
}
private static final int kMaxLookaheads = 8;
private static class LookAheadResults {
int fUsedSlotLimit;
int[] fPositions;
int[] fKeys;
LookAheadResults() {
fUsedSlotLimit= 0;
fPositions = new int[kMaxLookaheads];
fKeys = new int[kMaxLookaheads];
}
int getPosition(int key) {
for (int i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
return fPositions[i];
}
}
assert(false);
return -1;
}
void setPosition(int key, int position) {
int i;
for (i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
fPositions[i] = position;
return;
}
}
if (i >= kMaxLookaheads) {
assert(false);
i = kMaxLookaheads - 1;
}
fKeys[i] = key;
fPositions[i] = position;
assert(fUsedSlotLimit == i);
fUsedSlotLimit = i + 1;
}
void reset() {
fUsedSlotLimit = 0;
}
};
private LookAheadResults fLookAheadMatches = new LookAheadResults();
/**
* The State Machine Engine for moving forward is here.
* This function is the heart of the RBBI run time engine.
@ -854,7 +815,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
}
fLookAheadMatches.reset();
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
@ -921,7 +881,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
} else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches.getPosition(accepting);
int lookaheadResult = fLookAheadMatches[accepting];
if (lookaheadResult >= 0) {
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
fPosition = lookaheadResult;
@ -944,7 +904,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// We want the beginning of it.
pos--;
}
fLookAheadMatches.setPosition(rule, pos);
fLookAheadMatches[rule] = pos;
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c
size 13142859
oid sha256:73ba0e9a3520637ead2b6a0579902b3622e6611c3b9a6b22b835576a990982bf
size 13147665

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7
oid sha256:2f0c964909d165506be2914219f625245cf9e9e26ebfad0c78843f23b9740375
size 94529

View File

@ -852,4 +852,57 @@ public class RBBITest extends TestFmwk {
}
}
}
/* Test handling of a large number of look-ahead rules.
* The number of rules in the test exceeds the implementation limits prior to the
* improvements introduced with #13590.
*
* The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
* The text being matched is sequential, "ABCDEFGHI..."
*
* The upshot is that the look-ahead rules all match on their preceding context,
* and consequently must save a potential result, but then fail to match on their
* trailing context, so that they don't actually cause a boundary.
*
* Additionally, add a ".*" rule, so there are no boundaries unless a
* look-ahead hard-break rule forces one.
*/
@Test
public void TestBug13590() {
StringBuilder rules = new StringBuilder("!!quoted_literals_only; !!chain; .*;\n");
int NUM_LOOKAHEAD_RULES = 50;
char STARTING_CHAR = '\u5000';
char firstChar = 0;
for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
firstChar = (char) (STARTING_CHAR + ruleNum*2);
rules.append('\'') .append(firstChar) .append((char)(firstChar+1)) .append('\'')
.append(' ') .append('/') .append(' ')
.append('\'') .append((char)(firstChar+2)) .append((char)(firstChar+4)) .append('\'')
.append(';') .append('\n');
}
// Change the last rule added from the form "UV / WY" to "UV / WX".
// Changes the rule so that it will match - all 4 chars are in ascending sequence.
String rulesStr = rules.toString().replace((char)(firstChar+4), (char)(firstChar+3));
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rulesStr);
// bi.dump(System.out);
StringBuilder testString = new StringBuilder();
for (char c = (char) (STARTING_CHAR-200); c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
testString.append(c);
}
bi.setText(testString);
int breaksFound = 0;
while (bi.next() != BreakIterator.DONE) {
++breaksFound;
}
// Two matches are expected, one from the last rule that was explicitly modified,
// and one at the end of the text.
assertEquals("Wrong number of breaks found", 2, breaksFound);
}
}