ICU-13565 RBBI, make all state table row data be unsigned.

This commit is contained in:
Andy Heninger 2020-05-27 16:36:22 -07:00
parent 723037953b
commit f0ad454691
11 changed files with 103 additions and 96 deletions

View File

@ -714,11 +714,11 @@ static const int32_t kMaxLookaheads = 8;
struct LookAheadResults {
int32_t fUsedSlotLimit;
int32_t fPositions[8];
int16_t fKeys[8];
uint16_t fKeys[8];
LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
int32_t getPosition(int16_t key) {
int32_t getPosition(uint16_t key) {
for (int32_t i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
return fPositions[i];
@ -727,7 +727,7 @@ struct LookAheadResults {
UPRV_UNREACHABLE;
}
void setPosition(int16_t key, int32_t position) {
void setPosition(uint16_t key, int32_t position) {
int32_t i;
for (i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
@ -912,20 +912,18 @@ int32_t RuleBasedBreakIterator::handleNext() {
(tableData + tableRowLen * state);
if (row->fAccepting == -1) {
uint16_t accepting = row->fAccepting;
if (accepting == ACCEPTING_UNCONDITIONAL) {
// Match found, common case.
if (mode != RBBI_START) {
result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
}
fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
}
int16_t completedRule = row->fAccepting;
if (completedRule > 0) {
fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
} else if (accepting > ACCEPTING_UNCONDITIONAL) {
// Lookahead match is completed.
int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
int32_t lookaheadResult = lookAheadMatches.getPosition(accepting);
if (lookaheadResult >= 0) {
fRuleStatusIndex = row->fTagIdx;
fRuleStatusIndex = row->fTagsIdx;
fPosition = lookaheadResult;
return lookaheadResult;
}
@ -937,7 +935,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
// This would enable hard-break rules with no following context.
// But there are line break test failures when trying this. Investigate.
// Issue ICU-20837
int16_t rule = row->fLookAhead;
uint16_t rule = row->fLookAhead;
if (rule != 0) {
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
lookAheadMatches.setPosition(rule, pos);

View File

@ -253,12 +253,12 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
RBBIStateTableRow *row = (RBBIStateTableRow *)
(table->fTableData + (table->fRowLen * s));
if (use8Bits) {
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagIdx);
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx);
for (c=0; c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
}
} else {
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagIdx);
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx);
for (c=0; c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
}

View File

@ -95,34 +95,36 @@ struct RBBIDataHeader {
template <typename ST, typename UT>
template <typename T>
struct RBBIStateTableRowT {
ST fAccepting; /* Non-zero if this row is for an accepting state. */
/* Value 0: not an accepting state. */
/* -1: Unconditional Accepting state. */
/* positive: Look-ahead match has completed. */
/* Actual boundary position happened earlier */
/* Value here == fLookAhead in earlier */
/* state, at actual boundary pos. */
ST fLookAhead; /* Non-zero if this row is for a state that */
/* corresponds to a '/' in the rule source. */
/* Value is the same as the fAccepting */
/* value for the rule (which will appear */
/* in a different state. */
ST fTagIdx; /* Non-zero if this row covers a {tagged} position */
/* from a rule. Value is the index in the */
/* StatusTable of the set of matching */
/* tags (rule status values) */
UT fNextState[1]; /* Next State, indexed by char category. */
/* Variable-length array declared with length 1 */
/* to disable bounds checkers. */
/* Array Size is actually fData->fHeader->fCatCount*/
/* CAUTION: see RBBITableBuilder::getTableSize() */
/* before changing anything here. */
T fAccepting; // Non-zero if this row is for an accepting state.
// Value 0: not an accepting state.
// 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
// >1: Look-ahead match has completed.
// Actual boundary position happened earlier
// Value here == fLookAhead in earlier
// state, at actual boundary pos.
T fLookAhead; // Non-zero if this row is for a state that
// corresponds to a '/' in the rule source.
// Value is the same as the fAccepting
// value for the rule (which will appear
// in a different state.
T fTagsIdx; // Non-zero if this row covers a {tagged} position
// from a rule. Value is the index in the
// StatusTable of the set of matching
// tags (rule status values)
T fNextState[1]; // Next State, indexed by char category.
// Variable-length array declared with length 1
// to disable bounds checkers.
// Array Size is actually fData->fHeader->fCatCount
// CAUTION: see RBBITableBuilder::getTableSize()
// before changing anything here.
};
typedef RBBIStateTableRowT<int8_t, uint8_t> RBBIStateTableRow8;
typedef RBBIStateTableRowT<int16_t, uint16_t> RBBIStateTableRow16;
typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting
union RBBIStateTableRow {
RBBIStateTableRow16 r16;

View File

@ -714,7 +714,12 @@ void RBBITableBuilder::mapLookAheadRules() {
return;
}
fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
int32_t laSlotsInUse = 0;
// Counter used when assigning lookahead rule numbers.
// Contains the last look-ahead number already in use.
// The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
// for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.
int32_t laSlotsInUse = ACCEPTING_UNCONDITIONAL;
for (int32_t n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
@ -807,23 +812,23 @@ void RBBITableBuilder::flagAcceptingStates() {
if (sd->fPositions->indexOf(endMarker) >= 0) {
// Any non-zero value for fAccepting means this is an accepting node.
// The value is what will be returned to the user as the break status.
// If no other value was specified, force it to -1.
// If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
if (sd->fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
if (sd->fAccepting == 0) {
sd->fAccepting = -1;
sd->fAccepting = ACCEPTING_UNCONDITIONAL;
}
}
if (sd->fAccepting==-1 && endMarker->fVal != 0) {
if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead, because a look-ahead match needs to
// immediately stop the run-time engine. First match, not longest.
sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
}
// implicit else:
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
// if sd->fAccepting already had a value other than 0 or 1, leave it be.
}
}
}
@ -857,7 +862,7 @@ void RBBITableBuilder::flagLookAheadStates() {
int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
if (positionsIdx >= 0) {
U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
// if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
// printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
@ -1392,22 +1397,23 @@ void RBBITableBuilder::exportTable(void *where) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
if (use8BitsForTable()) {
U_ASSERT (-128 < sd->fAccepting && sd->fAccepting <= 127);
U_ASSERT (-128 < sd->fLookAhead && sd->fLookAhead <= 127);
U_ASSERT (-128 < sd->fTagsIdx && sd->fTagsIdx <= 127);
row->r8.fAccepting = (int8_t)sd->fAccepting;
row->r8.fLookAhead = (int8_t)sd->fLookAhead;
row->r8.fTagIdx = (int8_t)sd->fTagsIdx;
U_ASSERT (sd->fAccepting <= 255);
U_ASSERT (sd->fLookAhead <= 255);
U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
row->r8.fAccepting = sd->fAccepting;
row->r8.fLookAhead = sd->fLookAhead;
row->r8.fTagsIdx = sd->fTagsIdx;
for (col=0; col<catCount; col++) {
U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
row->r8.fNextState[col] = sd->fDtran->elementAti(col);
}
} else {
U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
row->r16.fAccepting = (int16_t)sd->fAccepting;
row->r16.fLookAhead = (int16_t)sd->fLookAhead;
row->r16.fTagIdx = (int16_t)sd->fTagsIdx;
U_ASSERT (sd->fAccepting <= 0xffff);
U_ASSERT (sd->fLookAhead <= 0xffff);
U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
row->r16.fAccepting = sd->fAccepting;
row->r16.fLookAhead = sd->fLookAhead;
row->r16.fTagsIdx = sd->fTagsIdx;
for (col=0; col<catCount; col++) {
row->r16.fNextState[col] = sd->fDtran->elementAti(col);
}
@ -1597,7 +1603,7 @@ void RBBITableBuilder::exportSafeTable(void *where) {
if (use8BitsForSafeTable()) {
row->r8.fAccepting = 0;
row->r8.fLookAhead = 0;
row->r8.fTagIdx = 0;
row->r8.fTagsIdx = 0;
for (col=0; col<catCount; col++) {
U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
row->r8.fNextState[col] = rowString->charAt(col);
@ -1605,7 +1611,7 @@ void RBBITableBuilder::exportSafeTable(void *where) {
} else {
row->r16.fAccepting = 0;
row->r16.fLookAhead = 0;
row->r16.fTagIdx = 0;
row->r16.fTagsIdx = 0;
for (col=0; col<catCount; col++) {
row->r16.fNextState[col] = rowString->charAt(col);
}

View File

@ -195,8 +195,8 @@ private:
class RBBIStateDescriptor : public UMemory {
public:
UBool fMarked;
int32_t fAccepting;
int32_t fLookAhead;
uint32_t fAccepting;
uint32_t fLookAhead;
UVector *fTagVals;
int32_t fTagsIdx;
UVector *fPositions; // Set of parse tree positions associated

View File

@ -4672,18 +4672,16 @@ void RBBITest::TestTableRedundancies() {
UnicodeString s;
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
if (in8Bits) {
assertTrue(WHERE, row->r8.fAccepting >= -1);
s.append(row->r8.fAccepting + 1); // values of -1 are expected.
s.append(row->r8.fAccepting);
s.append(row->r8.fLookAhead);
s.append(row->r8.fTagIdx);
s.append(row->r8.fTagsIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->r8.fNextState[column]);
}
} else {
assertTrue(WHERE, row->r16.fAccepting >= -1);
s.append(row->r16.fAccepting + 1); // values of -1 are expected.
s.append(row->r16.fAccepting);
s.append(row->r16.fLookAhead);
s.append(row->r16.fTagIdx);
s.append(row->r16.fTagsIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->r16.fNextState[column]);
}

View File

@ -70,11 +70,7 @@ public final class RBBIDataWrapper {
This.fTable = new char[lengthOfTable];
for (int i = 0; i < lengthOfTable; i++) {
byte b = bytes.get();
if (i % This.fRowLen < NEXTSTATES) {
This.fTable[i] = (char) b; // Treat b as signed.
} else {
This.fTable[i] = (char)(0xff & b); // Treat b as unsigned.
}
This.fTable[i] = (char)(0xff & b); // Treat b as unsigned.
}
ICUBinary.skipBytes(bytes, lengthOfTable & 1);
} else {
@ -202,12 +198,17 @@ public final class RBBIDataWrapper {
/**
* offset to the "tagIndex" field in a state table row.
*/
public final static int TAGIDX = 2;
public final static int TAGSIDX = 2;
/**
* offset to the start of the next states array in a state table row.
*/
public final static int NEXTSTATES = 3;
/**
* value constant for the ACCEPTING field of a state table row.
*/
public final static int ACCEPTING_UNCONDITIONAL = 1;
// Bit selectors for the "FLAGS" field of the state table header
// enum RBBIStateTableFlags in the C version.
//
@ -477,7 +478,7 @@ public final class RBBIDataWrapper {
} else {
dest.append(" ");
}
dest.append(intToString(table.fTable[row+TAGIDX], 5));
dest.append(intToString(table.fTable[row+TAGSIDX], 5));
for (int col=0; col<fHeader.fCatCount; col++) {
dest.append(intToString(table.fTable[row+NEXTSTATES+col], 5));

View File

@ -616,7 +616,12 @@ class RBBITableBuilder {
*/
void mapLookAheadRules() {
fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
int laSlotsInUse = 0;
// Counter used when assigning lookahead rule numbers.
// Contains the last look-ahead number already in use.
// The first look-ahead number is 2;
// Number 1 is reserved for non-lookahead accepting states.
int laSlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
for (RBBIStateDescriptor sd: fDStates) {
int laSlotForState = 0;
@ -692,27 +697,26 @@ class RBBITableBuilder {
endMarker = endMarkerNodes.get(i);
for (n=0; n<fDStates.size(); n++) {
RBBIStateDescriptor sd = fDStates.get(n);
//if (sd.fPositions.indexOf(endMarker) >= 0) {
if (sd.fPositions.contains(endMarker)) {
// Any non-zero value for fAccepting means this is an accepting node.
// The value is what will be returned to the user as the break status.
// If no other value was specified, force it to -1.
// If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
if (sd.fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
if (sd.fAccepting == 0) {
sd.fAccepting = -1;
sd.fAccepting = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
}
}
if (sd.fAccepting==-1 && endMarker.fVal != 0) {
if (sd.fAccepting==RBBIDataWrapper.ACCEPTING_UNCONDITIONAL && endMarker.fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead, because a look-ahead match needs to
// immediately stop the run-time engine. First match, not longest.
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
}
// implicit else:
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
// if sd.fAccepting already had a value other than 0 or 1, leave it be.
}
}
}
@ -1158,18 +1162,18 @@ class RBBITableBuilder {
RBBIStateDescriptor sd = fDStates.get(state);
int row = state*rowLen;
if (use8Bits) {
Assert.assrt (-128 < sd.fAccepting && sd.fAccepting <= MAX_STATE_FOR_8BITS_TABLE);
Assert.assrt (-128 < sd.fLookAhead && sd.fLookAhead <= MAX_STATE_FOR_8BITS_TABLE);
Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 255);
Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 255);
} else {
Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 0xffff);
Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 0xffff);
}
table.fTable[row + RBBIDataWrapper.ACCEPTING] = (char)sd.fAccepting;
table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (char)sd.fLookAhead;
table.fTable[row + RBBIDataWrapper.TAGIDX] = (char)sd.fTagsIdx;
table.fTable[row + RBBIDataWrapper.TAGSIDX] = (char)sd.fTagsIdx;
for (col=0; col<numCharCategories; col++) {
if (use8Bits) {
Assert.assrt (sd.fDtran[col] <= MAX_STATE_FOR_8BITS_TABLE);
Assert.assrt (0 <= sd.fDtran[col] && sd.fDtran[col] <= MAX_STATE_FOR_8BITS_TABLE);
}
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)sd.fDtran[col];
}

View File

@ -914,7 +914,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up a state transition in the state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0xffff) {
int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
// Match found, common case
result = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
@ -924,15 +925,12 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
// Remember the break status (tag) values.
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
}
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0 && completedRule != 0xffff) {
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
} else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
int lookaheadResult = fLookAheadMatches.getPosition(accepting);
if (lookaheadResult >= 0) {
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
fPosition = lookaheadResult;
return lookaheadResult;
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ed7db50765b06c8a35f48048543c5c9a2c2e19993f752bd71a15e6ac89aa3b3
size 13141781
oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799
size 13141762

View File

@ -606,7 +606,7 @@ public class RBBITest extends TestFmwk {
int row = dw.getRowIndex(r);
s.append(fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING]);
s.append(fwtbl.fTable[row + RBBIDataWrapper.LOOKAHEAD]);
s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGIDX]);
s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGSIDX]);
for (int column=0; column<numCharClasses; column++) {
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
s.append(tableVal);