ICU-13565 Reduce size of BreakIterator brk files

See #1100
This commit is contained in:
Frank Tang 2020-05-19 20:44:14 +00:00 committed by Frank Yung-Fong Tang
parent 566e0f8686
commit c5ebb80a73
24 changed files with 943 additions and 240 deletions

View File

@ -746,18 +746,68 @@ struct LookAheadResults {
};
// Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
// instantiation, based on whether an 8 or 16 bit table is required.
//
// These Trie access functions will be inlined within the handleNext()/Previous() instantiations.
// Category lookup for a trie that stores 8-bit values. The value is returned
// as uint16_t so this accessor has the same signature as TrieFunc16 and can be
// passed as the trie-function template argument.
static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
    const uint16_t category = UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
    return category;
}
// Category lookup for a trie that stores 16-bit values. Same signature as
// TrieFunc8 so either one can be passed as the trie-function template argument.
static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
    const uint16_t category = UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
    return category;
}
// Non-template entry point: forwards to the handleNext<>() instantiation that
// matches the loaded data.
//   - Row type:     RBBIStateTableRow8 when the forward table has the
//                   RBBI_8BITS_ROWS flag set, RBBIStateTableRow16 otherwise.
//   - Trie access:  TrieFunc8 + kDictBitFor8BitsTrie when the category trie
//                   stores 8-bit values, TrieFunc16 + kDictBit otherwise.
int32_t RuleBasedBreakIterator::handleNext() {
    const bool trieIs8Bits =
        ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
    const bool rowsAre8Bits =
        (fData->fForwardTable->fFlags & RBBI_8BITS_ROWS) != 0;

    if (rowsAre8Bits) {
        return trieIs8Bits
            ? handleNext<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>()
            : handleNext<RBBIStateTableRow8, TrieFunc16, kDictBit>();
    }
    return trieIs8Bits
        ? handleNext<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>()
        : handleNext<RBBIStateTableRow16, TrieFunc16, kDictBit>();
}
// Non-template entry point: forwards fromPosition to the handleSafePrevious<>()
// instantiation that matches the loaded data.
//   - Row type:     RBBIStateTableRow8 when the reverse table has the
//                   RBBI_8BITS_ROWS flag set, RBBIStateTableRow16 otherwise.
//   - Trie access:  TrieFunc8 + kDictBitFor8BitsTrie when the category trie
//                   stores 8-bit values, TrieFunc16 + kDictBit otherwise.
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
    const bool trieIs8Bits =
        ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
    const bool rowsAre8Bits =
        (fData->fReverseTable->fFlags & RBBI_8BITS_ROWS) != 0;

    if (rowsAre8Bits) {
        return trieIs8Bits
            ? handleSafePrevious<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition)
            : handleSafePrevious<RBBIStateTableRow8, TrieFunc16, kDictBit>(fromPosition);
    }
    return trieIs8Bits
        ? handleSafePrevious<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition)
        : handleSafePrevious<RBBIStateTableRow16, TrieFunc16, kDictBit>(fromPosition);
}
//-----------------------------------------------------------------------------------
//
// handleNext()
// Run the state machine to find a boundary
//
//-----------------------------------------------------------------------------------
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
int32_t RuleBasedBreakIterator::handleNext() {
int32_t state;
uint16_t category = 0;
RBBIRunMode mode;
RBBIStateTableRow *row;
RowType *row;
UChar32 c;
LookAheadResults lookAheadMatches;
int32_t result = 0;
@ -789,7 +839,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
// Set the initial state for the state machine
state = START_STATE;
row = (RBBIStateTableRow *)
row = (RowType *)
//(statetable->fTableData + (statetable->fRowLen * state));
(tableData + tableRowLen * state);
@ -825,20 +875,17 @@ int32_t RuleBasedBreakIterator::handleNext() {
if (mode == RBBI_RUN) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
category = UTRIE2_GET16(fData->fTrie, c);
category = trieFunc(fData->fTrie, c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iteration.
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
if ((category & dictMask) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
category &= ~dictMask;
}
}
@ -860,7 +907,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
// fNextState is a variable-length array.
U_ASSERT(category<fData->fHeader->fCatCount);
state = row->fNextState[category]; /*Not accessing beyond memory*/
row = (RBBIStateTableRow *)
row = (RowType *)
// (statetable->fTableData + (statetable->fRowLen * state));
(tableData + tableRowLen * state);
@ -948,10 +995,12 @@ int32_t RuleBasedBreakIterator::handleNext() {
// because the safe table does not require as many options.
//
//-----------------------------------------------------------------------------------
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
int32_t state;
uint16_t category = 0;
RBBIStateTableRow *row;
RowType *row;
UChar32 c;
int32_t result = 0;
@ -971,7 +1020,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
// Set the initial state for the state machine
c = UTEXT_PREVIOUS32(&fText);
state = START_STATE;
row = (RBBIStateTableRow *)
row = (RowType *)
(stateTable->fTableData + (stateTable->fRowLen * state));
// loop until we reach the start of the text or transition to state 0
@ -980,12 +1029,10 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
// And off the dictionary flag bit. For reverse iteration it is not used.
category = UTRIE2_GET16(fData->fTrie, c);
category &= ~0x4000;
// Off the dictionary flag bit. For reverse iteration it is not used.
category = trieFunc(fData->fTrie, c);
category &= ~dictMask;
#ifdef RBBI_DEBUG
if (gTrace) {
@ -1004,7 +1051,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
// fNextState is a variable-length array.
U_ASSERT(category<fData->fHeader->fCatCount);
state = row->fNextState[category]; /*Not accessing beyond memory*/
row = (RBBIStateTableRow *)
row = (RowType *)
(stateTable->fTableData + (stateTable->fRowLen * state));
if (state == STOP_STATE) {
@ -1024,6 +1071,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
return result;
}
//-------------------------------------------------------------------------------
//
// getRuleStatus() Return the break rule tag associated with the current

View File

@ -119,6 +119,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus) {
uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ?
kDictBitFor8BitsTrie : kDictBit;
if ((endPos - startPos) <= 1) {
return;
}
@ -142,13 +144,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
utext_setNativeIndex(text, rangeStart);
UChar32 c = utext_current32(text);
category = UTRIE2_GET16(fBI->fData->fTrie, c);
category = ucptrie_get(fBI->fData->fTrie, c);
while(U_SUCCESS(status)) {
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) {
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) {
utext_next32(text); // TODO: cleaner loop structure.
c = utext_current32(text);
category = UTRIE2_GET16(fBI->fData->fTrie, c);
category = ucptrie_get(fBI->fData->fTrie, c);
}
if (current >= rangeEnd) {
break;
@ -166,7 +168,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// Reload the loop variables for the next go-round
c = utext_current32(text);
category = UTRIE2_GET16(fBI->fData->fTrie, c);
category = ucptrie_get(fBI->fData->fTrie, c);
}
// If we found breaks, ensure that the first and last entries are

View File

@ -11,10 +11,10 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ucptrie.h"
#include "unicode/utypes.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "utrie2.h"
#include "udatamem.h"
#include "cmemory.h"
#include "cstring.h"
@ -110,17 +110,24 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
}
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
NULL, // *actual length
&status);
fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST,
UCPTRIE_VALUE_BITS_ANY,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
nullptr, // *actual length
&status);
if (U_FAILURE(status)) {
return;
}
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
fRuleString.setTo(TRUE, fRuleSource, -1);
UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie);
if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) {
status = U_INVALID_FORMAT_ERROR;
return;
}
fRuleSource = ((char *)data + fHeader->fRuleSource);
fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen));
U_ASSERT(data->fRuleSourceLen > 0);
fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
@ -142,8 +149,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
//-----------------------------------------------------------------------------
RBBIDataWrapper::~RBBIDataWrapper() {
U_ASSERT(fRefCount == 0);
utrie2_close(fTrie);
fTrie = NULL;
ucptrie_close(fTrie);
fTrie = nullptr;
if (fUDataMem) {
udata_close(fUDataMem);
} else if (!fDontFreeData) {
@ -225,6 +232,11 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
RBBIDebugPrintf(" %s\n", heading);
RBBIDebugPrintf("Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
table->fFlags,
table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
RBBIDebugPrintf("State | Acc LA TagIx");
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
@ -236,12 +248,20 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
RBBIDebugPrintf(" N U L L T A B L E\n\n");
return;
}
UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
for (s=0; s<table->fNumStates; s++) {
RBBIStateTableRow *row = (RBBIStateTableRow *)
(table->fTableData + (table->fRowLen * s));
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
for (c=0; c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("%3d ", row->fNextState[c]);
if (use8Bits) {
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagIdx);
for (c=0; c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
}
} else {
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagIdx);
for (c=0; c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
}
}
RBBIDebugPrintf("\n");
}
@ -377,35 +397,64 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
//
int32_t topSize = offsetof(RBBIStateTable, fTableData);
// Forward state table.
// Forward state table.
tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
tableLength = ds->readUInt32(rbbiDH->fFTableLen);
if (tableLength > 0) {
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
outBytes+tableStartOffset, status);
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
outBytes+tableStartOffset+topSize, status);
// Swap the state table if the table is in 16 bits.
if (use8Bits) {
if (outBytes != inBytes) {
uprv_memmove(outBytes+tableStartOffset+topSize,
inBytes+tableStartOffset+topSize,
tableLength-topSize);
}
} else {
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
outBytes+tableStartOffset+topSize, status);
}
}
// Reverse state table. Same layout as forward table, above.
tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
tableLength = ds->readUInt32(rbbiDH->fRTableLen);
if (tableLength > 0) {
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
outBytes+tableStartOffset, status);
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
outBytes+tableStartOffset+topSize, status);
// Swap the state table if the table is in 16 bits.
if (use8Bits) {
if (outBytes != inBytes) {
uprv_memmove(outBytes+tableStartOffset+topSize,
inBytes+tableStartOffset+topSize,
tableLength-topSize);
}
} else {
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
outBytes+tableStartOffset+topSize, status);
}
}
// Trie table for character categories
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
// Source Rules Text. It's UChar data
ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
// Source Rules Text. It's UTF8 data
if (outBytes != inBytes) {
uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource),
inBytes+ds->readUInt32(rbbiDH->fRuleSource),
ds->readUInt32(rbbiDH->fRuleSourceLen));
}
// Table of rule status values. It's all int32_t values
ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),

View File

@ -49,16 +49,17 @@ ubrk_swap(const UDataSwapper *ds,
#ifdef __cplusplus
#include "unicode/ucptrie.h"
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/uversion.h"
#include "umutex.h"
#include "utrie2.h"
U_NAMESPACE_BEGIN
// The current RBBI data format version.
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {5, 0, 0, 0};
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
/*
* The following structs map exactly onto the raw data from ICU common data file.
@ -94,25 +95,25 @@ struct RBBIDataHeader {
struct RBBIStateTableRow {
int16_t fAccepting; /* Non-zero if this row is for an accepting state. */
template <typename ST, typename UT>
struct RBBIStateTableRowT {
ST fAccepting; /* Non-zero if this row is for an accepting state. */
/* Value 0: not an accepting state. */
/* -1: Unconditional Accepting state. */
/* positive: Look-ahead match has completed. */
/* Actual boundary position happened earlier */
/* Value here == fLookAhead in earlier */
/* state, at actual boundary pos. */
int16_t fLookAhead; /* Non-zero if this row is for a state that */
ST fLookAhead; /* Non-zero if this row is for a state that */
/* corresponds to a '/' in the rule source. */
/* Value is the same as the fAccepting */
/* value for the rule (which will appear */
/* in a different state. */
int16_t fTagIdx; /* Non-zero if this row covers a {tagged} position */
ST fTagIdx; /* Non-zero if this row covers a {tagged} position */
/* from a rule. Value is the index in the */
/* StatusTable of the set of matching */
/* tags (rule status values) */
int16_t fReserved;
uint16_t fNextState[1]; /* Next State, indexed by char category. */
UT fNextState[1]; /* Next State, indexed by char category. */
/* Variable-length array declared with length 1 */
/* to disable bounds checkers. */
/* Array Size is actually fData->fHeader->fCatCount*/
@ -120,12 +121,18 @@ struct RBBIStateTableRow {
/* before changing anything here. */
};
/* Row layout with int8_t fAccepting/fLookAhead/fTagIdx and uint8_t fNextState entries. */
typedef RBBIStateTableRowT<int8_t, uint8_t> RBBIStateTableRow8;
/* Row layout with int16_t fAccepting/fLookAhead/fTagIdx and uint16_t fNextState entries. */
typedef RBBIStateTableRowT<int16_t, uint16_t> RBBIStateTableRow16;
/* One state table row, viewable through either layout.                     */
/* Users select r8 when the owning table's fFlags has RBBI_8BITS_ROWS set,  */
/* and r16 otherwise.                                                       */
union RBBIStateTableRow {
RBBIStateTableRow16 r16;
RBBIStateTableRow8 r8;
};
struct RBBIStateTable {
uint32_t fNumStates; /* Number of states. */
uint32_t fRowLen; /* Length of a state table row, in bytes. */
uint32_t fFlags; /* Option Flags for this state table */
uint32_t fReserved; /* reserved */
char fTableData[1]; /* First RBBIStateTableRow begins here. */
/* Variable-length array declared with length 1 */
/* to disable bounds checkers. */
@ -133,10 +140,9 @@ struct RBBIStateTable {
/* arithmetic for indexing variable length rows.) */
};
typedef enum {
RBBI_LOOKAHEAD_HARD_BREAK = 1,
RBBI_BOF_REQUIRED = 2
} RBBIStateTableFlags;
/* Bit flags stored in RBBIStateTable::fFlags. */
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
constexpr uint32_t RBBI_BOF_REQUIRED = 2;
/* Set when the table's rows use the 8-bit layout (RBBIStateTableRow8) */
/* instead of the 16-bit layout (RBBIStateTableRow16).                 */
constexpr uint32_t RBBI_8BITS_ROWS = 4;
/* */
@ -170,13 +176,13 @@ public:
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
const RBBIStateTable *fReverseTable;
const UChar *fRuleSource;
const char *fRuleSource;
const int32_t *fRuleStatusTable;
/* number of int32_t values in the rule status table. Used to sanity check indexing */
int32_t fStatusMaxIdx;
UTrie2 *fTrie;
UCPTrie *fTrie;
private:
u_atomic_int32_t fRefCount;

View File

@ -22,6 +22,7 @@
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/ustring.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
@ -154,7 +155,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
int32_t rulesLengthInUTF8 = 0;
u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
fStrippedRules.getBuffer(), fStrippedRules.length(),
0xfffd, nullptr, fStatus);
*fStatus = U_ZERO_ERROR;
int32_t rulesSize = align8((rulesLengthInUTF8+1));
int32_t totalSize = headerSize
+ forwardTableSize
@ -197,11 +205,11 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fRTableLen = reverseTableSize;
data->fTrie = data->fRTable + data->fRTableLen;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fStatusTable = data->fTrie + trieSize;
data->fTrieLen = trieSize;
data->fStatusTable = data->fTrie + data->fTrieLen;
data->fStatusTableLen= statusTableSize;
data->fRuleSource = data->fStatusTable + statusTableSize;
data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
data->fRuleSourceLen = rulesLengthInUTF8;
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
@ -214,7 +222,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
}
fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
fStrippedRules.getBuffer(), fStrippedRules.length(),
0xfffd, nullptr, fStatus);
if (U_FAILURE(*fStatus)) {
return NULL;
}
return data;
}

View File

@ -829,16 +829,14 @@ static const UChar chRParen = 0x29;
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
UnicodeString strippedRules;
int32_t rulesLength = rules.length();
bool skippingSpaces = false;
for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
UChar32 cp = rules.char32At(idx);
bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
if (skippingSpaces && whiteSpace) {
if (whiteSpace) {
continue;
}
strippedRules.append(cp);
skippingSpaces = whiteSpace;
}
return strippedRules;
}

View File

@ -35,7 +35,6 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uniset.h"
#include "utrie2.h"
#include "uvector.h"
#include "uassert.h"
#include "cmemory.h"
@ -46,6 +45,7 @@
U_NAMESPACE_BEGIN
// Largest number of character categories for which an 8-bit-value trie is built;
// above this the trie is built with 16-bit values.
// NOTE(review): presumably 127 so that bit 0x80 (kDictBitFor8BitsTrie) stays free
// for the dictionary flag - confirm.
const int32_t kMaxCharCategoriesFor8BitsTrie = 127;
//------------------------------------------------------------------------
//
// Constructor
@ -56,7 +56,8 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
fRB = rb;
fStatus = rb->fStatus;
fRangeList = 0;
fTrie = 0;
fMutableTrie = nullptr;
fTrie = nullptr;
fTrieSize = 0;
fGroupCount = 0;
fSawBOF = FALSE;
@ -79,7 +80,8 @@ RBBISetBuilder::~RBBISetBuilder()
delete r;
}
utrie2_close(fTrie);
ucptrie_close(fTrie);
umutablecptrie_close(fMutableTrie);
}
@ -255,17 +257,23 @@ void RBBISetBuilder::buildRanges() {
void RBBISetBuilder::buildTrie() {
RangeDescriptor *rlRange;
fTrie = utrie2_open(0, // Initial value for all code points.
fMutableTrie = umutablecptrie_open(
0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
utrie2_setRange32(fTrie,
rlRange->fStartChar, // Range start
rlRange->fEndChar, // Range end (inclusive)
rlRange->fNum, // value for range
TRUE, // Overwrite previously written values
fStatus);
uint32_t value = rlRange->fNum;
if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) {
U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0);
value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit);
}
umutablecptrie_setRange(fMutableTrie,
rlRange->fStartChar, // Range start
rlRange->fEndChar, // Range end (inclusive)
value, // value for range
fStatus);
}
}
@ -274,8 +282,8 @@ void RBBISetBuilder::mergeCategories(IntPair categories) {
U_ASSERT(categories.first >= 1);
U_ASSERT(categories.second > categories.first);
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
int32_t rangeNum = rd->fNum & ~DICT_BIT;
int32_t rangeDict = rd->fNum & DICT_BIT;
int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit;
int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit;
if (rangeNum == categories.second) {
rd->fNum = categories.first | rangeDict;
} else if (rangeNum > categories.second) {
@ -295,15 +303,18 @@ int32_t RBBISetBuilder::getTrieSize() {
if (U_FAILURE(*fStatus)) {
return 0;
}
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
fTrieSize = utrie2_serialize(fTrie,
NULL, // Buffer
0, // Capacity
fStatus);
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
*fStatus = U_ZERO_ERROR;
if (fTrie == nullptr) {
bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
fTrie = umutablecptrie_buildImmutable(
fMutableTrie,
UCPTRIE_TYPE_FAST,
use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16,
fStatus);
fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus);
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
*fStatus = U_ZERO_ERROR;
}
}
// RBBIDebugPrintf("Trie table size is %d\n", trieSize);
return fTrieSize;
}
@ -316,9 +327,9 @@ int32_t RBBISetBuilder::getTrieSize() {
//
//-----------------------------------------------------------------------------------
void RBBISetBuilder::serializeTrie(uint8_t *where) {
utrie2_serialize(fTrie,
where, // Buffer
fTrieSize, // Capacity
ucptrie_toBinary(fTrie,
where, // Buffer
fTrieSize, // Capacity
fStatus);
}
@ -467,7 +478,7 @@ void RBBISetBuilder::printRangeGroups() {
lastPrintedGroupNum = groupNum;
RBBIDebugPrintf("%2i ", groupNum);
if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" <DICT> ");}
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
@ -669,7 +680,7 @@ void RangeDescriptor::setDictionaryFlag() {
if (varRef && varRef->fType == RBBINode::varRef) {
const UnicodeString *setName = &varRef->fText;
if (setName->compare(dictionary, -1) == 0) {
fNum |= RBBISetBuilder::DICT_BIT;
fNum |= RuleBasedBreakIterator::kDictBit;
break;
}
}

View File

@ -16,9 +16,10 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ucptrie.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uobject.h"
#include "rbbirb.h"
#include "utrie2.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
@ -101,8 +102,6 @@ public:
*/
void mergeCategories(IntPair categories);
static constexpr int32_t DICT_BIT = 0x4000;
#ifdef RBBI_DEBUG
void printSets();
void printRanges();
@ -121,8 +120,9 @@ private:
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing
UCPTrie *fTrie; // the Unicode Sets.
uint32_t fTrieSize;
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.

View File

@ -28,6 +28,8 @@
U_NAMESPACE_BEGIN
// Largest number of states for which use8BitsForTable() / use8BitsForSafeTable()
// select the 8-bit row layout, whose next-state entries are uint8_t.
const int32_t kMaxStateFor8BitsTable = 255;
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
fRB(rb),
fTree(*rootNode),
@ -1335,11 +1337,18 @@ int32_t RBBITableBuilder::getTableSize() const {
numRows = fDStates->size();
numCols = fRB->fSetBuilder->getNumCharCategories();
rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
if (use8BitsForTable()) {
rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
} else {
rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
}
size += numRows * rowSize;
return size;
}
bool RBBITableBuilder::use8BitsForTable() const {
return fDStates->size() <= kMaxStateFor8BitsTable;
}
//-----------------------------------------------------------------------------
//
@ -1364,27 +1373,44 @@ void RBBITableBuilder::exportTable(void *where) {
return;
}
table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
table->fNumStates = fDStates->size();
table->fFlags = 0;
if (use8BitsForTable()) {
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
table->fFlags |= RBBI_8BITS_ROWS;
} else {
table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
}
if (fRB->fLookAheadHardBreak) {
table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
}
if (fRB->fSetBuilder->sawBOF()) {
table->fFlags |= RBBI_BOF_REQUIRED;
}
table->fReserved = 0;
for (state=0; state<table->fNumStates; state++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
row->fAccepting = (int16_t)sd->fAccepting;
row->fLookAhead = (int16_t)sd->fLookAhead;
row->fTagIdx = (int16_t)sd->fTagsIdx;
for (col=0; col<catCount; col++) {
row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
if (use8BitsForTable()) {
U_ASSERT (-128 < sd->fAccepting && sd->fAccepting <= 127);
U_ASSERT (-128 < sd->fLookAhead && sd->fLookAhead <= 127);
U_ASSERT (-128 < sd->fTagsIdx && sd->fTagsIdx <= 127);
row->r8.fAccepting = (int8_t)sd->fAccepting;
row->r8.fLookAhead = (int8_t)sd->fLookAhead;
row->r8.fTagIdx = (int8_t)sd->fTagsIdx;
for (col=0; col<catCount; col++) {
U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
row->r8.fNextState[col] = sd->fDtran->elementAti(col);
}
} else {
U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
row->r16.fAccepting = (int16_t)sd->fAccepting;
row->r16.fLookAhead = (int16_t)sd->fLookAhead;
row->r16.fTagIdx = (int16_t)sd->fTagsIdx;
for (col=0; col<catCount; col++) {
row->r16.fNextState[col] = sd->fDtran->elementAti(col);
}
}
}
}
@ -1520,11 +1546,18 @@ int32_t RBBITableBuilder::getSafeTableSize() const {
numRows = fSafeTable->size();
numCols = fRB->fSetBuilder->getNumCharCategories();
rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
if (use8BitsForSafeTable()) {
rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
} else {
rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
}
size += numRows * rowSize;
return size;
}
// Returns true when the safe reverse table is to be written with 8-bit rows,
// i.e. when its number of rows does not exceed kMaxStateFor8BitsTable.
bool RBBITableBuilder::use8BitsForSafeTable() const {
    const auto numStates = fSafeTable->size();
    return numStates <= kMaxStateFor8BitsTable;
}
//-----------------------------------------------------------------------------
//
@ -1549,20 +1582,33 @@ void RBBITableBuilder::exportSafeTable(void *where) {
return;
}
table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
table->fNumStates = fSafeTable->size();
table->fFlags = 0;
table->fReserved = 0;
if (use8BitsForSafeTable()) {
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
table->fFlags |= RBBI_8BITS_ROWS;
} else {
table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
}
for (state=0; state<table->fNumStates; state++) {
UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
row->fAccepting = 0;
row->fLookAhead = 0;
row->fTagIdx = 0;
row->fReserved = 0;
for (col=0; col<catCount; col++) {
row->fNextState[col] = rowString->charAt(col);
if (use8BitsForSafeTable()) {
row->r8.fAccepting = 0;
row->r8.fLookAhead = 0;
row->r8.fTagIdx = 0;
for (col=0; col<catCount; col++) {
U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
row->r8.fNextState[col] = rowString->charAt(col);
}
} else {
row->r16.fAccepting = 0;
row->r16.fLookAhead = 0;
row->r16.fTagIdx = 0;
for (col=0; col<catCount; col++) {
row->r16.fNextState[col] = rowString->charAt(col);
}
}
}
}

View File

@ -53,6 +53,9 @@ public:
*/
void exportTable(void *where);
/** Use 8 bits to encode the forward table */
bool use8BitsForTable() const;
/**
* Find duplicate (redundant) character classes. Begin looking with categories.first.
* Duplicate, if found are returned in the categories parameter.
@ -85,6 +88,8 @@ public:
*/
void exportSafeTable(void *where);
/** Use 8 bits to encode the safe reverse table */
bool use8BitsForSafeTable() const;
private:
void calcNullable(RBBINode *n);

View File

@ -32,6 +32,8 @@
#include "unicode/parseerr.h"
#include "unicode/schriter.h"
struct UCPTrie;
U_NAMESPACE_BEGIN
/** @internal */
@ -659,6 +661,28 @@ private:
*/
int32_t handleNext();
/*
* Templatized version of handleNext() and handleSafePrevious().
*
* There will be exactly four instantiations, two each for 8 and 16 bit tables,
* two each for 8 and 16 bit trie.
* Having separate instantiations for the table types keeps conditional tests of
* the table type out of the inner loops, at the expense of replicated code.
*
* The template parameter for the Trie access function is a value, not a type.
* Doing it this way, the compiler will inline the Trie function in the
* expanded functions. (Both the 8 and 16 bit access functions have the same type
* signature)
*/
typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
int32_t handleSafePrevious(int32_t fromPosition);
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
int32_t handleNext();
/**
* This function returns the appropriate LanguageBreakEngine for a
@ -682,6 +706,16 @@ private:
*/
void dumpTables();
/**
* Bit for dictionary based category
*/
static constexpr int32_t kDictBit = 0x4000;
/**
* Bit for dictionary based category in 8bits trie
*/
static constexpr int32_t kDictBitFor8BitsTrie = 0x0080;
#endif /* U_HIDE_INTERNAL_API */
};

View File

@ -1030,7 +1030,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
parseError.offset = 0;
LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status));
uint32_t length;
const UChar *builtSource;
const char *builtSource;
const uint8_t *rbbiRules;
const uint8_t *builtRules;
@ -1040,7 +1040,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
}
builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
builtSource = (const char *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
if (U_FAILURE(status)) {
errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",

View File

@ -128,6 +128,11 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestReverse);
TESTCASE_AUTO(TestBug13692);
TESTCASE_AUTO(TestDebugRules);
TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
TESTCASE_AUTO(TestTable_8_16_Bits);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -4621,7 +4626,7 @@ void RBBITest::TestBug12677() {
RuleBasedBreakIterator bi(rules, pe, status);
assertSuccess(WHERE, status);
UnicodeString rtRules = bi.getRules();
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
}
@ -4635,6 +4640,7 @@ void RBBITest::TestTableRedundancies() {
RBBIDataWrapper *dw = bi->fData;
const RBBIStateTable *fwtbl = dw->fForwardTable;
UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
int32_t numCharClasses = dw->fHeader->fCatCount;
// printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
@ -4645,7 +4651,7 @@ void RBBITest::TestTableRedundancies() {
UnicodeString s;
for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
s.append(row->fNextState[column]);
s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
}
columns.push_back(s);
}
@ -4665,12 +4671,22 @@ void RBBITest::TestTableRedundancies() {
for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
UnicodeString s;
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
assertTrue(WHERE, row->fAccepting >= -1);
s.append(row->fAccepting + 1); // values of -1 are expected.
s.append(row->fLookAhead);
s.append(row->fTagIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->fNextState[column]);
if (in8Bits) {
assertTrue(WHERE, row->r8.fAccepting >= -1);
s.append(row->r8.fAccepting + 1); // values of -1 are expected.
s.append(row->r8.fLookAhead);
s.append(row->r8.fTagIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->r8.fNextState[column]);
}
} else {
assertTrue(WHERE, row->r16.fAccepting >= -1);
s.append(row->r16.fAccepting + 1); // values of -1 are expected.
s.append(row->r16.fLookAhead);
s.append(row->r16.fTagIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->r16.fNextState[column]);
}
}
rows.push_back(s);
}
@ -4743,12 +4759,14 @@ void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
RBBIDataWrapper *data = bi->fData;
int32_t categoryCount = data->fHeader->fCatCount;
UTrie2 *trie = data->fTrie;
UCPTrie *trie = data->fTrie;
bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
std::vector<UnicodeString> strings(categoryCount, UnicodeString());
for (int cp=0; cp<0x1fff0; ++cp) {
int cat = utrie2_get32(trie, cp);
cat &= ~0x4000; // And off the dictionary bit from the category.
int cat = ucptrie_get(trie, cp);
cat &= ~dictBit; // And off the dictionary bit from the category.
assertTrue(WHERE, cat < categoryCount && cat >= 0);
if (cat < 0 || cat >= categoryCount) return;
strings[cat].append(cp);
@ -4886,6 +4904,182 @@ void RBBITest::TestDebugRules() {
#endif
}
// Shared driver for the Test*BitsTrieWith*BitStateTable tests.
//
// Compiles a rule set made of numChar two-character literals (each of the first numChar
// characters starting at U+4E00, doubled) followed by a catch-all '.', then checks that:
//   - the builder produced numChar + 4 character categories and numChar + 3 forward states,
//   - the trie value width and the forward/reverse state table row width match the
//     expected 8 or 16 bit formats,
//   - forward and backward iteration over the test text yields the expected boundaries.
//
// @param numChar                   number of doubled-character literals placed in the rules.
// @param expectedTrieWidthIn8Bits  true if the trie is expected to hold 8 bit values.
// @param expectedStateRowIn8Bits   true if the state tables are expected to use 8 bit rows.
void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
    UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
    int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;

    // The text is every character from U+4E00 to U+4FFF, each one duplicated.
    UnicodeString text;
    for (UChar c = 0x4e00; c < 0x5000; c++) {
        text.append(c).append(c);
    }

    // Generate rules which will cause numChar+4 character classes and
    // numChar+3 states.
    UnicodeString rules(u"!!quoted_literals_only;");
    for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
        rules.append(u'\'').append(c).append(c).append(u"';");
    }
    rules.append(u".;");

    UErrorCode status = U_ZERO_ERROR;
    UParseError parseError;
    RuleBasedBreakIterator bi(rules, parseError, status);
    // Do not inspect the iterator's internal data, or iterate with it, if the
    // rules failed to compile; report the failure and stop instead.
    if (!assertSuccess(WHERE, status)) {
        return;
    }

    assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
    assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
    assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
    assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
    assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);

    bi.setText(text);

    int32_t pos;
    int32_t i = 0;
    // Forward iteration. i counts the boundaries returned so far.
    while ((pos = bi.next()) > 0) {
        // The first numChar should not break between the pair
        if (i++ < numChar) {
            assertEquals(WHERE, i * 2, pos);
        } else {
            // After the first numChar next(), break on each character.
            assertEquals(WHERE, i + numChar, pos);
        }
    }
    // Backward iteration, unwinding the same counter so that the same
    // position formulas apply in reverse order.
    while ((pos = bi.previous()) > 0) {
        // The first numChar should not break between the pair
        if (--i < numChar) {
            assertEquals(WHERE, i * 2, pos);
        } else {
            // After the first numChar next(), break on each character.
            assertEquals(WHERE, i + numChar, pos);
        }
    }
}
void RBBITest::Test8BitsTrieWith8BitStateTable() {
testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
}
void RBBITest::Test16BitsTrieWith8BitStateTable() {
testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
}
void RBBITest::Test16BitsTrieWith16BitStateTable() {
testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
}
void RBBITest::Test8BitsTrieWith16BitStateTable() {
// Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
// create state table in 16 bits.
// Generate 510 'a' as text
UnicodeString text;
for (int32_t i = 0; i < 510; i++) {
text.append(u'a');
}
UnicodeString rules(u"!!quoted_literals_only;'");
// 254 'a' in the rule will cause 256 states
for (int32_t i = 0; i < 254; i++) {
rules.append(u'a');
}
rules.append(u"';.;");
UErrorCode status = U_ZERO_ERROR;
UParseError parseError;
LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
assertEquals(WHERE,
false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
bi->setText(text);
// break positions:
// 254, 508, 509, ... 510
assertEquals("next()", 254, bi->next());
int32_t i = 0;
int32_t pos;
while ((pos = bi->next()) > 0) {
assertEquals(WHERE, 508 + i , pos);
i++;
}
i = 0;
while ((pos = bi->previous()) > 0) {
i++;
if (pos >= 508) {
assertEquals(WHERE, 510 - i , pos);
} else {
assertEquals(WHERE, 254 , pos);
}
}
}
// Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
// that there are no problems with rules at the size that transitions between the two.
//
// A rule that matches a literal string, like 'abcdefghij', will require one state and
// one character class per character in the string. So we can make a rule to tickle the
// boundaries by using literal strings of various lengths.
//
// For both the number of states and the number of character classes, the eight bit format
// only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 130,
// which allows some margin for changes to the number of values reserved by the rule builder
// without breaking the test.
void RBBITest::TestTable_8_16_Bits() {
// testStr serves as both the source of the rule string (truncated to the desired length)
// and as test data to check matching behavior. A break rule consisting of the first 120
// characters of testStr will match the first 120 chars of the full-length testStr.
UnicodeString testStr;
for (UChar c=0x3000; c<0x3200; ++c) {
testStr.append(c);
}
const int32_t startLength = 120; // The shortest rule string to test.
const int32_t endLength = 260; // The longest rule string to test
const int32_t increment = this->quick ? endLength - startLength : 1;
for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
UParseError parseError;
UErrorCode status = U_ZERO_ERROR;
UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
RuleBasedBreakIterator bi(ruleString, parseError, status);
if (!assertSuccess(WHERE, status)) {
errln(ruleString);
break;
}
// bi.dumpTables();
// Verify that the break iterator is functioning - that the first boundary found
// in testStr is at the length of the rule string.
bi.setText(testStr);
assertEquals(WHERE, ruleLen, bi.next());
// Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
// of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
bi.setText(testStr);
int32_t result = bi.preceding(ruleLen);
assertEquals(WHERE, 0, result);
// Verify that the range of rule lengths being tested cover the transations
// from 8 to 16 bit data.
bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
if (ruleLen == startLength) {
assertEquals(WHERE, true, has8BitRowData);
assertEquals(WHERE, true, has8BitsTrie);
}
if (ruleLen == endLength) {
assertEquals(WHERE, false, has8BitRowData);
assertEquals(WHERE, false, has8BitsTrie);
}
}
}
#if U_ENABLE_TRACING
static std::vector<std::string> gData;
static std::vector<int32_t> gEntryFn;

View File

@ -86,6 +86,11 @@ public:
void TestDebug();
void TestProperties();
void Test8BitsTrieWith8BitStateTable();
void Test8BitsTrieWith16BitStateTable();
void Test16BitsTrieWith8BitStateTable();
void Test16BitsTrieWith16BitStateTable();
void TestTable_8_16_Bits();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
@ -133,6 +138,9 @@ private:
// Test parameters, from the test framework and test invocation.
const char* fTestParams;
// Helper functions to test different trie bit sizes and state table bit sizes.
void testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits);
#if U_ENABLE_TRACING
void assertTestTraceResult(int32_t fnNumber, const char* expectedData);
#endif

View File

@ -12,10 +12,12 @@ package com.ibm.icu.impl;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.CodePointTrie;
/**
* <p>Internal class used for Rule Based Break Iterators.</p>
@ -43,14 +45,10 @@ public final class RBBIDataWrapper {
* Option Flags for this state table.
*/
public int fFlags;
/**
* Option Flags for this state table.
*/
public int fReserved;
/**
* Linear array of next state values, accessed as short[state, char_class]
*/
public short[] fTable;
public char[] fTable;
public RBBIStateTable() {
}
@ -59,16 +57,29 @@ public final class RBBIDataWrapper {
if (length == 0) {
return null;
}
if (length < 16) {
if (length < 12) {
throw new IOException("Invalid RBBI state table length.");
}
RBBIStateTable This = new RBBIStateTable();
This.fNumStates = bytes.getInt();
This.fRowLen = bytes.getInt();
This.fFlags = bytes.getInt();
This.fReserved = bytes.getInt();
int lengthOfShorts = length - 16; // length in bytes.
This.fTable = ICUBinary.getShorts(bytes, lengthOfShorts / 2, lengthOfShorts & 1);
int lengthOfTable = length - 12; // length in bytes.
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
if (use8Bits) {
This.fTable = new char[lengthOfTable];
for (int i = 0; i < lengthOfTable; i++) {
byte b = bytes.get();
if (i % This.fRowLen < NEXTSTATES) {
This.fTable[i] = (char) b; // Treat b as signed.
} else {
This.fTable[i] = (char)(0xff & b); // Treat b as unsigned.
}
}
ICUBinary.skipBytes(bytes, lengthOfTable & 1);
} else {
This.fTable = ICUBinary.getChars(bytes, lengthOfTable / 2, lengthOfTable & 1);
}
return This;
}
@ -76,13 +87,20 @@ public final class RBBIDataWrapper {
bytes.writeInt(fNumStates);
bytes.writeInt(fRowLen);
bytes.writeInt(fFlags);
bytes.writeInt(fReserved);
int tableLen = fRowLen * fNumStates / 2; // fRowLen is bytes.
for (int i = 0; i < tableLen; i++) {
bytes.writeShort(fTable[i]);
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
for (int i = 0; i < tableLen; i++) {
byte b = (byte)(fTable[i] & 0x00ff);
bytes.writeByte(b);
}
} else {
int tableLen = fRowLen * fNumStates / 2; // fRowLen is bytes.
for (int i = 0; i < tableLen; i++) {
bytes.writeChar(fTable[i]);
}
}
int bytesWritten = 16 + fRowLen * fNumStates; // total bytes written,
// including 16 for the header.
int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written,
// including 12 for the header.
while (bytesWritten % 8 != 0) {
bytes.writeByte(0);
++bytesWritten;
@ -105,7 +123,6 @@ public final class RBBIDataWrapper {
if (fNumStates != otherST.fNumStates) return false;
if (fRowLen != otherST.fRowLen) return false;
if (fFlags != otherST.fFlags) return false;
if (fReserved != otherST.fReserved) return false;
return Arrays.equals(fTable, otherST.fTable);
}
}
@ -134,12 +151,12 @@ public final class RBBIDataWrapper {
public RBBIStateTable fRTable;
public Trie2 fTrie;
public CodePointTrie fTrie;
public String fRuleSource;
public int fStatusTable[];
public static final int DATA_FORMAT = 0x42726b20; // "Brk "
public static final int FORMAT_VERSION = 0x05000000; // 4.0.0.0
public static final int FORMAT_VERSION = 0x06000000; // 6.0.0.0
private static final class IsAcceptable implements Authenticate {
@Override
@ -186,20 +203,20 @@ public final class RBBIDataWrapper {
* offset to the "tagIndex" field in a state table row.
*/
public final static int TAGIDX = 2;
/**
* offset to the reserved field in a state table row.
*/
public final static int RESERVED = 3;
/**
* offset to the start of the next states array in a state table row.
*/
public final static int NEXTSTATES = 4;
public final static int NEXTSTATES = 3;
// Bit selectors for the "FLAGS" field of the state table header
// enum RBBIStateTableFlags in the C version.
//
public final static int RBBI_LOOKAHEAD_HARD_BREAK = 1;
public final static int RBBI_BOF_REQUIRED = 2;
public final static int RBBI_8BITS_ROWS = 4;
public final static int DICT_BIT = 0x4000;
public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
/**
* Data Header. A struct-like class with the fields from the RBBI data file header.
@ -243,7 +260,7 @@ public final class RBBIDataWrapper {
* array index of the start of the state table row for that state.
*/
public int getRowIndex(int state){
return state * (fHeader.fCatCount + 4);
return state * (fHeader.fCatCount + NEXTSTATES);
}
RBBIDataWrapper() {
@ -330,7 +347,10 @@ public final class RBBIDataWrapper {
// as we don't go more than 100 bytes
// past the end of the TRIE.
This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer
This.fTrie = CodePointTrie.fromBinary(
CodePointTrie.Type.FAST,
null,
bytes); // Deserialize the TRIE, leaving buffer
// at an unknown position, preceding the
// padding between TRIE and following section.
@ -359,8 +379,8 @@ public final class RBBIDataWrapper {
}
ICUBinary.skipBytes(bytes, This.fHeader.fRuleSource - pos);
pos = This.fHeader.fRuleSource;
This.fRuleSource = ICUBinary.getString(
bytes, This.fHeader.fRuleSourceLen / 2, This.fHeader.fRuleSourceLen & 1);
This.fRuleSource = new String(
ICUBinary.getBytes(bytes, This.fHeader.fRuleSourceLen, 0), StandardCharsets.UTF_8);
if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
This.dump(System.out);
@ -396,6 +416,15 @@ public final class RBBIDataWrapper {
return dest.toString();
}
/** Fixed width char-to-string conversion: the char, left-padded with spaces to the given width. */
static public String charToString(char n, int width) {
    StringBuilder padded = new StringBuilder(width);
    // One column is taken by the character itself; the rest is leading padding.
    for (int remaining = width - 1; remaining > 0; remaining--) {
        padded.append(' ');
    }
    padded.append(n);
    return padded.toString();
}
/** Fixed width int-to-string conversion. */
static public String intToHexString(int n, int width) {
StringBuilder dest = new StringBuilder(width);
@ -408,11 +437,11 @@ public final class RBBIDataWrapper {
/** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
private void dumpTable(java.io.PrintStream out, RBBIStateTable table) {
if (table == null || table.fTable.length == 0) {
if (table == null || (table.fTable.length == 0)) {
out.println(" -- null -- ");
} else {
int n;
int state;
char n;
char state;
StringBuilder header = new StringBuilder(" Row Acc Look Tag");
for (n=0; n<fHeader.fCatCount; n++) {
header.append(intToString(n, 5));
@ -434,18 +463,18 @@ public final class RBBIDataWrapper {
* @param table
* @param state
*/
private void dumpRow(java.io.PrintStream out, RBBIStateTable table, int state) {
private void dumpRow(java.io.PrintStream out, RBBIStateTable table, char state) {
StringBuilder dest = new StringBuilder(fHeader.fCatCount*5 + 20);
dest.append(intToString(state, 4));
int row = getRowIndex(state);
if (table.fTable[row+ACCEPTING] != 0) {
dest.append(intToString(table.fTable[row+ACCEPTING], 5));
}else {
dest.append(intToString(table.fTable[row+ACCEPTING], 5));
} else {
dest.append(" ");
}
if (table.fTable[row+LOOKAHEAD] != 0) {
dest.append(intToString(table.fTable[row+LOOKAHEAD], 5));
}else {
} else {
dest.append(" ");
}
dest.append(intToString(table.fTable[row+TAGIDX], 5));
@ -466,6 +495,7 @@ public final class RBBIDataWrapper {
int char32;
int category;
int lastNewline[] = new int[n+1];
int dictMask = fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT;
for (category = 0; category <= fHeader.fCatCount; category ++) {
catStrings[category] = "";
@ -474,7 +504,7 @@ public final class RBBIDataWrapper {
out.println("--------------------");
for (char32 = 0; char32<=0x10ffff; char32++) {
category = fTrie.get(char32);
category &= ~0x4000; // Mask off dictionary bit.
category &= ~dictMask; // Mask off dictionary bit.
if (category < 0 || category > fHeader.fCatCount) {
out.println("Error, bad category " + Integer.toHexString(category) +
" for char " + Integer.toHexString(char32));

View File

@ -11,6 +11,7 @@ package com.ibm.icu.text;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -182,7 +183,9 @@ class RBBIRuleBuilder {
int reverseTableSize = align8(fForwardTable.getSafeTableSize());
int trieSize = align8(fSetBuilder.getTrieSize());
int statusTableSize = align8(fRuleStatusVals.size() * 4);
int rulesSize = align8((strippedRules.length()) * 2);
byte[] strippedRulesUTF8 = strippedRules.getBytes(StandardCharsets.UTF_8);
int rulesSize = align8(strippedRulesUTF8.length + 1);
int totalSize = headerSize
+ forwardTableSize
@ -202,7 +205,7 @@ class RBBIRuleBuilder {
header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories();
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
@ -214,11 +217,11 @@ class RBBIRuleBuilder {
+ header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
+ header[RBBIDataWrapper.DH_TRIELEN];
+ trieSize;
header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE]
+ statusTableSize;
header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2;
header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRulesUTF8.length;
for (i = 0; i < header.length; i++) {
dos.writeInt(header[i]);
outputPos += 4;
@ -257,8 +260,9 @@ class RBBIRuleBuilder {
// Write out the stripped rules (rules with extra spaces removed
// These go last in the data area, even though they are not last in the header.
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
dos.writeChars(strippedRules);
outputPos += strippedRules.length() * 2;
dos.write(strippedRulesUTF8, 0, strippedRulesUTF8.length);
dos.write(0); // Null termination
outputPos += strippedRulesUTF8.length + 1;
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
dos.write(0);
outputPos += 1;

View File

@ -697,16 +697,14 @@ class RBBIRuleScanner {
static String stripRules(String rules) {
StringBuilder strippedRules = new StringBuilder();
int rulesLength = rules.length();
boolean skippingSpaces = false;
for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
int cp = rules.codePointAt(idx);
boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
if (skippingSpaces && whiteSpace) {
if (whiteSpace) {
continue;
}
strippedRules.appendCodePoint(cp);
skippingSpaces = whiteSpace;
}
return strippedRules.toString();
}

View File

@ -8,15 +8,16 @@
*/
package com.ibm.icu.text;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.Trie2Writable;
import com.ibm.icu.impl.Trie2_16;
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.MutableCodePointTrie;
//
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
@ -125,9 +126,9 @@ class RBBISetBuilder {
RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
Trie2Writable fTrie; // The mapping TRIE that is the end result of processing
MutableCodePointTrie fTrie; // The mapping TRIE that is the end result of processing
// the Unicode Sets.
Trie2_16 fFrozenTrie;
CodePointTrie fFrozenTrie;
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
@ -140,6 +141,7 @@ class RBBISetBuilder {
boolean fSawBOF;
static final int DICT_BIT = 0x4000;
static final int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
//------------------------------------------------------------------------
@ -286,22 +288,30 @@ class RBBISetBuilder {
}
private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127;
/**
* Build the Trie table for mapping UChar32 values to the corresponding
* range group number.
*/
void buildTrie() {
boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
RangeDescriptor rlRange;
fTrie = new Trie2Writable(0, // Initial value for all code points.
0); // Error value for out-of-range input.
fTrie = new MutableCodePointTrie(0, // Initial value for all code points.
0); // Error value for out-of-range input.
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
int value = rlRange.fNum;
if (use8Bits && ((value & DICT_BIT) != 0)) {
assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0);
// switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE
value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT);
}
fTrie.setRange(
rlRange.fStartChar, // Range start
rlRange.fEndChar, // Range end (inclusive)
rlRange.fNum, // value for range
true // Overwrite previously written values
value // value for range
);
}
}
@ -326,17 +336,31 @@ class RBBISetBuilder {
--fGroupCount;
}
//-----------------------------------------------------------------------------------
//
//   freezeTrieIfNotYet()   Build the immutable trie from the mutable one, once.
//                          Called by both getTrieSize() and serializeTrie().
//                          The value width is 8 bits when the number of char
//                          categories is within the 8 bit limit, otherwise 16 bits.
//
//-----------------------------------------------------------------------------------
void freezeTrieIfNotYet() {
    if (fFrozenTrie != null) {
        return;     // Already frozen, nothing to do.
    }
    CodePointTrie.ValueWidth valueWidth =
            getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE
                    ? CodePointTrie.ValueWidth.BITS_8
                    : CodePointTrie.ValueWidth.BITS_16;
    fFrozenTrie = fTrie.buildImmutable(CodePointTrie.Type.FAST, valueWidth);
    // The mutable trie is dropped once the immutable one exists.
    fTrie = null;
}
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int getTrieSize() {
if (fFrozenTrie == null) {
fFrozenTrie = fTrie.toTrie2_16();
fTrie = null;
}
return fFrozenTrie.getSerializedLength();
freezeTrieIfNotYet();
return fFrozenTrie.toBinary(new ByteArrayOutputStream());
}
@ -346,11 +370,8 @@ class RBBISetBuilder {
//
//-----------------------------------------------------------------------------------
void serializeTrie(OutputStream os) throws IOException {
if (fFrozenTrie == null) {
fFrozenTrie = fTrie.toTrie2_16();
fTrie = null;
}
fFrozenTrie.serialize(os);
freezeTrieIfNotYet();
fFrozenTrie.toBinary(os);
}
//------------------------------------------------------------------------

View File

@ -74,6 +74,8 @@ class RBBITableBuilder {
/** Synthesized safe table, a List of row arrays. */
private List<short[]> fSafeTable;
private static final int MAX_STATE_FOR_8BITS_TABLE = 255;
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
int[] fLookAheadRuleMap;
@ -1097,10 +1099,11 @@ class RBBITableBuilder {
if (fRB.fTreeRoots[fRootIx] == null) {
return 0;
}
int size = 16; // The header of 4 ints, with no rows to the table.
int size = 12; // The header of 4 ints, with no rows to the table.
int numRows = fDStates.size();
int numCols = fRB.fSetBuilder.getNumCharCategories();
int rowSize = 8 + 2*numCols;
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
size += numRows * rowSize;
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
return size;
@ -1125,13 +1128,20 @@ class RBBITableBuilder {
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
fDStates.size() < 0x7fff);
table.fNumStates = fDStates.size();
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
// Size of table size in shorts.
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
int tableSize = (getTableSize() - 16) / 2; // fTable length in shorts.
table.fTable = new short[tableSize];
table.fRowLen = rowLen * 2; // Row length in bytes.
int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
int tableSize;
if (use8Bits) {
tableSize = (getTableSize() - 12); // fTable length in bytes.
table.fTable = new char[tableSize];
table.fRowLen = rowLen; // Row length in bytes.
} else {
tableSize = (getTableSize() - 12) / 2; // fTable length in shorts.
table.fTable = new char[tableSize];
table.fRowLen = rowLen * 2; // Row length in bytes.
}
if (fRB.fLookAheadHardBreak) {
table.fFlags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
@ -1139,18 +1149,29 @@ class RBBITableBuilder {
if (fRB.fSetBuilder.sawBOF()) {
table.fFlags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
}
if (use8Bits) {
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
}
int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
for (state=0; state<table.fNumStates; state++) {
RBBIStateDescriptor sd = fDStates.get(state);
int row = state*rowLen;
Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
table.fTable[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
table.fTable[row + RBBIDataWrapper.TAGIDX] = (short)sd.fTagsIdx;
if (use8Bits) {
Assert.assrt (-128 < sd.fAccepting && sd.fAccepting <= MAX_STATE_FOR_8BITS_TABLE);
Assert.assrt (-128 < sd.fLookAhead && sd.fLookAhead <= MAX_STATE_FOR_8BITS_TABLE);
} else {
Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
}
table.fTable[row + RBBIDataWrapper.ACCEPTING] = (char)sd.fAccepting;
table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (char)sd.fLookAhead;
table.fTable[row + RBBIDataWrapper.TAGIDX] = (char)sd.fTagsIdx;
for (col=0; col<numCharCategories; col++) {
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
if (use8Bits) {
Assert.assrt (sd.fDtran[col] <= MAX_STATE_FOR_8BITS_TABLE);
}
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)sd.fDtran[col];
}
}
return table;
@ -1250,10 +1271,12 @@ class RBBITableBuilder {
if (fSafeTable == null) {
return 0;
}
int size = 16; // The header of 4 ints, with no rows to the table.
int size = 12; // The header of 4 ints, with no rows to the table.
int numRows = fSafeTable.size();
int numCols = fSafeTable.get(0).length;
int rowSize = 8 + 2*numCols;
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
size += numRows * rowSize;
// TODO: there are redundant round-up. Figure out best place, get rid of the rest.
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
@ -1269,23 +1292,33 @@ class RBBITableBuilder {
RBBIDataWrapper.RBBIStateTable exportSafeTable() {
RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
table.fNumStates = fSafeTable.size();
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
int numCharCategories = fSafeTable.get(0).length;
// Size of table size in shorts.
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
int rowLen = 4 + numCharCategories;
int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
// TODO: tableSize is basically numStates * numCharCategories,
// except for alignment padding. Clean up here, and in main exportTable().
int tableSize = (getSafeTableSize() - 16) / 2; // fTable length in shorts.
table.fTable = new short[tableSize];
table.fRowLen = rowLen * 2; // Row length in bytes.
int tableSize = (getSafeTableSize() - 12); // fTable length in bytes.
if (use8Bits) {
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
table.fTable = new char[tableSize];
table.fRowLen = rowLen; // Row length in bytes.
} else {
tableSize /= 2; // fTable length in shorts.
table.fTable = new char[tableSize];
table.fRowLen = rowLen * 2; // Row length in bytes.
}
for (int state=0; state<table.fNumStates; state++) {
short[] rowArray = fSafeTable.get(state);
int row = state * rowLen;
for (int col=0; col<numCharCategories; col++) {
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = rowArray[col];
if (use8Bits) {
Assert.assrt (rowArray[col] <= MAX_STATE_FOR_8BITS_TABLE);
}
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)rowArray[col];
}
}
return table;

View File

@ -26,10 +26,10 @@ import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;
/**
* Rule Based Break Iterator
@ -821,9 +821,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
// caches for quicker access
CharacterIterator text = fText;
Trie2 trie = fRData.fTrie;
CodePointTrie trie = fRData.fTrie;
short[] stateTable = fRData.fFTable.fTable;
char[] stateTable = fRData.fFTable.fTable;
int initialPosition = fPosition;
text.setIndex(initialPosition);
int result = initialPosition;
@ -844,6 +844,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
short category = 3;
int flagsState = fRData.fFTable.fFlags;
int mode = RBBI_RUN;
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
@ -885,10 +887,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
if ((category & dictMask) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
category &= ~dictMask;
}
if (TRACE) {
@ -912,8 +914,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up a state transition in the state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0xffff) {
// Match found, common case
result = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
@ -927,7 +928,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
if (completedRule > 0 && completedRule != 0xffff) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
@ -937,13 +938,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
}
// If we are at the position of the '/' in a look-ahead (hard break) rule;
// record the current position, to be returned later, if the full rule matches.
// TODO: Move this check before the previous check of fAccepting.
// This would enable hard-break rules with no following context.
// But there are line break test failures when trying this. Investigate.
// Issue ICU-20837
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
@ -996,14 +998,17 @@ public class RuleBasedBreakIterator extends BreakIterator {
* @internal
*/
private int handleSafePrevious(int fromPosition) {
int state;
char state;
short category = 0;
int result = 0;
// caches for quicker access
CharacterIterator text = fText;
Trie2 trie = fRData.fTrie;
short[] stateTable = fRData.fRTable.fTable;
CodePointTrie trie = fRData.fTrie;
char[] stateTable = fRData.fRTable.fTable;
int flagsState = fRData.fRTable.fFlags;
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
CISetIndex32(text, fromPosition);
if (TRACE) {
@ -1029,7 +1034,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
//
// And off the dictionary flag bit. For reverse iteration it is not used.
category = (short) trie.get(c);
category &= ~0x4000;
category &= ~dictMask;
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
@ -1209,6 +1214,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
int category;
int current;
int foundBreakCount = 0;
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
@ -1219,7 +1226,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
category = (short)fRData.fTrie.get(c);
while(true) {
while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) {
c = CharacterIteration.next32(fText); // pre-increment
category = (short)fRData.fTrie.get(c);
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f8b27a3e77ffbe4468ede22ae8a4ca85df993472d6bdea432505a148ff33f23
size 13149606
oid sha256:8ed7db50765b06c8a35f48048543c5c9a2c2e19993f752bd71a15e6ac89aa3b3
size 13141781

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58c1ec5386cba3b6660c3bf8c22ce74c343d3101354f157533d13ec1099e1379
size 94524
oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9
size 94529

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c39bb717c3e95c47c14f49507e8a7866e89bdb3588f021693c3909dda64f4dcb
size 723466
oid sha256:e032f823e0ba2fd99f784fe400675049c126e091158a285955c71aa5e2c6036b
size 723481

View File

@ -29,8 +29,10 @@ import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ULocale;
@RunWith(JUnit4.class)
public class RBBITest extends TestFmwk {
public RBBITest() {
@ -562,7 +564,7 @@ public class RBBITest extends TestFmwk {
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
String rtRules = bi.toString(); // getRules() in C++
assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
assertEquals("Break Iterator rule stripping test", "!!forward;$x=[ab#];'#''?';", rtRules);
}
@Test
@ -582,7 +584,7 @@ public class RBBITest extends TestFmwk {
StringBuilder s = new StringBuilder();
for (int r = 1; r < fwtbl.fNumStates; r++) {
int row = dw.getRowIndex(r);
short tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
s.append((char)tableVal);
}
columns.add(s.toString());
@ -602,13 +604,12 @@ public class RBBITest extends TestFmwk {
for (int r=0; r<fwtbl.fNumStates; r++) {
StringBuilder s = new StringBuilder();
int row = dw.getRowIndex(r);
assertTrue("Accepting < -1", fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING] >= -1);
s.append(fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING]);
s.append(fwtbl.fTable[row + RBBIDataWrapper.LOOKAHEAD]);
s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGIDX]);
for (int column=0; column<numCharClasses; column++) {
short tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
s.append((char)tableVal);
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
s.append(tableVal);
}
rows.add(s.toString());
}
@ -655,4 +656,199 @@ public class RBBITest extends TestFmwk {
assertTrue("Reverse Table", RBBIDataWrapper.equals(bi.fRData.fRTable, bi2.fRData.fRTable));
}
}
// Shared driver for the tests covering 8/16 bit trie values combined with 8/16 bit
// state table rows.
//
// Builds a rule set with numChar quoted two-character literals (one per character,
// starting at U+4E00) followed by a catch-all ".;" rule, verifies the resulting
// character class count, state counts and trie / state table widths, and then walks
// the text forwards and backwards checking every reported boundary.
private void testTrieStateTable(int numChar, boolean expectUCPTrieValueWidthIn8Bits,
        boolean expectStateRowIn8Bits) {
    // Test text: every character from U+4E00 up to (not including) U+5000, each doubled.
    StringBuilder textBuilder = new StringBuilder(2 * (0x5000 - 0x4e00));
    for (char ch = 0x4e00; ch < 0x5000; ch++) {
        textBuilder.append(ch).append(ch);
    }
    String text = textBuilder.toString();

    // Rules: one quoted literal of a doubled character for each of the first numChar
    // characters. The assertions below expect this to produce numChar + 4 character
    // classes and numChar + 3 states.
    StringBuilder ruleBuilder = new StringBuilder(100 + 6 * numChar);
    ruleBuilder.append("!!quoted_literals_only;");
    for (char ch = 0x4e00; ch < 0x4e00 + numChar; ch++) {
        ruleBuilder.append("\'").append(ch).append(ch).append("';");
    }
    ruleBuilder.append(".;");
    String rules = ruleBuilder.toString();

    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
    RBBIDataWrapper data = bi.fRData;
    RBBIDataWrapper.RBBIStateTable forwardTable = data.fFTable;
    RBBIDataWrapper.RBBIStateTable reverseTable = data.fRTable;

    boolean forwardRowsIn8Bits = (forwardTable.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
    boolean reverseRowsIn8Bits = (reverseTable.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
    boolean trieIn8Bits = data.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;

    // Structure of the compiled data.
    assertEquals("Number of char classes mismatch numChar=" + numChar, numChar + 4, data.fHeader.fCatCount);
    assertEquals("Number of states in Forward Table mismatch numChar=" + numChar, numChar + 3, forwardTable.fNumStates);
    assertEquals("Number of states in Reverse Table mismatch numChar=" + numChar, numChar + 3, reverseTable.fNumStates);
    assertEquals("Trie width mismatch numChar=" + numChar, expectUCPTrieValueWidthIn8Bits, trieIn8Bits);
    assertEquals("Bits of Forward State table mismatch numChar=" + numChar,
            expectStateRowIn8Bits, forwardRowsIn8Bits);
    assertEquals("Bits of Reverse State table mismatch numChar=" + numChar,
            expectStateRowIn8Bits, reverseRowsIn8Bits);

    bi.setText(text);

    // Forward iteration. The first numChar boundaries fall after each doubled pair
    // (no break inside the pair); after that there is a boundary after every character.
    int boundaryCount = 0;
    for (int boundary = bi.next(); boundary > 0; boundary = bi.next()) {
        boolean withinPairedLiterals = boundaryCount < numChar;
        boundaryCount++;
        int expected = withinPairedLiterals ? boundaryCount * 2 : boundaryCount + numChar;
        assertEquals("next() mismatch numChar=" + numChar, expected, boundary);
    }

    // Backward iteration retraces the same boundaries, counting back down.
    for (int boundary = bi.previous(); boundary > 0; boundary = bi.previous()) {
        boundaryCount--;
        int expected = (boundaryCount < numChar) ? boundaryCount * 2 : boundaryCount + numChar;
        assertEquals("previous() mismatch numChar=" + numChar, expected, boundary);
    }
}
@Test
public void Test8BitsTrieWith8BitStateTable() {
    // 123 literals: both the trie values and the state table rows are expected
    // to be in their 8 bit form.
    final boolean expectUCPTrieValueWidthIn8Bits = true;
    final boolean expectStateRowIn8Bits = true;
    testTrieStateTable(123, expectUCPTrieValueWidthIn8Bits, expectStateRowIn8Bits);
}
@Test
public void Test16BitsTrieWith8BitStateTable() {
    // 124 literals: the trie is expected to need 16 bit values while the
    // state table rows are still expected to be 8 bit.
    final boolean expectUCPTrieValueWidthIn8Bits = false;
    final boolean expectStateRowIn8Bits = true;
    testTrieStateTable(124, expectUCPTrieValueWidthIn8Bits, expectStateRowIn8Bits);
}
@Test
public void Test16BitsTrieWith16BitStateTable() {
    // 255 literals: both the trie values and the state table rows are expected
    // to be in their 16 bit form.
    final boolean expectUCPTrieValueWidthIn8Bits = false;
    final boolean expectStateRowIn8Bits = false;
    testTrieStateTable(255, expectUCPTrieValueWidthIn8Bits, expectStateRowIn8Bits);
}
@Test
public void Test8BitsTrieWith16BitStateTable() {
    // Exercise an 8 bit trie together with 16 bit state table rows. This needs many
    // states but few character classes, so a single long literal made of one
    // repeated character is used instead of many distinct literals.

    // The text is 510 copies of 'a'.
    StringBuilder textBuilder = new StringBuilder(510);
    for (int count = 0; count < 510; count++) {
        textBuilder.append('a');
    }
    String text = textBuilder.toString();

    // The rule literal is 254 copies of 'a', which will cause 256 states.
    String rules = "!!quoted_literals_only;'" + text.substring(0, 254) + "';.;";

    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
    RBBIDataWrapper data = bi.fRData;
    RBBIDataWrapper.RBBIStateTable forwardTable = data.fFTable;

    boolean rowsIn8Bits = (forwardTable.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
    boolean trieIn8Bits = data.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;
    assertFalse("State table should be in 16 bits", rowsIn8Bits);
    assertTrue("Trie should be in 8 bits", trieIn8Bits);

    bi.setText(text);

    // Expected break positions: 254, 508, 509, 510
    assertEquals("next()", 254, bi.next());
    int offset = 0;
    for (int boundary = bi.next(); boundary > 0; boundary = bi.next()) {
        assertEquals("next()", 508 + offset , boundary);
        offset++;
    }

    // Walk back from the end: 509, 508, then the literal boundary at 254.
    int stepsBack = 0;
    for (int boundary = bi.previous(); boundary > 0; boundary = bi.previous()) {
        stepsBack++;
        int expected = (boundary >= 508) ? 510 - stepsBack : 254;
        assertEquals("previous()", expected , boundary);
    }
}
/**
 * Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
 * that there are no problems with rules at the sizes that transition between the two.
 *
 * A rule that matches a literal string, like 'abcdefghij', will require one state and
 * one character class per character in the string. So we can make a rule to tickle the
 * boundaries by using literal strings of various lengths.
 *
 * The literal is grown one character at a time from 120 to 260 characters. At the
 * shortest length both the trie and the state table are expected to be in 8 bit form;
 * at the longest length both are expected to be in 16 bit form, so the loop crosses
 * both the trie transition and the state table transition. Forward and reverse
 * iteration are checked at every length, which leaves some margin for changes to the
 * number of values reserved by the rule builder without breaking the test.
 */
@Test
public void TestTable_8_16_Bits() {
    // testStr serves as both the source of the rule string (truncated to the desired length)
    // and as test data to check matching behavior. A break rule consisting of the first 120
    // characters of testStr will match the first 120 chars of the full-length testStr.
    StringBuilder builder = new StringBuilder(0x200);
    for (char c = 0x3000; c < 0x3200; ++c) {
        builder.append(c);
    }
    String testStr = builder.toString();

    final int startLength = 120;   // The shortest rule string to test.
    final int endLength = 260;     // The longest rule string to test.
    final int increment = 1;

    for (int ruleLen = startLength; ruleLen <= endLength; ruleLen += increment) {
        // The '#' placeholder is replaced by the first ruleLen characters of testStr.
        String ruleString = "!!quoted_literals_only; '#';"
                .replace("#", testStr.substring(0, ruleLen));
        RuleBasedBreakIterator bi = new RuleBasedBreakIterator(ruleString);

        // Verify that the break iterator is functioning - that the first boundary found
        // in testStr is at the length of the rule string.
        bi.setText(testStr);
        assertEquals("The first boundary found in testStr should be at the length of the rule string",
                ruleLen, bi.next());

        // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
        // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
        bi.setText(testStr);
        int result = bi.preceding(ruleLen);
        assertEquals("Reverse iteration should find the boundary at 0", 0, result);

        // Verify that the range of rule lengths being tested covers the transitions
        // from 8 to 16 bit data.
        RBBIDataWrapper dw = bi.fRData;
        RBBIDataWrapper.RBBIStateTable fwtbl = dw.fFTable;
        boolean has8BitRowData = (fwtbl.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
        boolean has8BitsTrie = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;

        if (ruleLen == startLength) {
            assertTrue("State table should be in 8 bits", has8BitRowData);
            assertTrue("Trie should be in 8 bits", has8BitsTrie);
        }
        if (ruleLen == endLength) {
            assertFalse("State table should be in 16 bits", has8BitRowData);
            assertFalse("Trie should be in 16 bits", has8BitsTrie);
        }
    }
}
}