parent
566e0f8686
commit
c5ebb80a73
@ -746,18 +746,68 @@ struct LookAheadResults {
|
||||
};
|
||||
|
||||
|
||||
// Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
|
||||
// instantiation, based on whether an 8 or 16 bit table is required.
|
||||
//
|
||||
// These Trie access functions will be inlined within the handleNext()/Previous() instantions.
|
||||
static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
|
||||
return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
|
||||
}
|
||||
|
||||
static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
|
||||
return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
|
||||
}
|
||||
|
||||
int32_t RuleBasedBreakIterator::handleNext() {
|
||||
const RBBIStateTable *statetable = fData->fForwardTable;
|
||||
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
|
||||
if (statetable->fFlags & RBBI_8BITS_ROWS) {
|
||||
if (use8BitsTrie) {
|
||||
return handleNext<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>();
|
||||
} else {
|
||||
return handleNext<RBBIStateTableRow8, TrieFunc16, kDictBit>();
|
||||
}
|
||||
} else {
|
||||
if (use8BitsTrie) {
|
||||
return handleNext<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>();
|
||||
} else {
|
||||
return handleNext<RBBIStateTableRow16, TrieFunc16, kDictBit>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
const RBBIStateTable *statetable = fData->fReverseTable;
|
||||
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
|
||||
if (statetable->fFlags & RBBI_8BITS_ROWS) {
|
||||
if (use8BitsTrie) {
|
||||
return handleSafePrevious<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
|
||||
} else {
|
||||
return handleSafePrevious<RBBIStateTableRow8, TrieFunc16, kDictBit>(fromPosition);
|
||||
}
|
||||
} else {
|
||||
if (use8BitsTrie) {
|
||||
return handleSafePrevious<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
|
||||
} else {
|
||||
return handleSafePrevious<RBBIStateTableRow16, TrieFunc16, kDictBit>(fromPosition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handleNext()
|
||||
// Run the state machine to find a boundary
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
|
||||
int32_t RuleBasedBreakIterator::handleNext() {
|
||||
int32_t state;
|
||||
uint16_t category = 0;
|
||||
RBBIRunMode mode;
|
||||
|
||||
RBBIStateTableRow *row;
|
||||
RowType *row;
|
||||
UChar32 c;
|
||||
LookAheadResults lookAheadMatches;
|
||||
int32_t result = 0;
|
||||
@ -789,7 +839,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
|
||||
// Set the initial state for the state machine
|
||||
state = START_STATE;
|
||||
row = (RBBIStateTableRow *)
|
||||
row = (RowType *)
|
||||
//(statetable->fTableData + (statetable->fRowLen * state));
|
||||
(tableData + tableRowLen * state);
|
||||
|
||||
@ -825,20 +875,17 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
if (mode == RBBI_RUN) {
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
category = trieFunc(fData->fTrie, c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iteration.
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
if ((category & dictMask) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
category &= ~dictMask;
|
||||
}
|
||||
}
|
||||
|
||||
@ -860,7 +907,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
// fNextState is a variable-length array.
|
||||
U_ASSERT(category<fData->fHeader->fCatCount);
|
||||
state = row->fNextState[category]; /*Not accessing beyond memory*/
|
||||
row = (RBBIStateTableRow *)
|
||||
row = (RowType *)
|
||||
// (statetable->fTableData + (statetable->fRowLen * state));
|
||||
(tableData + tableRowLen * state);
|
||||
|
||||
@ -948,10 +995,12 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
||||
// because the safe table does not require as many options.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
|
||||
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
|
||||
int32_t state;
|
||||
uint16_t category = 0;
|
||||
RBBIStateTableRow *row;
|
||||
RowType *row;
|
||||
UChar32 c;
|
||||
int32_t result = 0;
|
||||
|
||||
@ -971,7 +1020,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
// Set the initial state for the state machine
|
||||
c = UTEXT_PREVIOUS32(&fText);
|
||||
state = START_STATE;
|
||||
row = (RBBIStateTableRow *)
|
||||
row = (RowType *)
|
||||
(stateTable->fTableData + (stateTable->fRowLen * state));
|
||||
|
||||
// loop until we reach the start of the text or transition to state 0
|
||||
@ -980,12 +1029,10 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
category &= ~0x4000;
|
||||
// Off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = trieFunc(fData->fTrie, c);
|
||||
category &= ~dictMask;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
@ -1004,7 +1051,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
// fNextState is a variable-length array.
|
||||
U_ASSERT(category<fData->fHeader->fCatCount);
|
||||
state = row->fNextState[category]; /*Not accessing beyond memory*/
|
||||
row = (RBBIStateTableRow *)
|
||||
row = (RowType *)
|
||||
(stateTable->fTableData + (stateTable->fRowLen * state));
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
@ -1024,6 +1071,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// getRuleStatus() Return the break rule tag associated with the current
|
||||
|
@ -119,6 +119,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
|
||||
|
||||
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
|
||||
int32_t firstRuleStatus, int32_t otherRuleStatus) {
|
||||
uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ?
|
||||
kDictBitFor8BitsTrie : kDictBit;
|
||||
if ((endPos - startPos) <= 1) {
|
||||
return;
|
||||
}
|
||||
@ -142,13 +144,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
||||
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
UChar32 c = utext_current32(text);
|
||||
category = UTRIE2_GET16(fBI->fData->fTrie, c);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
|
||||
while(U_SUCCESS(status)) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) {
|
||||
utext_next32(text); // TODO: cleaner loop structure.
|
||||
c = utext_current32(text);
|
||||
category = UTRIE2_GET16(fBI->fData->fTrie, c);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
}
|
||||
if (current >= rangeEnd) {
|
||||
break;
|
||||
@ -166,7 +168,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
c = utext_current32(text);
|
||||
category = UTRIE2_GET16(fBI->fData->fTrie, c);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
}
|
||||
|
||||
// If we found breaks, ensure that the first and last entries are
|
||||
|
@ -11,10 +11,10 @@
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "rbbidata.h"
|
||||
#include "rbbirb.h"
|
||||
#include "utrie2.h"
|
||||
#include "udatamem.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
@ -110,17 +110,24 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
|
||||
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
NULL, // *actual length
|
||||
&status);
|
||||
fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST,
|
||||
UCPTRIE_VALUE_BITS_ANY,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
nullptr, // *actual length
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
|
||||
fRuleString.setTo(TRUE, fRuleSource, -1);
|
||||
UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie);
|
||||
if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
fRuleSource = ((char *)data + fHeader->fRuleSource);
|
||||
fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen));
|
||||
U_ASSERT(data->fRuleSourceLen > 0);
|
||||
|
||||
fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
|
||||
@ -142,8 +149,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
||||
//-----------------------------------------------------------------------------
|
||||
RBBIDataWrapper::~RBBIDataWrapper() {
|
||||
U_ASSERT(fRefCount == 0);
|
||||
utrie2_close(fTrie);
|
||||
fTrie = NULL;
|
||||
ucptrie_close(fTrie);
|
||||
fTrie = nullptr;
|
||||
if (fUDataMem) {
|
||||
udata_close(fUDataMem);
|
||||
} else if (!fDontFreeData) {
|
||||
@ -225,6 +232,11 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
|
||||
|
||||
RBBIDebugPrintf(" %s\n", heading);
|
||||
|
||||
RBBIDebugPrintf("Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
|
||||
table->fFlags,
|
||||
table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
|
||||
table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
|
||||
table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
|
||||
RBBIDebugPrintf("State | Acc LA TagIx");
|
||||
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
|
||||
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
|
||||
@ -236,12 +248,20 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
|
||||
RBBIDebugPrintf(" N U L L T A B L E\n\n");
|
||||
return;
|
||||
}
|
||||
UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
|
||||
for (s=0; s<table->fNumStates; s++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)
|
||||
(table->fTableData + (table->fRowLen * s));
|
||||
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("%3d ", row->fNextState[c]);
|
||||
if (use8Bits) {
|
||||
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagIdx);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
|
||||
}
|
||||
} else {
|
||||
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagIdx);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
|
||||
}
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
@ -377,35 +397,64 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
|
||||
//
|
||||
int32_t topSize = offsetof(RBBIStateTable, fTableData);
|
||||
|
||||
// Forward state table.
|
||||
// Forward state table.
|
||||
tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
|
||||
tableLength = ds->readUInt32(rbbiDH->fFTableLen);
|
||||
|
||||
if (tableLength > 0) {
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
|
||||
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
|
||||
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
outBytes+tableStartOffset, status);
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
|
||||
// Swap the state table if the table is in 16 bits.
|
||||
if (use8Bits) {
|
||||
if (outBytes != inBytes) {
|
||||
uprv_memmove(outBytes+tableStartOffset+topSize,
|
||||
inBytes+tableStartOffset+topSize,
|
||||
tableLength-topSize);
|
||||
}
|
||||
} else {
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Reverse state table. Same layout as forward table, above.
|
||||
tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
|
||||
tableLength = ds->readUInt32(rbbiDH->fRTableLen);
|
||||
|
||||
if (tableLength > 0) {
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
|
||||
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
|
||||
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
outBytes+tableStartOffset, status);
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
|
||||
// Swap the state table if the table is in 16 bits.
|
||||
if (use8Bits) {
|
||||
if (outBytes != inBytes) {
|
||||
uprv_memmove(outBytes+tableStartOffset+topSize,
|
||||
inBytes+tableStartOffset+topSize,
|
||||
tableLength-topSize);
|
||||
}
|
||||
} else {
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
}
|
||||
|
||||
// Trie table for character categories
|
||||
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
|
||||
ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
|
||||
|
||||
// Source Rules Text. It's UChar data
|
||||
ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
|
||||
// Source Rules Text. It's UTF8 data
|
||||
if (outBytes != inBytes) {
|
||||
uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource),
|
||||
inBytes+ds->readUInt32(rbbiDH->fRuleSource),
|
||||
ds->readUInt32(rbbiDH->fRuleSourceLen));
|
||||
}
|
||||
|
||||
// Table of rule status values. It's all int_32 values
|
||||
ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
|
||||
|
@ -49,16 +49,17 @@ ubrk_swap(const UDataSwapper *ds,
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uversion.h"
|
||||
#include "umutex.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// The current RBBI data format version.
|
||||
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {5, 0, 0, 0};
|
||||
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
|
||||
|
||||
/*
|
||||
* The following structs map exactly onto the raw data from ICU common data file.
|
||||
@ -94,25 +95,25 @@ struct RBBIDataHeader {
|
||||
|
||||
|
||||
|
||||
struct RBBIStateTableRow {
|
||||
int16_t fAccepting; /* Non-zero if this row is for an accepting state. */
|
||||
template <typename ST, typename UT>
|
||||
struct RBBIStateTableRowT {
|
||||
ST fAccepting; /* Non-zero if this row is for an accepting state. */
|
||||
/* Value 0: not an accepting state. */
|
||||
/* -1: Unconditional Accepting state. */
|
||||
/* positive: Look-ahead match has completed. */
|
||||
/* Actual boundary position happened earlier */
|
||||
/* Value here == fLookAhead in earlier */
|
||||
/* state, at actual boundary pos. */
|
||||
int16_t fLookAhead; /* Non-zero if this row is for a state that */
|
||||
ST fLookAhead; /* Non-zero if this row is for a state that */
|
||||
/* corresponds to a '/' in the rule source. */
|
||||
/* Value is the same as the fAccepting */
|
||||
/* value for the rule (which will appear */
|
||||
/* in a different state. */
|
||||
int16_t fTagIdx; /* Non-zero if this row covers a {tagged} position */
|
||||
ST fTagIdx; /* Non-zero if this row covers a {tagged} position */
|
||||
/* from a rule. Value is the index in the */
|
||||
/* StatusTable of the set of matching */
|
||||
/* tags (rule status values) */
|
||||
int16_t fReserved;
|
||||
uint16_t fNextState[1]; /* Next State, indexed by char category. */
|
||||
UT fNextState[1]; /* Next State, indexed by char category. */
|
||||
/* Variable-length array declared with length 1 */
|
||||
/* to disable bounds checkers. */
|
||||
/* Array Size is actually fData->fHeader->fCatCount*/
|
||||
@ -120,12 +121,18 @@ struct RBBIStateTableRow {
|
||||
/* before changing anything here. */
|
||||
};
|
||||
|
||||
typedef RBBIStateTableRowT<int8_t, uint8_t> RBBIStateTableRow8;
|
||||
typedef RBBIStateTableRowT<int16_t, uint16_t> RBBIStateTableRow16;
|
||||
|
||||
union RBBIStateTableRow {
|
||||
RBBIStateTableRow16 r16;
|
||||
RBBIStateTableRow8 r8;
|
||||
};
|
||||
|
||||
struct RBBIStateTable {
|
||||
uint32_t fNumStates; /* Number of states. */
|
||||
uint32_t fRowLen; /* Length of a state table row, in bytes. */
|
||||
uint32_t fFlags; /* Option Flags for this state table */
|
||||
uint32_t fReserved; /* reserved */
|
||||
char fTableData[1]; /* First RBBIStateTableRow begins here. */
|
||||
/* Variable-length array declared with length 1 */
|
||||
/* to disable bounds checkers. */
|
||||
@ -133,10 +140,9 @@ struct RBBIStateTable {
|
||||
/* arithmetic for indexing variable length rows.) */
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
RBBI_LOOKAHEAD_HARD_BREAK = 1,
|
||||
RBBI_BOF_REQUIRED = 2
|
||||
} RBBIStateTableFlags;
|
||||
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
|
||||
constexpr uint32_t RBBI_BOF_REQUIRED = 2;
|
||||
constexpr uint32_t RBBI_8BITS_ROWS = 4;
|
||||
|
||||
|
||||
/* */
|
||||
@ -170,13 +176,13 @@ public:
|
||||
const RBBIDataHeader *fHeader;
|
||||
const RBBIStateTable *fForwardTable;
|
||||
const RBBIStateTable *fReverseTable;
|
||||
const UChar *fRuleSource;
|
||||
const char *fRuleSource;
|
||||
const int32_t *fRuleStatusTable;
|
||||
|
||||
/* number of int32_t values in the rule status table. Used to sanity check indexing */
|
||||
int32_t fStatusMaxIdx;
|
||||
|
||||
UTrie2 *fTrie;
|
||||
UCPTrie *fTrie;
|
||||
|
||||
private:
|
||||
u_atomic_int32_t fRefCount;
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uchriter.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
||||
@ -154,7 +155,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
|
||||
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
int32_t rulesLengthInUTF8 = 0;
|
||||
u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
|
||||
fStrippedRules.getBuffer(), fStrippedRules.length(),
|
||||
0xfffd, nullptr, fStatus);
|
||||
*fStatus = U_ZERO_ERROR;
|
||||
|
||||
int32_t rulesSize = align8((rulesLengthInUTF8+1));
|
||||
|
||||
int32_t totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
@ -197,11 +205,11 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
data->fRTableLen = reverseTableSize;
|
||||
|
||||
data->fTrie = data->fRTable + data->fRTableLen;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fStatusTable = data->fTrie + trieSize;
|
||||
data->fTrieLen = trieSize;
|
||||
data->fStatusTable = data->fTrie + data->fTrieLen;
|
||||
data->fStatusTableLen= statusTableSize;
|
||||
data->fRuleSource = data->fStatusTable + statusTableSize;
|
||||
data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
|
||||
data->fRuleSourceLen = rulesLengthInUTF8;
|
||||
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
@ -214,7 +222,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
|
||||
}
|
||||
|
||||
fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
|
||||
fStrippedRules.getBuffer(), fStrippedRules.length(),
|
||||
0xfffd, nullptr, fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
@ -829,16 +829,14 @@ static const UChar chRParen = 0x29;
|
||||
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
|
||||
UnicodeString strippedRules;
|
||||
int32_t rulesLength = rules.length();
|
||||
bool skippingSpaces = false;
|
||||
|
||||
for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
|
||||
UChar32 cp = rules.char32At(idx);
|
||||
bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
|
||||
if (skippingSpaces && whiteSpace) {
|
||||
if (whiteSpace) {
|
||||
continue;
|
||||
}
|
||||
strippedRules.append(cp);
|
||||
skippingSpaces = whiteSpace;
|
||||
}
|
||||
return strippedRules;
|
||||
}
|
||||
|
@ -35,7 +35,6 @@
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "utrie2.h"
|
||||
#include "uvector.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
@ -46,6 +45,7 @@
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
const int32_t kMaxCharCategoriesFor8BitsTrie = 127;
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor
|
||||
@ -56,7 +56,8 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
|
||||
fRB = rb;
|
||||
fStatus = rb->fStatus;
|
||||
fRangeList = 0;
|
||||
fTrie = 0;
|
||||
fMutableTrie = nullptr;
|
||||
fTrie = nullptr;
|
||||
fTrieSize = 0;
|
||||
fGroupCount = 0;
|
||||
fSawBOF = FALSE;
|
||||
@ -79,7 +80,8 @@ RBBISetBuilder::~RBBISetBuilder()
|
||||
delete r;
|
||||
}
|
||||
|
||||
utrie2_close(fTrie);
|
||||
ucptrie_close(fTrie);
|
||||
umutablecptrie_close(fMutableTrie);
|
||||
}
|
||||
|
||||
|
||||
@ -255,17 +257,23 @@ void RBBISetBuilder::buildRanges() {
|
||||
void RBBISetBuilder::buildTrie() {
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
fTrie = utrie2_open(0, // Initial value for all code points.
|
||||
fMutableTrie = umutablecptrie_open(
|
||||
0, // Initial value for all code points.
|
||||
0, // Error value for out-of-range input.
|
||||
fStatus);
|
||||
|
||||
bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
|
||||
for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
|
||||
utrie2_setRange32(fTrie,
|
||||
rlRange->fStartChar, // Range start
|
||||
rlRange->fEndChar, // Range end (inclusive)
|
||||
rlRange->fNum, // value for range
|
||||
TRUE, // Overwrite previously written values
|
||||
fStatus);
|
||||
uint32_t value = rlRange->fNum;
|
||||
if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) {
|
||||
U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0);
|
||||
value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit);
|
||||
}
|
||||
umutablecptrie_setRange(fMutableTrie,
|
||||
rlRange->fStartChar, // Range start
|
||||
rlRange->fEndChar, // Range end (inclusive)
|
||||
value, // value for range
|
||||
fStatus);
|
||||
}
|
||||
}
|
||||
|
||||
@ -274,8 +282,8 @@ void RBBISetBuilder::mergeCategories(IntPair categories) {
|
||||
U_ASSERT(categories.first >= 1);
|
||||
U_ASSERT(categories.second > categories.first);
|
||||
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
|
||||
int32_t rangeNum = rd->fNum & ~DICT_BIT;
|
||||
int32_t rangeDict = rd->fNum & DICT_BIT;
|
||||
int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit;
|
||||
int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit;
|
||||
if (rangeNum == categories.second) {
|
||||
rd->fNum = categories.first | rangeDict;
|
||||
} else if (rangeNum > categories.second) {
|
||||
@ -295,15 +303,18 @@ int32_t RBBISetBuilder::getTrieSize() {
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return 0;
|
||||
}
|
||||
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
|
||||
fTrieSize = utrie2_serialize(fTrie,
|
||||
NULL, // Buffer
|
||||
0, // Capacity
|
||||
fStatus);
|
||||
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*fStatus = U_ZERO_ERROR;
|
||||
if (fTrie == nullptr) {
|
||||
bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
|
||||
fTrie = umutablecptrie_buildImmutable(
|
||||
fMutableTrie,
|
||||
UCPTRIE_TYPE_FAST,
|
||||
use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16,
|
||||
fStatus);
|
||||
fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus);
|
||||
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*fStatus = U_ZERO_ERROR;
|
||||
}
|
||||
}
|
||||
// RBBIDebugPrintf("Trie table size is %d\n", trieSize);
|
||||
return fTrieSize;
|
||||
}
|
||||
|
||||
@ -316,9 +327,9 @@ int32_t RBBISetBuilder::getTrieSize() {
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void RBBISetBuilder::serializeTrie(uint8_t *where) {
|
||||
utrie2_serialize(fTrie,
|
||||
where, // Buffer
|
||||
fTrieSize, // Capacity
|
||||
ucptrie_toBinary(fTrie,
|
||||
where, // Buffer
|
||||
fTrieSize, // Capacity
|
||||
fStatus);
|
||||
}
|
||||
|
||||
@ -467,7 +478,7 @@ void RBBISetBuilder::printRangeGroups() {
|
||||
lastPrintedGroupNum = groupNum;
|
||||
RBBIDebugPrintf("%2i ", groupNum);
|
||||
|
||||
if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
|
||||
if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" <DICT> ");}
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
@ -669,7 +680,7 @@ void RangeDescriptor::setDictionaryFlag() {
|
||||
if (varRef && varRef->fType == RBBINode::varRef) {
|
||||
const UnicodeString *setName = &varRef->fText;
|
||||
if (setName->compare(dictionary, -1) == 0) {
|
||||
fNum |= RBBISetBuilder::DICT_BIT;
|
||||
fNum |= RuleBasedBreakIterator::kDictBit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -16,9 +16,10 @@
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "rbbirb.h"
|
||||
#include "utrie2.h"
|
||||
#include "uvector.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
@ -101,8 +102,6 @@ public:
|
||||
*/
|
||||
void mergeCategories(IntPair categories);
|
||||
|
||||
static constexpr int32_t DICT_BIT = 0x4000;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
void printSets();
|
||||
void printRanges();
|
||||
@ -121,8 +120,9 @@ private:
|
||||
|
||||
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
|
||||
uint32_t fTrieSize; // the Unicode Sets.
|
||||
UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing
|
||||
UCPTrie *fTrie; // the Unicode Sets.
|
||||
uint32_t fTrieSize;
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
|
@ -28,6 +28,8 @@
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
const int32_t kMaxStateFor8BitsTable = 255;
|
||||
|
||||
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
|
||||
fRB(rb),
|
||||
fTree(*rootNode),
|
||||
@ -1335,11 +1337,18 @@ int32_t RBBITableBuilder::getTableSize() const {
|
||||
numRows = fDStates->size();
|
||||
numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
|
||||
if (use8BitsForTable()) {
|
||||
rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
|
||||
} else {
|
||||
rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
|
||||
}
|
||||
size += numRows * rowSize;
|
||||
return size;
|
||||
}
|
||||
|
||||
bool RBBITableBuilder::use8BitsForTable() const {
|
||||
return fDStates->size() <= kMaxStateFor8BitsTable;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
@ -1364,27 +1373,44 @@ void RBBITableBuilder::exportTable(void *where) {
|
||||
return;
|
||||
}
|
||||
|
||||
table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
|
||||
table->fNumStates = fDStates->size();
|
||||
table->fFlags = 0;
|
||||
if (use8BitsForTable()) {
|
||||
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
|
||||
table->fFlags |= RBBI_8BITS_ROWS;
|
||||
} else {
|
||||
table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
|
||||
}
|
||||
if (fRB->fLookAheadHardBreak) {
|
||||
table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
|
||||
}
|
||||
if (fRB->fSetBuilder->sawBOF()) {
|
||||
table->fFlags |= RBBI_BOF_REQUIRED;
|
||||
}
|
||||
table->fReserved = 0;
|
||||
|
||||
for (state=0; state<table->fNumStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
|
||||
U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
|
||||
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
|
||||
row->fAccepting = (int16_t)sd->fAccepting;
|
||||
row->fLookAhead = (int16_t)sd->fLookAhead;
|
||||
row->fTagIdx = (int16_t)sd->fTagsIdx;
|
||||
for (col=0; col<catCount; col++) {
|
||||
row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
|
||||
if (use8BitsForTable()) {
|
||||
U_ASSERT (-128 < sd->fAccepting && sd->fAccepting <= 127);
|
||||
U_ASSERT (-128 < sd->fLookAhead && sd->fLookAhead <= 127);
|
||||
U_ASSERT (-128 < sd->fTagsIdx && sd->fTagsIdx <= 127);
|
||||
row->r8.fAccepting = (int8_t)sd->fAccepting;
|
||||
row->r8.fLookAhead = (int8_t)sd->fLookAhead;
|
||||
row->r8.fTagIdx = (int8_t)sd->fTagsIdx;
|
||||
for (col=0; col<catCount; col++) {
|
||||
U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
|
||||
row->r8.fNextState[col] = sd->fDtran->elementAti(col);
|
||||
}
|
||||
} else {
|
||||
U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
|
||||
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
|
||||
row->r16.fAccepting = (int16_t)sd->fAccepting;
|
||||
row->r16.fLookAhead = (int16_t)sd->fLookAhead;
|
||||
row->r16.fTagIdx = (int16_t)sd->fTagsIdx;
|
||||
for (col=0; col<catCount; col++) {
|
||||
row->r16.fNextState[col] = sd->fDtran->elementAti(col);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1520,11 +1546,18 @@ int32_t RBBITableBuilder::getSafeTableSize() const {
|
||||
numRows = fSafeTable->size();
|
||||
numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
|
||||
if (use8BitsForSafeTable()) {
|
||||
rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
|
||||
} else {
|
||||
rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
|
||||
}
|
||||
size += numRows * rowSize;
|
||||
return size;
|
||||
}
|
||||
|
||||
bool RBBITableBuilder::use8BitsForSafeTable() const {
|
||||
return fSafeTable->size() <= kMaxStateFor8BitsTable;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
@ -1549,20 +1582,33 @@ void RBBITableBuilder::exportSafeTable(void *where) {
|
||||
return;
|
||||
}
|
||||
|
||||
table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
|
||||
table->fNumStates = fSafeTable->size();
|
||||
table->fFlags = 0;
|
||||
table->fReserved = 0;
|
||||
if (use8BitsForSafeTable()) {
|
||||
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
|
||||
table->fFlags |= RBBI_8BITS_ROWS;
|
||||
} else {
|
||||
table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
|
||||
}
|
||||
|
||||
for (state=0; state<table->fNumStates; state++) {
|
||||
UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
|
||||
row->fAccepting = 0;
|
||||
row->fLookAhead = 0;
|
||||
row->fTagIdx = 0;
|
||||
row->fReserved = 0;
|
||||
for (col=0; col<catCount; col++) {
|
||||
row->fNextState[col] = rowString->charAt(col);
|
||||
if (use8BitsForSafeTable()) {
|
||||
row->r8.fAccepting = 0;
|
||||
row->r8.fLookAhead = 0;
|
||||
row->r8.fTagIdx = 0;
|
||||
for (col=0; col<catCount; col++) {
|
||||
U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
|
||||
row->r8.fNextState[col] = rowString->charAt(col);
|
||||
}
|
||||
} else {
|
||||
row->r16.fAccepting = 0;
|
||||
row->r16.fLookAhead = 0;
|
||||
row->r16.fTagIdx = 0;
|
||||
for (col=0; col<catCount; col++) {
|
||||
row->r16.fNextState[col] = rowString->charAt(col);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -53,6 +53,9 @@ public:
|
||||
*/
|
||||
void exportTable(void *where);
|
||||
|
||||
/** Use 8 bits to encode the forward table */
|
||||
bool use8BitsForTable() const;
|
||||
|
||||
/**
|
||||
* Find duplicate (redundant) character classes. Begin looking with categories.first.
|
||||
* Duplicate, if found are returned in the categories parameter.
|
||||
@ -85,6 +88,8 @@ public:
|
||||
*/
|
||||
void exportSafeTable(void *where);
|
||||
|
||||
/** Use 8 bits to encode the safe reverse table */
|
||||
bool use8BitsForSafeTable() const;
|
||||
|
||||
private:
|
||||
void calcNullable(RBBINode *n);
|
||||
|
@ -32,6 +32,8 @@
|
||||
#include "unicode/parseerr.h"
|
||||
#include "unicode/schriter.h"
|
||||
|
||||
struct UCPTrie;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/** @internal */
|
||||
@ -659,6 +661,28 @@ private:
|
||||
*/
|
||||
int32_t handleNext();
|
||||
|
||||
/*
|
||||
* Templatized version of handleNext() and handleSafePrevious().
|
||||
*
|
||||
* There will be exactly four instantiations, two each for 8 and 16 bit tables,
|
||||
* two each for 8 and 16 bit trie.
|
||||
* Having separate instantiations for the table types keeps conditional tests of
|
||||
* the table type out of the inner loops, at the expense of replicated code.
|
||||
*
|
||||
* The template parameter for the Trie access function is a value, not a type.
|
||||
* Doing it this way, the compiler will inline the Trie function in the
|
||||
* expanded functions. (Both the 8 and 16 bit access functions have the same type
|
||||
* signature)
|
||||
*/
|
||||
|
||||
typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
|
||||
|
||||
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
|
||||
int32_t handleSafePrevious(int32_t fromPosition);
|
||||
|
||||
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
|
||||
int32_t handleNext();
|
||||
|
||||
|
||||
/**
|
||||
* This function returns the appropriate LanguageBreakEngine for a
|
||||
@ -682,6 +706,16 @@ private:
|
||||
*/
|
||||
void dumpTables();
|
||||
|
||||
/**
|
||||
* Bit for dictionary based category
|
||||
*/
|
||||
static constexpr int32_t kDictBit = 0x4000;
|
||||
|
||||
/**
|
||||
* Bit for dictionary based category in 8bits trie
|
||||
*/
|
||||
static constexpr int32_t kDictBitFor8BitsTrie = 0x0080;
|
||||
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
};
|
||||
|
||||
|
@ -1030,7 +1030,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
||||
parseError.offset = 0;
|
||||
LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status));
|
||||
uint32_t length;
|
||||
const UChar *builtSource;
|
||||
const char *builtSource;
|
||||
const uint8_t *rbbiRules;
|
||||
const uint8_t *builtRules;
|
||||
|
||||
@ -1040,7 +1040,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
||||
}
|
||||
|
||||
builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
|
||||
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
|
||||
builtSource = (const char *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
|
||||
LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
|
||||
if (U_FAILURE(status)) {
|
||||
errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
|
||||
|
@ -128,6 +128,11 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
TESTCASE_AUTO(TestReverse);
|
||||
TESTCASE_AUTO(TestBug13692);
|
||||
TESTCASE_AUTO(TestDebugRules);
|
||||
TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
|
||||
TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
|
||||
TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
|
||||
TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
|
||||
TESTCASE_AUTO(TestTable_8_16_Bits);
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
TESTCASE_AUTO(TestTraceCreateCharacter);
|
||||
@ -4621,7 +4626,7 @@ void RBBITest::TestBug12677() {
|
||||
RuleBasedBreakIterator bi(rules, pe, status);
|
||||
assertSuccess(WHERE, status);
|
||||
UnicodeString rtRules = bi.getRules();
|
||||
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
|
||||
assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
|
||||
}
|
||||
|
||||
|
||||
@ -4635,6 +4640,7 @@ void RBBITest::TestTableRedundancies() {
|
||||
|
||||
RBBIDataWrapper *dw = bi->fData;
|
||||
const RBBIStateTable *fwtbl = dw->fForwardTable;
|
||||
UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
|
||||
int32_t numCharClasses = dw->fHeader->fCatCount;
|
||||
// printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
|
||||
|
||||
@ -4645,7 +4651,7 @@ void RBBITest::TestTableRedundancies() {
|
||||
UnicodeString s;
|
||||
for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
|
||||
s.append(row->fNextState[column]);
|
||||
s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
|
||||
}
|
||||
columns.push_back(s);
|
||||
}
|
||||
@ -4665,12 +4671,22 @@ void RBBITest::TestTableRedundancies() {
|
||||
for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
|
||||
UnicodeString s;
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
|
||||
assertTrue(WHERE, row->fAccepting >= -1);
|
||||
s.append(row->fAccepting + 1); // values of -1 are expected.
|
||||
s.append(row->fLookAhead);
|
||||
s.append(row->fTagIdx);
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
s.append(row->fNextState[column]);
|
||||
if (in8Bits) {
|
||||
assertTrue(WHERE, row->r8.fAccepting >= -1);
|
||||
s.append(row->r8.fAccepting + 1); // values of -1 are expected.
|
||||
s.append(row->r8.fLookAhead);
|
||||
s.append(row->r8.fTagIdx);
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
s.append(row->r8.fNextState[column]);
|
||||
}
|
||||
} else {
|
||||
assertTrue(WHERE, row->r16.fAccepting >= -1);
|
||||
s.append(row->r16.fAccepting + 1); // values of -1 are expected.
|
||||
s.append(row->r16.fLookAhead);
|
||||
s.append(row->r16.fTagIdx);
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
s.append(row->r16.fNextState[column]);
|
||||
}
|
||||
}
|
||||
rows.push_back(s);
|
||||
}
|
||||
@ -4743,12 +4759,14 @@ void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
|
||||
|
||||
RBBIDataWrapper *data = bi->fData;
|
||||
int32_t categoryCount = data->fHeader->fCatCount;
|
||||
UTrie2 *trie = data->fTrie;
|
||||
UCPTrie *trie = data->fTrie;
|
||||
bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
|
||||
uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
|
||||
|
||||
std::vector<UnicodeString> strings(categoryCount, UnicodeString());
|
||||
for (int cp=0; cp<0x1fff0; ++cp) {
|
||||
int cat = utrie2_get32(trie, cp);
|
||||
cat &= ~0x4000; // And off the dictionary bit from the category.
|
||||
int cat = ucptrie_get(trie, cp);
|
||||
cat &= ~dictBit; // And off the dictionary bit from the category.
|
||||
assertTrue(WHERE, cat < categoryCount && cat >= 0);
|
||||
if (cat < 0 || cat >= categoryCount) return;
|
||||
strings[cat].append(cp);
|
||||
@ -4886,6 +4904,182 @@ void RBBITest::TestDebugRules() {
|
||||
#endif
|
||||
}
|
||||
|
||||
void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
|
||||
UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
|
||||
int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
|
||||
// Text are duplicate characters from U+4E00 to U+4FFF
|
||||
UnicodeString text;
|
||||
for (UChar c = 0x4e00; c < 0x5000; c++) {
|
||||
text.append(c).append(c);
|
||||
}
|
||||
// Generate rule which will caused length+4 character classes and
|
||||
// length+3 states
|
||||
UnicodeString rules(u"!!quoted_literals_only;");
|
||||
for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
|
||||
rules.append(u'\'').append(c).append(c).append(u"';");
|
||||
}
|
||||
rules.append(u".;");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
RuleBasedBreakIterator bi(rules, parseError, status);
|
||||
|
||||
assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
|
||||
assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
|
||||
assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
|
||||
assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
|
||||
assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
|
||||
|
||||
bi.setText(text);
|
||||
|
||||
int32_t pos;
|
||||
int32_t i = 0;
|
||||
while ((pos = bi.next()) > 0) {
|
||||
// The first numChar should not break between the pair
|
||||
if (i++ < numChar) {
|
||||
assertEquals(WHERE, i * 2, pos);
|
||||
} else {
|
||||
// After the first numChar next(), break on each character.
|
||||
assertEquals(WHERE, i + numChar, pos);
|
||||
}
|
||||
}
|
||||
while ((pos = bi.previous()) > 0) {
|
||||
// The first numChar should not break between the pair
|
||||
if (--i < numChar) {
|
||||
assertEquals(WHERE, i * 2, pos);
|
||||
} else {
|
||||
// After the first numChar next(), break on each character.
|
||||
assertEquals(WHERE, i + numChar, pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RBBITest::Test8BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
void RBBITest::Test16BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
void RBBITest::Test16BitsTrieWith16BitStateTable() {
|
||||
testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
void RBBITest::Test8BitsTrieWith16BitStateTable() {
|
||||
// Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
|
||||
// create state table in 16 bits.
|
||||
|
||||
// Generate 510 'a' as text
|
||||
UnicodeString text;
|
||||
for (int32_t i = 0; i < 510; i++) {
|
||||
text.append(u'a');
|
||||
}
|
||||
|
||||
UnicodeString rules(u"!!quoted_literals_only;'");
|
||||
// 254 'a' in the rule will cause 256 states
|
||||
for (int32_t i = 0; i < 254; i++) {
|
||||
rules.append(u'a');
|
||||
}
|
||||
rules.append(u"';.;");
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
|
||||
|
||||
assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
|
||||
assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
|
||||
assertEquals(WHERE,
|
||||
false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
|
||||
bi->setText(text);
|
||||
|
||||
// break positions:
|
||||
// 254, 508, 509, ... 510
|
||||
assertEquals("next()", 254, bi->next());
|
||||
int32_t i = 0;
|
||||
int32_t pos;
|
||||
while ((pos = bi->next()) > 0) {
|
||||
assertEquals(WHERE, 508 + i , pos);
|
||||
i++;
|
||||
}
|
||||
i = 0;
|
||||
while ((pos = bi->previous()) > 0) {
|
||||
i++;
|
||||
if (pos >= 508) {
|
||||
assertEquals(WHERE, 510 - i , pos);
|
||||
} else {
|
||||
assertEquals(WHERE, 254 , pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
|
||||
// that there are no problems with rules at the size that transitions between the two.
|
||||
//
|
||||
// A rule that matches a literal string, like 'abcdefghij', will require one state and
|
||||
// one character class per character in the string. So we can make a rule to tickle the
|
||||
// boundaries by using literal strings of various lengths.
|
||||
//
|
||||
// For both the number of states and the number of character classes, the eight bit format
|
||||
// only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
|
||||
// leaving 120 something available. This test runs the string over the range of 120 - 130,
|
||||
// which allows some margin for changes to the number of values reserved by the rule builder
|
||||
// without breaking the test.
|
||||
|
||||
void RBBITest::TestTable_8_16_Bits() {
|
||||
|
||||
// testStr serves as both the source of the rule string (truncated to the desired length)
|
||||
// and as test data to check matching behavior. A break rule consisting of the first 120
|
||||
// characters of testStr will match the first 120 chars of the full-length testStr.
|
||||
UnicodeString testStr;
|
||||
for (UChar c=0x3000; c<0x3200; ++c) {
|
||||
testStr.append(c);
|
||||
}
|
||||
|
||||
const int32_t startLength = 120; // The shortest rule string to test.
|
||||
const int32_t endLength = 260; // The longest rule string to test
|
||||
const int32_t increment = this->quick ? endLength - startLength : 1;
|
||||
|
||||
for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
|
||||
UParseError parseError;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
|
||||
ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
|
||||
RuleBasedBreakIterator bi(ruleString, parseError, status);
|
||||
if (!assertSuccess(WHERE, status)) {
|
||||
errln(ruleString);
|
||||
break;
|
||||
}
|
||||
// bi.dumpTables();
|
||||
|
||||
// Verify that the break iterator is functioning - that the first boundary found
|
||||
// in testStr is at the length of the rule string.
|
||||
bi.setText(testStr);
|
||||
assertEquals(WHERE, ruleLen, bi.next());
|
||||
|
||||
// Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
|
||||
// of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
|
||||
bi.setText(testStr);
|
||||
int32_t result = bi.preceding(ruleLen);
|
||||
assertEquals(WHERE, 0, result);
|
||||
|
||||
// Verify that the range of rule lengths being tested cover the transations
|
||||
// from 8 to 16 bit data.
|
||||
bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
|
||||
bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
|
||||
|
||||
if (ruleLen == startLength) {
|
||||
assertEquals(WHERE, true, has8BitRowData);
|
||||
assertEquals(WHERE, true, has8BitsTrie);
|
||||
}
|
||||
if (ruleLen == endLength) {
|
||||
assertEquals(WHERE, false, has8BitRowData);
|
||||
assertEquals(WHERE, false, has8BitsTrie);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
static std::vector<std::string> gData;
|
||||
static std::vector<int32_t> gEntryFn;
|
||||
|
@ -86,6 +86,11 @@ public:
|
||||
|
||||
void TestDebug();
|
||||
void TestProperties();
|
||||
void Test8BitsTrieWith8BitStateTable();
|
||||
void Test8BitsTrieWith16BitStateTable();
|
||||
void Test16BitsTrieWith8BitStateTable();
|
||||
void Test16BitsTrieWith16BitStateTable();
|
||||
void TestTable_8_16_Bits();
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
void TestTraceCreateCharacter();
|
||||
@ -133,6 +138,9 @@ private:
|
||||
// Test parameters, from the test framework and test invocation.
|
||||
const char* fTestParams;
|
||||
|
||||
// Helper functions to test different trie bit sizes and state table bit sizes.
|
||||
void testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits);
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
void assertTestTraceResult(int32_t fnNumber, const char* expectedData);
|
||||
#endif
|
||||
|
@ -12,10 +12,12 @@ package com.ibm.icu.impl;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
||||
import com.ibm.icu.impl.ICUBinary.Authenticate;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
|
||||
/**
|
||||
* <p>Internal class used for Rule Based Break Iterators.</p>
|
||||
@ -43,14 +45,10 @@ public final class RBBIDataWrapper {
|
||||
* Option Flags for this state table.
|
||||
*/
|
||||
public int fFlags;
|
||||
/**
|
||||
* Option Flags for this state table.
|
||||
*/
|
||||
public int fReserved;
|
||||
/**
|
||||
* Linear array of next state values, accessed as short[state, char_class]
|
||||
*/
|
||||
public short[] fTable;
|
||||
public char[] fTable;
|
||||
|
||||
public RBBIStateTable() {
|
||||
}
|
||||
@ -59,16 +57,29 @@ public final class RBBIDataWrapper {
|
||||
if (length == 0) {
|
||||
return null;
|
||||
}
|
||||
if (length < 16) {
|
||||
if (length < 12) {
|
||||
throw new IOException("Invalid RBBI state table length.");
|
||||
}
|
||||
RBBIStateTable This = new RBBIStateTable();
|
||||
This.fNumStates = bytes.getInt();
|
||||
This.fRowLen = bytes.getInt();
|
||||
This.fFlags = bytes.getInt();
|
||||
This.fReserved = bytes.getInt();
|
||||
int lengthOfShorts = length - 16; // length in bytes.
|
||||
This.fTable = ICUBinary.getShorts(bytes, lengthOfShorts / 2, lengthOfShorts & 1);
|
||||
int lengthOfTable = length - 12; // length in bytes.
|
||||
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||||
if (use8Bits) {
|
||||
This.fTable = new char[lengthOfTable];
|
||||
for (int i = 0; i < lengthOfTable; i++) {
|
||||
byte b = bytes.get();
|
||||
if (i % This.fRowLen < NEXTSTATES) {
|
||||
This.fTable[i] = (char) b; // Treat b as signed.
|
||||
} else {
|
||||
This.fTable[i] = (char)(0xff & b); // Treat b as unsigned.
|
||||
}
|
||||
}
|
||||
ICUBinary.skipBytes(bytes, lengthOfTable & 1);
|
||||
} else {
|
||||
This.fTable = ICUBinary.getChars(bytes, lengthOfTable / 2, lengthOfTable & 1);
|
||||
}
|
||||
return This;
|
||||
}
|
||||
|
||||
@ -76,13 +87,20 @@ public final class RBBIDataWrapper {
|
||||
bytes.writeInt(fNumStates);
|
||||
bytes.writeInt(fRowLen);
|
||||
bytes.writeInt(fFlags);
|
||||
bytes.writeInt(fReserved);
|
||||
int tableLen = fRowLen * fNumStates / 2; // fRowLen is bytes.
|
||||
for (int i = 0; i < tableLen; i++) {
|
||||
bytes.writeShort(fTable[i]);
|
||||
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
|
||||
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
|
||||
for (int i = 0; i < tableLen; i++) {
|
||||
byte b = (byte)(fTable[i] & 0x00ff);
|
||||
bytes.writeByte(b);
|
||||
}
|
||||
} else {
|
||||
int tableLen = fRowLen * fNumStates / 2; // fRowLen is bytes.
|
||||
for (int i = 0; i < tableLen; i++) {
|
||||
bytes.writeChar(fTable[i]);
|
||||
}
|
||||
}
|
||||
int bytesWritten = 16 + fRowLen * fNumStates; // total bytes written,
|
||||
// including 16 for the header.
|
||||
int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written,
|
||||
// including 12 for the header.
|
||||
while (bytesWritten % 8 != 0) {
|
||||
bytes.writeByte(0);
|
||||
++bytesWritten;
|
||||
@ -105,7 +123,6 @@ public final class RBBIDataWrapper {
|
||||
if (fNumStates != otherST.fNumStates) return false;
|
||||
if (fRowLen != otherST.fRowLen) return false;
|
||||
if (fFlags != otherST.fFlags) return false;
|
||||
if (fReserved != otherST.fReserved) return false;
|
||||
return Arrays.equals(fTable, otherST.fTable);
|
||||
}
|
||||
}
|
||||
@ -134,12 +151,12 @@ public final class RBBIDataWrapper {
|
||||
|
||||
public RBBIStateTable fRTable;
|
||||
|
||||
public Trie2 fTrie;
|
||||
public CodePointTrie fTrie;
|
||||
public String fRuleSource;
|
||||
public int fStatusTable[];
|
||||
|
||||
public static final int DATA_FORMAT = 0x42726b20; // "Brk "
|
||||
public static final int FORMAT_VERSION = 0x05000000; // 4.0.0.0
|
||||
public static final int FORMAT_VERSION = 0x06000000; // 6.0.0.0
|
||||
|
||||
private static final class IsAcceptable implements Authenticate {
|
||||
@Override
|
||||
@ -186,20 +203,20 @@ public final class RBBIDataWrapper {
|
||||
* offset to the "tagIndex" field in a state table row.
|
||||
*/
|
||||
public final static int TAGIDX = 2;
|
||||
/**
|
||||
* offset to the reserved field in a state table row.
|
||||
*/
|
||||
public final static int RESERVED = 3;
|
||||
/**
|
||||
* offset to the start of the next states array in a state table row.
|
||||
*/
|
||||
public final static int NEXTSTATES = 4;
|
||||
public final static int NEXTSTATES = 3;
|
||||
|
||||
// Bit selectors for the "FLAGS" field of the state table header
|
||||
// enum RBBIStateTableFlags in the C version.
|
||||
//
|
||||
public final static int RBBI_LOOKAHEAD_HARD_BREAK = 1;
|
||||
public final static int RBBI_BOF_REQUIRED = 2;
|
||||
public final static int RBBI_8BITS_ROWS = 4;
|
||||
|
||||
public final static int DICT_BIT = 0x4000;
|
||||
public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
|
||||
|
||||
/**
|
||||
* Data Header. A struct-like class with the fields from the RBBI data file header.
|
||||
@ -243,7 +260,7 @@ public final class RBBIDataWrapper {
|
||||
* array index of the start of the state table row for that state.
|
||||
*/
|
||||
public int getRowIndex(int state){
|
||||
return state * (fHeader.fCatCount + 4);
|
||||
return state * (fHeader.fCatCount + NEXTSTATES);
|
||||
}
|
||||
|
||||
RBBIDataWrapper() {
|
||||
@ -330,7 +347,10 @@ public final class RBBIDataWrapper {
|
||||
// as we don't go more than 100 bytes past the
|
||||
// past the end of the TRIE.
|
||||
|
||||
This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer
|
||||
This.fTrie = CodePointTrie.fromBinary(
|
||||
CodePointTrie.Type.FAST,
|
||||
null,
|
||||
bytes); // Deserialize the TRIE, leaving buffer
|
||||
// at an unknown position, preceding the
|
||||
// padding between TRIE and following section.
|
||||
|
||||
@ -359,8 +379,8 @@ public final class RBBIDataWrapper {
|
||||
}
|
||||
ICUBinary.skipBytes(bytes, This.fHeader.fRuleSource - pos);
|
||||
pos = This.fHeader.fRuleSource;
|
||||
This.fRuleSource = ICUBinary.getString(
|
||||
bytes, This.fHeader.fRuleSourceLen / 2, This.fHeader.fRuleSourceLen & 1);
|
||||
This.fRuleSource = new String(
|
||||
ICUBinary.getBytes(bytes, This.fHeader.fRuleSourceLen, 0), StandardCharsets.UTF_8);
|
||||
|
||||
if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
|
||||
This.dump(System.out);
|
||||
@ -396,6 +416,15 @@ public final class RBBIDataWrapper {
|
||||
return dest.toString();
|
||||
}
|
||||
|
||||
static public String charToString(char n, int width) {
|
||||
StringBuilder dest = new StringBuilder(width);
|
||||
dest.append(n);
|
||||
while (dest.length() < width) {
|
||||
dest.insert(0, ' ');
|
||||
}
|
||||
return dest.toString();
|
||||
}
|
||||
|
||||
/** Fixed width int-to-string conversion. */
|
||||
static public String intToHexString(int n, int width) {
|
||||
StringBuilder dest = new StringBuilder(width);
|
||||
@ -408,11 +437,11 @@ public final class RBBIDataWrapper {
|
||||
|
||||
/** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
|
||||
private void dumpTable(java.io.PrintStream out, RBBIStateTable table) {
|
||||
if (table == null || table.fTable.length == 0) {
|
||||
if (table == null || (table.fTable.length == 0)) {
|
||||
out.println(" -- null -- ");
|
||||
} else {
|
||||
int n;
|
||||
int state;
|
||||
char n;
|
||||
char state;
|
||||
StringBuilder header = new StringBuilder(" Row Acc Look Tag");
|
||||
for (n=0; n<fHeader.fCatCount; n++) {
|
||||
header.append(intToString(n, 5));
|
||||
@ -434,18 +463,18 @@ public final class RBBIDataWrapper {
|
||||
* @param table
|
||||
* @param state
|
||||
*/
|
||||
private void dumpRow(java.io.PrintStream out, RBBIStateTable table, int state) {
|
||||
private void dumpRow(java.io.PrintStream out, RBBIStateTable table, char state) {
|
||||
StringBuilder dest = new StringBuilder(fHeader.fCatCount*5 + 20);
|
||||
dest.append(intToString(state, 4));
|
||||
int row = getRowIndex(state);
|
||||
if (table.fTable[row+ACCEPTING] != 0) {
|
||||
dest.append(intToString(table.fTable[row+ACCEPTING], 5));
|
||||
}else {
|
||||
dest.append(intToString(table.fTable[row+ACCEPTING], 5));
|
||||
} else {
|
||||
dest.append(" ");
|
||||
}
|
||||
if (table.fTable[row+LOOKAHEAD] != 0) {
|
||||
dest.append(intToString(table.fTable[row+LOOKAHEAD], 5));
|
||||
}else {
|
||||
} else {
|
||||
dest.append(" ");
|
||||
}
|
||||
dest.append(intToString(table.fTable[row+TAGIDX], 5));
|
||||
@ -466,6 +495,7 @@ public final class RBBIDataWrapper {
|
||||
int char32;
|
||||
int category;
|
||||
int lastNewline[] = new int[n+1];
|
||||
int dictMask = fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT;
|
||||
|
||||
for (category = 0; category <= fHeader.fCatCount; category ++) {
|
||||
catStrings[category] = "";
|
||||
@ -474,7 +504,7 @@ public final class RBBIDataWrapper {
|
||||
out.println("--------------------");
|
||||
for (char32 = 0; char32<=0x10ffff; char32++) {
|
||||
category = fTrie.get(char32);
|
||||
category &= ~0x4000; // Mask off dictionary bit.
|
||||
category &= ~dictMask; // Mask off dictionary bit.
|
||||
if (category < 0 || category > fHeader.fCatCount) {
|
||||
out.println("Error, bad category " + Integer.toHexString(category) +
|
||||
" for char " + Integer.toHexString(char32));
|
||||
|
@ -11,6 +11,7 @@ package com.ibm.icu.text;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -182,7 +183,9 @@ class RBBIRuleBuilder {
|
||||
int reverseTableSize = align8(fForwardTable.getSafeTableSize());
|
||||
int trieSize = align8(fSetBuilder.getTrieSize());
|
||||
int statusTableSize = align8(fRuleStatusVals.size() * 4);
|
||||
int rulesSize = align8((strippedRules.length()) * 2);
|
||||
|
||||
byte[] strippedRulesUTF8 = strippedRules.getBytes(StandardCharsets.UTF_8);
|
||||
int rulesSize = align8(strippedRulesUTF8.length + 1);
|
||||
|
||||
int totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
@ -202,7 +205,7 @@ class RBBIRuleBuilder {
|
||||
header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
|
||||
header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
|
||||
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
|
||||
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
|
||||
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories();
|
||||
|
||||
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
|
||||
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
|
||||
@ -214,11 +217,11 @@ class RBBIRuleBuilder {
|
||||
+ header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
|
||||
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
|
||||
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
|
||||
+ header[RBBIDataWrapper.DH_TRIELEN];
|
||||
+ trieSize;
|
||||
header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
|
||||
header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE]
|
||||
+ statusTableSize;
|
||||
header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2;
|
||||
header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRulesUTF8.length;
|
||||
for (i = 0; i < header.length; i++) {
|
||||
dos.writeInt(header[i]);
|
||||
outputPos += 4;
|
||||
@ -257,8 +260,9 @@ class RBBIRuleBuilder {
|
||||
// Write out the stripped rules (rules with extra spaces removed
|
||||
// These go last in the data area, even though they are not last in the header.
|
||||
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
|
||||
dos.writeChars(strippedRules);
|
||||
outputPos += strippedRules.length() * 2;
|
||||
dos.write(strippedRulesUTF8, 0, strippedRulesUTF8.length);
|
||||
dos.write(0); // Null termination
|
||||
outputPos += strippedRulesUTF8.length + 1;
|
||||
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
|
||||
dos.write(0);
|
||||
outputPos += 1;
|
||||
|
@ -697,16 +697,14 @@ class RBBIRuleScanner {
|
||||
static String stripRules(String rules) {
|
||||
StringBuilder strippedRules = new StringBuilder();
|
||||
int rulesLength = rules.length();
|
||||
boolean skippingSpaces = false;
|
||||
|
||||
for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
|
||||
int cp = rules.codePointAt(idx);
|
||||
boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
|
||||
if (skippingSpaces && whiteSpace) {
|
||||
if (whiteSpace) {
|
||||
continue;
|
||||
}
|
||||
strippedRules.appendCodePoint(cp);
|
||||
skippingSpaces = whiteSpace;
|
||||
}
|
||||
return strippedRules.toString();
|
||||
}
|
||||
|
@ -8,15 +8,16 @@
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.Trie2Writable;
|
||||
import com.ibm.icu.impl.Trie2_16;
|
||||
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
import com.ibm.icu.util.MutableCodePointTrie;
|
||||
|
||||
//
|
||||
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
|
||||
@ -125,9 +126,9 @@ class RBBISetBuilder {
|
||||
RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
|
||||
RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
Trie2Writable fTrie; // The mapping TRIE that is the end result of processing
|
||||
MutableCodePointTrie fTrie; // The mapping TRIE that is the end result of processing
|
||||
// the Unicode Sets.
|
||||
Trie2_16 fFrozenTrie;
|
||||
CodePointTrie fFrozenTrie;
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
@ -140,6 +141,7 @@ class RBBISetBuilder {
|
||||
boolean fSawBOF;
|
||||
|
||||
static final int DICT_BIT = 0x4000;
|
||||
static final int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
@ -286,22 +288,30 @@ class RBBISetBuilder {
|
||||
}
|
||||
|
||||
|
||||
private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127;
|
||||
|
||||
/**
|
||||
* Build the Trie table for mapping UChar32 values to the corresponding
|
||||
* range group number.
|
||||
*/
|
||||
void buildTrie() {
|
||||
boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
|
||||
RangeDescriptor rlRange;
|
||||
|
||||
fTrie = new Trie2Writable(0, // Initial value for all code points.
|
||||
0); // Error value for out-of-range input.
|
||||
fTrie = new MutableCodePointTrie(0, // Initial value for all code points.
|
||||
0); // Error value for out-of-range input.
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
int value = rlRange.fNum;
|
||||
if (use8Bits && ((value & DICT_BIT) != 0)) {
|
||||
assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0);
|
||||
// switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE
|
||||
value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT);
|
||||
}
|
||||
fTrie.setRange(
|
||||
rlRange.fStartChar, // Range start
|
||||
rlRange.fEndChar, // Range end (inclusive)
|
||||
rlRange.fNum, // value for range
|
||||
true // Overwrite previously written values
|
||||
value // value for range
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -326,17 +336,31 @@ class RBBISetBuilder {
|
||||
--fGroupCount;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// freezeTrieIfNotYet() Ensure the trie is frozen. Shared code by getTrieSize
|
||||
// and serializeTrie.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void freezeTrieIfNotYet() {
|
||||
if (fFrozenTrie == null) {
|
||||
boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
|
||||
fFrozenTrie = fTrie.buildImmutable(CodePointTrie.Type.FAST,
|
||||
use8Bits ?
|
||||
CodePointTrie.ValueWidth.BITS_8 :
|
||||
CodePointTrie.ValueWidth.BITS_16);
|
||||
fTrie = null;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int getTrieSize() {
|
||||
if (fFrozenTrie == null) {
|
||||
fFrozenTrie = fTrie.toTrie2_16();
|
||||
fTrie = null;
|
||||
}
|
||||
return fFrozenTrie.getSerializedLength();
|
||||
freezeTrieIfNotYet();
|
||||
return fFrozenTrie.toBinary(new ByteArrayOutputStream());
|
||||
}
|
||||
|
||||
|
||||
@ -346,11 +370,8 @@ class RBBISetBuilder {
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void serializeTrie(OutputStream os) throws IOException {
|
||||
if (fFrozenTrie == null) {
|
||||
fFrozenTrie = fTrie.toTrie2_16();
|
||||
fTrie = null;
|
||||
}
|
||||
fFrozenTrie.serialize(os);
|
||||
freezeTrieIfNotYet();
|
||||
fFrozenTrie.toBinary(os);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
|
@ -74,6 +74,8 @@ class RBBITableBuilder {
|
||||
/** Synthesized safe table, a List of row arrays. */
|
||||
private List<short[]> fSafeTable;
|
||||
|
||||
private static final int MAX_STATE_FOR_8BITS_TABLE = 255;
|
||||
|
||||
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
|
||||
int[] fLookAheadRuleMap;
|
||||
|
||||
@ -1097,10 +1099,11 @@ class RBBITableBuilder {
|
||||
if (fRB.fTreeRoots[fRootIx] == null) {
|
||||
return 0;
|
||||
}
|
||||
int size = 16; // The header of 4 ints, with no rows to the table.
|
||||
int size = 12; // The header of 4 ints, with no rows to the table.
|
||||
int numRows = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
int rowSize = 8 + 2*numCols;
|
||||
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
|
||||
size += numRows * rowSize;
|
||||
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
|
||||
return size;
|
||||
@ -1125,13 +1128,20 @@ class RBBITableBuilder {
|
||||
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
|
||||
fDStates.size() < 0x7fff);
|
||||
table.fNumStates = fDStates.size();
|
||||
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
|
||||
// Size of table size in shorts.
|
||||
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
|
||||
int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
|
||||
int tableSize = (getTableSize() - 16) / 2; // fTable length in shorts.
|
||||
table.fTable = new short[tableSize];
|
||||
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||||
int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
|
||||
int tableSize;
|
||||
if (use8Bits) {
|
||||
tableSize = (getTableSize() - 12); // fTable length in bytes.
|
||||
table.fTable = new char[tableSize];
|
||||
table.fRowLen = rowLen; // Row length in bytes.
|
||||
} else {
|
||||
tableSize = (getTableSize() - 12) / 2; // fTable length in shorts.
|
||||
table.fTable = new char[tableSize];
|
||||
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||||
}
|
||||
|
||||
if (fRB.fLookAheadHardBreak) {
|
||||
table.fFlags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
|
||||
@ -1139,18 +1149,29 @@ class RBBITableBuilder {
|
||||
if (fRB.fSetBuilder.sawBOF()) {
|
||||
table.fFlags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
|
||||
}
|
||||
if (use8Bits) {
|
||||
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||||
}
|
||||
|
||||
int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
|
||||
for (state=0; state<table.fNumStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
int row = state*rowLen;
|
||||
Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
|
||||
Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
|
||||
table.fTable[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
|
||||
table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
|
||||
table.fTable[row + RBBIDataWrapper.TAGIDX] = (short)sd.fTagsIdx;
|
||||
if (use8Bits) {
|
||||
Assert.assrt (-128 < sd.fAccepting && sd.fAccepting <= MAX_STATE_FOR_8BITS_TABLE);
|
||||
Assert.assrt (-128 < sd.fLookAhead && sd.fLookAhead <= MAX_STATE_FOR_8BITS_TABLE);
|
||||
} else {
|
||||
Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
|
||||
Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
|
||||
}
|
||||
table.fTable[row + RBBIDataWrapper.ACCEPTING] = (char)sd.fAccepting;
|
||||
table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (char)sd.fLookAhead;
|
||||
table.fTable[row + RBBIDataWrapper.TAGIDX] = (char)sd.fTagsIdx;
|
||||
for (col=0; col<numCharCategories; col++) {
|
||||
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
|
||||
if (use8Bits) {
|
||||
Assert.assrt (sd.fDtran[col] <= MAX_STATE_FOR_8BITS_TABLE);
|
||||
}
|
||||
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)sd.fDtran[col];
|
||||
}
|
||||
}
|
||||
return table;
|
||||
@ -1250,10 +1271,12 @@ class RBBITableBuilder {
|
||||
if (fSafeTable == null) {
|
||||
return 0;
|
||||
}
|
||||
int size = 16; // The header of 4 ints, with no rows to the table.
|
||||
int size = 12; // The header of 4 ints, with no rows to the table.
|
||||
int numRows = fSafeTable.size();
|
||||
int numCols = fSafeTable.get(0).length;
|
||||
int rowSize = 8 + 2*numCols;
|
||||
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
|
||||
int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
|
||||
size += numRows * rowSize;
|
||||
// TODO: there are redundant round-up. Figure out best place, get rid of the rest.
|
||||
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
|
||||
@ -1269,23 +1292,33 @@ class RBBITableBuilder {
|
||||
RBBIDataWrapper.RBBIStateTable exportSafeTable() {
|
||||
RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
|
||||
table.fNumStates = fSafeTable.size();
|
||||
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
int numCharCategories = fSafeTable.get(0).length;
|
||||
|
||||
// Size of table size in shorts.
|
||||
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
|
||||
int rowLen = 4 + numCharCategories;
|
||||
int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
|
||||
// TODO: tableSize is basically numStates * numCharCategories,
|
||||
// except for alignment padding. Clean up here, and in main exportTable().
|
||||
int tableSize = (getSafeTableSize() - 16) / 2; // fTable length in shorts.
|
||||
table.fTable = new short[tableSize];
|
||||
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||||
int tableSize = (getSafeTableSize() - 12); // fTable length in bytes.
|
||||
if (use8Bits) {
|
||||
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||||
table.fTable = new char[tableSize];
|
||||
table.fRowLen = rowLen; // Row length in bytes.
|
||||
} else {
|
||||
tableSize /= 2; // fTable length in shorts.
|
||||
table.fTable = new char[tableSize];
|
||||
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||||
}
|
||||
|
||||
for (int state=0; state<table.fNumStates; state++) {
|
||||
short[] rowArray = fSafeTable.get(state);
|
||||
int row = state * rowLen;
|
||||
|
||||
for (int col=0; col<numCharCategories; col++) {
|
||||
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = rowArray[col];
|
||||
if (use8Bits) {
|
||||
Assert.assrt (rowArray[col] <= MAX_STATE_FOR_8BITS_TABLE);
|
||||
}
|
||||
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)rowArray[col];
|
||||
}
|
||||
}
|
||||
return table;
|
||||
|
@ -26,10 +26,10 @@ import com.ibm.icu.impl.CharacterIteration;
|
||||
import com.ibm.icu.impl.ICUBinary;
|
||||
import com.ibm.icu.impl.ICUDebug;
|
||||
import com.ibm.icu.impl.RBBIDataWrapper;
|
||||
import com.ibm.icu.impl.Trie2;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
|
||||
/**
|
||||
* Rule Based Break Iterator
|
||||
@ -821,9 +821,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
|
||||
// caches for quicker access
|
||||
CharacterIterator text = fText;
|
||||
Trie2 trie = fRData.fTrie;
|
||||
CodePointTrie trie = fRData.fTrie;
|
||||
|
||||
short[] stateTable = fRData.fFTable.fTable;
|
||||
char[] stateTable = fRData.fFTable.fTable;
|
||||
int initialPosition = fPosition;
|
||||
text.setIndex(initialPosition);
|
||||
int result = initialPosition;
|
||||
@ -844,6 +844,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
short category = 3;
|
||||
int flagsState = fRData.fFTable.fFlags;
|
||||
int mode = RBBI_RUN;
|
||||
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
|
||||
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
|
||||
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
|
||||
category = 2;
|
||||
mode = RBBI_START;
|
||||
@ -885,10 +887,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
if ((category & dictMask) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
category &= ~dictMask;
|
||||
}
|
||||
|
||||
if (TRACE) {
|
||||
@ -912,8 +914,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
// look up a state transition in the state table
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0xffff) {
|
||||
// Match found, common case
|
||||
result = text.getIndex();
|
||||
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
@ -927,7 +928,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
}
|
||||
|
||||
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||||
if (completedRule > 0) {
|
||||
if (completedRule > 0 && completedRule != 0xffff) {
|
||||
// Lookahead match is completed
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
@ -937,13 +938,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// If we are at the position of the '/' in a look-ahead (hard break) rule;
|
||||
// record the current position, to be returned later, if the full rule matches.
|
||||
// TODO: Move this check before the previous check of fAccepting.
|
||||
// This would enable hard-break rules with no following context.
|
||||
// But there are line break test failures when trying this. Investigate.
|
||||
// Issue ICU-20837
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
int pos = text.getIndex();
|
||||
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
@ -996,14 +998,17 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
* @internal
|
||||
*/
|
||||
private int handleSafePrevious(int fromPosition) {
|
||||
int state;
|
||||
char state;
|
||||
short category = 0;
|
||||
int result = 0;
|
||||
|
||||
// caches for quicker access
|
||||
CharacterIterator text = fText;
|
||||
Trie2 trie = fRData.fTrie;
|
||||
short[] stateTable = fRData.fRTable.fTable;
|
||||
CodePointTrie trie = fRData.fTrie;
|
||||
char[] stateTable = fRData.fRTable.fTable;
|
||||
int flagsState = fRData.fRTable.fFlags;
|
||||
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
|
||||
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
|
||||
|
||||
CISetIndex32(text, fromPosition);
|
||||
if (TRACE) {
|
||||
@ -1029,7 +1034,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = (short) trie.get(c);
|
||||
category &= ~0x4000;
|
||||
category &= ~dictMask;
|
||||
if (TRACE) {
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||||
@ -1209,6 +1214,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
int category;
|
||||
int current;
|
||||
int foundBreakCount = 0;
|
||||
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
|
||||
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
|
||||
|
||||
// Loop through the text, looking for ranges of dictionary characters.
|
||||
// For each span, find the appropriate break engine, and ask it to find
|
||||
@ -1219,7 +1226,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
|
||||
while(true) {
|
||||
while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
|
||||
while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) {
|
||||
c = CharacterIteration.next32(fText); // pre-increment
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:0f8b27a3e77ffbe4468ede22ae8a4ca85df993472d6bdea432505a148ff33f23
|
||||
size 13149606
|
||||
oid sha256:8ed7db50765b06c8a35f48048543c5c9a2c2e19993f752bd71a15e6ac89aa3b3
|
||||
size 13141781
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:58c1ec5386cba3b6660c3bf8c22ce74c343d3101354f157533d13ec1099e1379
|
||||
size 94524
|
||||
oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9
|
||||
size 94529
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c39bb717c3e95c47c14f49507e8a7866e89bdb3588f021693c3909dda64f4dcb
|
||||
size 723466
|
||||
oid sha256:e032f823e0ba2fd99f784fe400675049c126e091158a285955c71aa5e2c6036b
|
||||
size 723481
|
||||
|
@ -29,8 +29,10 @@ import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.RBBIDataWrapper;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class RBBITest extends TestFmwk {
|
||||
public RBBITest() {
|
||||
@ -562,7 +564,7 @@ public class RBBITest extends TestFmwk {
|
||||
|
||||
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
|
||||
String rtRules = bi.toString(); // getRules() in C++
|
||||
assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
|
||||
assertEquals("Break Iterator rule stripping test", "!!forward;$x=[ab#];'#''?';", rtRules);
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -582,7 +584,7 @@ public class RBBITest extends TestFmwk {
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int r = 1; r < fwtbl.fNumStates; r++) {
|
||||
int row = dw.getRowIndex(r);
|
||||
short tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
s.append((char)tableVal);
|
||||
}
|
||||
columns.add(s.toString());
|
||||
@ -602,13 +604,12 @@ public class RBBITest extends TestFmwk {
|
||||
for (int r=0; r<fwtbl.fNumStates; r++) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
int row = dw.getRowIndex(r);
|
||||
assertTrue("Accepting < -1", fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING] >= -1);
|
||||
s.append(fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING]);
|
||||
s.append(fwtbl.fTable[row + RBBIDataWrapper.LOOKAHEAD]);
|
||||
s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGIDX]);
|
||||
for (int column=0; column<numCharClasses; column++) {
|
||||
short tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
s.append((char)tableVal);
|
||||
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
s.append(tableVal);
|
||||
}
|
||||
rows.add(s.toString());
|
||||
}
|
||||
@ -655,4 +656,199 @@ public class RBBITest extends TestFmwk {
|
||||
assertTrue("Reverse Table", RBBIDataWrapper.equals(bi.fRData.fRTable, bi2.fRData.fRTable));
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to test 8/16 bits of trie and 8/16 bits of state table.
|
||||
private void testTrieStateTable(int numChar, boolean expectUCPTrieValueWidthIn8Bits,
|
||||
boolean expectStateRowIn8Bits) {
|
||||
// Text are duplicate characters from U+4E00 to U+4FFF
|
||||
StringBuilder builder = new StringBuilder(2 * (0x5000 - 0x4e00));
|
||||
for (char c = 0x4e00; c < 0x5000; c++) {
|
||||
builder.append(c).append(c);
|
||||
}
|
||||
String text = builder.toString();
|
||||
|
||||
// Generate rule which will caused length+4 character classes and
|
||||
// length+3 states
|
||||
|
||||
builder = new StringBuilder(100 + 6 * numChar);
|
||||
builder.append("!!quoted_literals_only;");
|
||||
for (char c = 0x4e00; c < 0x4e00 + numChar; c++) {
|
||||
builder.append("\'").append(c).append(c).append("';");
|
||||
}
|
||||
builder.append(".;");
|
||||
String rules = builder.toString();
|
||||
|
||||
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
|
||||
|
||||
RBBIDataWrapper dw = bi.fRData;
|
||||
RBBIDataWrapper.RBBIStateTable fwtbl = dw.fFTable;
|
||||
RBBIDataWrapper.RBBIStateTable rvtbl = dw.fRTable;
|
||||
|
||||
boolean has8BitRowDataForwardTable = (fwtbl.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
|
||||
boolean has8BitRowDataReverseTable = (rvtbl.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
|
||||
boolean has8BitsTrie = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;
|
||||
|
||||
assertEquals("Number of char classes mismatch numChar=" + numChar, numChar + 4, dw.fHeader.fCatCount);
|
||||
assertEquals("Number of states in Forward Table mismatch numChar=" + numChar, numChar + 3, fwtbl.fNumStates);
|
||||
assertEquals("Number of states in Reverse Table mismatch numChar=" + numChar, numChar + 3, rvtbl.fNumStates);
|
||||
assertEquals("Trie width mismatch numChar=" + numChar, expectUCPTrieValueWidthIn8Bits, has8BitsTrie);
|
||||
assertEquals("Bits of Forward State table mismatch numChar=" + numChar,
|
||||
expectStateRowIn8Bits, has8BitRowDataForwardTable);
|
||||
assertEquals("Bits of Reverse State table mismatch numChar=" + numChar,
|
||||
expectStateRowIn8Bits, has8BitRowDataReverseTable);
|
||||
|
||||
bi.setText(text);
|
||||
|
||||
int pos;
|
||||
int i = 0;
|
||||
while ((pos = bi.next()) > 0) {
|
||||
// The first numChar should not break between the pair
|
||||
if (i++ < numChar) {
|
||||
assertEquals("next() mismatch numChar=" + numChar, i * 2, pos);
|
||||
} else {
|
||||
// After the first numChar next(), break on each character.
|
||||
assertEquals("next() mismatch numChar=" + numChar, i + numChar, pos);
|
||||
}
|
||||
}
|
||||
while ((pos = bi.previous()) > 0) {
|
||||
// The first numChar should not break between the pair
|
||||
if (--i < numChar) {
|
||||
assertEquals("previous() mismatch numChar=" + numChar, i * 2, pos);
|
||||
} else {
|
||||
// After the first numChar next(), break on each character.
|
||||
assertEquals("previous() mismatch numChar=" + numChar, i + numChar, pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test8BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(123, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test16BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(124, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test16BitsTrieWith16BitStateTable() {
|
||||
testTrieStateTable(255, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test8BitsTrieWith16BitStateTable() {
|
||||
// Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
|
||||
// create state table in 16 bits.
|
||||
|
||||
// Generate 510 'a' as text
|
||||
StringBuilder builder = new StringBuilder(510);
|
||||
for (int i = 0; i < 510; i++) {
|
||||
builder.append('a');
|
||||
}
|
||||
String text = builder.toString();
|
||||
|
||||
builder = new StringBuilder(550);
|
||||
builder.append("!!quoted_literals_only;'");
|
||||
// 254 'a' in the rule will cause 256 states
|
||||
for (int i = 0; i < 254; i++) {
|
||||
builder.append('a');
|
||||
}
|
||||
builder.append("';.;");
|
||||
String rules = builder.toString();
|
||||
|
||||
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
|
||||
|
||||
RBBIDataWrapper dw = bi.fRData;
|
||||
RBBIDataWrapper.RBBIStateTable fwtbl = dw.fFTable;
|
||||
|
||||
boolean has8BitRowData = (fwtbl.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
|
||||
boolean has8BitsTrie = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;
|
||||
assertFalse("State table should be in 16 bits", has8BitRowData);
|
||||
assertTrue("Trie should be in 8 bits", has8BitsTrie);
|
||||
|
||||
bi.setText(text);
|
||||
|
||||
// break positions:
|
||||
// 254, 508, 509, 510
|
||||
assertEquals("next()", 254, bi.next());
|
||||
int i = 0;
|
||||
int pos;
|
||||
while ((pos = bi.next()) > 0) {
|
||||
assertEquals("next()", 508 + i , pos);
|
||||
i++;
|
||||
}
|
||||
i = 0;
|
||||
while ((pos = bi.previous()) > 0) {
|
||||
i++;
|
||||
if (pos >= 508) {
|
||||
assertEquals("previous()", 510 - i , pos);
|
||||
} else {
|
||||
assertEquals("previous()", 254 , pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
|
||||
* that there are no problems with rules at the size that transitions between the two.
|
||||
*
|
||||
* A rule that matches a literal string, like 'abcdefghij', will require one state and
|
||||
* one character class per character in the string. So we can make a rule to tickle the
|
||||
* boundaries by using literal strings of various lengths.
|
||||
*
|
||||
* For both the number of states and the number of character classes, the eight bit format
|
||||
* only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
|
||||
* leaving 120 something available. This test runs the string over the range of 120 - 130,
|
||||
* which allows some margin for changes to the number of values reserved by the rule builder
|
||||
* without breaking the test.
|
||||
*/
|
||||
@Test
|
||||
public void TestTable_8_16_Bits() {
|
||||
// testStr serves as both the source of the rule string (truncated to the desired length)
|
||||
// and as test data to check matching behavior. A break rule consisting of the first 120
|
||||
// characters of testStr will match the first 120 chars of the full-length testStr.
|
||||
StringBuilder builder = new StringBuilder(0x200);
|
||||
for (char c=0x3000; c<0x3200; ++c) {
|
||||
builder.append(c);
|
||||
}
|
||||
String testStr = builder.toString();
|
||||
|
||||
int startLength = 120; // The shortest rule string to test.
|
||||
int endLength = 260; // The longest rule string to test
|
||||
int increment = 1;
|
||||
for (int ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
|
||||
String ruleString = (new String("!!quoted_literals_only; '#';"))
|
||||
.replace("#", testStr.substring(0, ruleLen));
|
||||
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(ruleString);
|
||||
|
||||
// Verify that the break iterator is functioning - that the first boundary found
|
||||
// in testStr is at the length of the rule string.
|
||||
bi.setText(testStr);
|
||||
assertEquals("The first boundary found in testStr should be at the length of the rule string",
|
||||
ruleLen, bi.next());
|
||||
|
||||
// Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
|
||||
// of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
|
||||
bi.setText(testStr);
|
||||
int result = bi.preceding(ruleLen);
|
||||
assertEquals("Reverse iteration should find the boundary at 0", 0, result);
|
||||
|
||||
// Verify that the range of rule lengths being tested cover the transations
|
||||
// from 8 to 16 bit data.
|
||||
RBBIDataWrapper dw = bi.fRData;
|
||||
RBBIDataWrapper.RBBIStateTable fwtbl = dw.fFTable;
|
||||
|
||||
boolean has8BitRowData = (fwtbl.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
|
||||
boolean has8BitsTrie = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;
|
||||
if (ruleLen == startLength) {
|
||||
assertTrue("State table should be in 8 bits", has8BitRowData);
|
||||
assertTrue("Trie should be in 8 bits", has8BitsTrie);
|
||||
}
|
||||
if (ruleLen == endLength) {
|
||||
assertFalse("State table should be in 16 bits", has8BitRowData);
|
||||
assertFalse("Trie should be in 16 bits", has8BitsTrie);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user