2000-01-08 02:05:05 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 11/11/99 rgillam Complete port from Java.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
2000-01-11 00:46:58 +00:00
|
|
|
#include "ucmp8.h"
|
2000-01-08 02:05:05 +00:00
|
|
|
#include "cmemory.h"
|
2000-01-11 00:46:58 +00:00
|
|
|
#include "rbbi_tbl.h"
|
2002-02-28 01:28:04 +00:00
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#ifdef RBBI_DEBUG
|
|
|
|
#include <stdio.h>
|
|
|
|
#endif
|
2000-01-08 02:05:05 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2000-01-08 02:05:05 +00:00
|
|
|
//=======================================================================
|
|
|
|
// constructor
|
|
|
|
//=======================================================================
|
|
|
|
|
2000-07-12 05:01:53 +00:00
|
|
|
/**
 * Builds a tables object that points into a pre-built memory image.
 *
 * The image obtained from udata_getMemory() begins with an index of
 * int32_t values: entry 0 is the number of categories and entries 1..7 are
 * byte offsets (from the start of the image) of the description string, the
 * two arrays handed to ucmp8_openAlias(), the forward and backward state
 * tables, the end-state flags and the lookahead-state flags.
 *
 * If memory is null, or it yields no image, the object is left in an empty
 * state (all table pointers null, numCategories 0, fMemory null) so that the
 * destructor has only null pointers to release.
 *
 * @param memory The data memory holding the image; may be null.  It is
 *               stored in fMemory when an image is present, and closed
 *               immediately when no image can be obtained from it.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables(UDataMemory* memory)
: refCount(0),
  ownTables(FALSE)
{
    // Give every member a defined value first.  Previously the failure paths
    // left these indeterminate even though the destructor releases
    // charCategoryTable unconditionally.
    numCategories = 0;
    charCategoryTable = 0;
    stateTable = 0;
    backwardsStateTable = 0;
    endStates = 0;
    lookaheadStates = 0;
    fMemory = memory;

    if (memory == 0) {
        return;
    }

    const void* image = udata_getMemory(memory);
    if (image == 0) {
        // Nothing usable in this data item: release it now and forget the
        // handle so the destructor does not close it a second time.
        udata_close(fMemory);
        fMemory = 0;
        return;
    }

    const int32_t* im = (const int32_t*)(image);
    const int8_t* base = (const int8_t*)(image);

    // the memory image begins with an index that gives the offsets into the
    // image for each of the fields in the BreakIteratorTables object--
    // use those to initialize the tables object (it will end up pointing
    // into the memory image for everything)
    numCategories = (int32_t)im[0];
    description = UnicodeString(TRUE, (UChar*)((int32_t)im[1] + base), -1);
    charCategoryTable = ucmp8_openAlias((uint16_t*)((int32_t)im[2] + base),
                                        (int8_t*)((int32_t)im[3] + base), 0);
    stateTable = (int16_t*)((int32_t)im[4] + base);
    backwardsStateTable = (int16_t*)((int32_t)im[5] + base);
    endStates = (int8_t*)((int32_t)im[6] + base);
    lookaheadStates = (int8_t*)((int32_t)im[7] + base);
}
|
|
|
|
|
|
|
|
/**
 * Builds an empty tables object that owns its tables (ownTables is TRUE and
 * there is no backing data memory).
 *
 * This constructor depends on a RuleBasedBreakIteratorBuilder filling in all
 * the members afterwards.  Until that happens every table pointer is null
 * and numCategories is 0.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables()
: refCount(0),
  ownTables(TRUE),
  fMemory(0)
{
    // Null the remaining members explicitly: they are released by the
    // destructor, so they must never hold indeterminate values if the
    // builder stops before filling them in.
    numCategories = 0;
    charCategoryTable = 0;
    stateTable = 0;
    backwardsStateTable = 0;
    endStates = 0;
    lookaheadStates = 0;
}
|
|
|
|
|
|
|
|
//=======================================================================
|
|
|
|
// boilerplate
|
|
|
|
//=======================================================================
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor
|
|
|
|
*/
|
|
|
|
RuleBasedBreakIteratorTables::~RuleBasedBreakIteratorTables() {
|
|
|
|
if (ownTables) {
|
|
|
|
delete [] stateTable;
|
|
|
|
delete [] backwardsStateTable;
|
|
|
|
delete [] endStates;
|
|
|
|
delete [] lookaheadStates;
|
|
|
|
ucmp8_close(charCategoryTable);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
uprv_free(charCategoryTable);
|
2000-09-26 02:35:48 +00:00
|
|
|
if(fMemory != 0) {
|
|
|
|
udata_close(fMemory);
|
|
|
|
}
|
2000-01-08 02:05:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Equality operator. Returns TRUE if both tables objects are of the
|
|
|
|
* same class, have the same behavior, and iterate over the same text.
|
|
|
|
*/
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool
|
2000-01-08 02:05:05 +00:00
|
|
|
RuleBasedBreakIteratorTables::operator==(const RuleBasedBreakIteratorTables& that) const {
|
|
|
|
return this->description == that.description;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Compute a hash code for these tables
|
|
|
|
* @return A hash code
|
|
|
|
*/
|
|
|
|
int32_t
|
|
|
|
RuleBasedBreakIteratorTables::hashCode() const {
|
|
|
|
return description.hashCode();
|
|
|
|
}
|
|
|
|
|
|
|
|
//=======================================================================
|
|
|
|
// implementation
|
|
|
|
//=======================================================================
|
|
|
|
/**
|
|
|
|
* Looks up a character's category (i.e., its category for breaking purposes,
|
|
|
|
* not its Unicode category)
|
2000-09-26 02:35:48 +00:00
|
|
|
* The ignored parameter is used by derived implementations.
|
2000-01-08 02:05:05 +00:00
|
|
|
*/
|
|
|
|
int32_t
|
2000-09-26 02:35:48 +00:00
|
|
|
RuleBasedBreakIteratorTables::lookupCategory(UChar c, BreakIterator* /*ignored*/) const {
|
2000-01-08 02:05:05 +00:00
|
|
|
return ucmp8_get(charCategoryTable, c);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Given a current state and a character category, looks up the
|
|
|
|
* next state to transition to in the state table.
|
|
|
|
*/
|
|
|
|
int32_t
|
|
|
|
RuleBasedBreakIteratorTables::lookupState(int32_t state, int32_t category) const {
|
|
|
|
return stateTable[state * numCategories + category];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Given a current state and a character category, looks up the
|
|
|
|
* next state to transition to in the backwards state table.
|
|
|
|
*/
|
|
|
|
int32_t
|
|
|
|
RuleBasedBreakIteratorTables::lookupBackwardState(int32_t state, int32_t category) const {
|
|
|
|
return backwardsStateTable[state * numCategories + category];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns true if the specified state is an accepting state.
|
|
|
|
*/
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool
|
2000-01-08 02:05:05 +00:00
|
|
|
RuleBasedBreakIteratorTables::isEndState(int32_t state) const {
|
|
|
|
return endStates[state];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns true if the specified state is a lookahead state.
|
|
|
|
*/
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool
|
2000-01-08 02:05:05 +00:00
|
|
|
RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
|
|
|
|
return lookaheadStates[state];
|
|
|
|
}
|
2002-02-28 01:28:04 +00:00
|
|
|
|
|
|
|
|
|
|
|
#ifdef RBBI_DEBUG
|
|
|
|
//
|
|
|
|
// debugDumpTables
|
|
|
|
//
|
|
|
|
void RuleBasedBreakIteratorTables::debugDumpTables() const {
|
|
|
|
printf("Character Classes:\n");
|
|
|
|
int currentCharClass = 257;
|
|
|
|
int startCurrentRange = 0;
|
|
|
|
int initialStringLength = 0;
|
|
|
|
char buf[80];
|
|
|
|
|
|
|
|
UnicodeString *charClassRanges = new UnicodeString[numCategories];
|
|
|
|
|
|
|
|
for (int i = 0; i < 0xffff; i++) {
|
|
|
|
if ( ucmp8_get(charCategoryTable, i) != currentCharClass) {
|
|
|
|
if (currentCharClass != 257) {
|
|
|
|
// Complete the output of the previous range.
|
|
|
|
if (i != startCurrentRange+1) {
|
|
|
|
sprintf(buf, "-%x", i-1);
|
|
|
|
charClassRanges[currentCharClass].append(buf);
|
|
|
|
}
|
|
|
|
if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
|
|
|
|
charClassRanges[currentCharClass].append("\n ");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Output the start of the new range.
|
|
|
|
currentCharClass = ucmp8_get(charCategoryTable, i);
|
|
|
|
startCurrentRange = i;
|
|
|
|
initialStringLength = charClassRanges[currentCharClass].length();
|
|
|
|
if (charClassRanges[currentCharClass].length() > 0)
|
|
|
|
charClassRanges[currentCharClass].append(", ");
|
|
|
|
sprintf(buf, "%x", i);
|
|
|
|
charClassRanges[currentCharClass].append(buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i=0; i<numCategories; i++) {
|
|
|
|
printf("%d: ", i);
|
|
|
|
// Write out the chars in the UnicodeStrings.
|
|
|
|
// We know we didn't put anything into them except for plain ascii chars.
|
|
|
|
for (int j=0; j<charClassRanges[i].length(); j++) {
|
|
|
|
putchar(charClassRanges[i].charAt(j));
|
|
|
|
}
|
|
|
|
putchar('\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
delete [] charClassRanges;
|
|
|
|
|
|
|
|
|
|
|
|
// State table length might be too big by one, because the only indication
|
|
|
|
// we have is the pointer to the start of the next item in the memory
|
|
|
|
// image, the backwardsStateTable, which is 4 byte aligned.
|
|
|
|
//
|
|
|
|
int stateTableLength = backwardsStateTable - stateTable;
|
|
|
|
if ((stateTableLength % numCategories) == 1) {
|
|
|
|
stateTableLength -= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\n\nState Table. *: end state %%: look ahead state\n");
|
|
|
|
printf("C:\t");
|
|
|
|
for (int i = 0; i < numCategories; i++) {
|
|
|
|
printf("%d\t", i);
|
|
|
|
}
|
|
|
|
printf("\n=================================================");
|
|
|
|
|
|
|
|
for (int i = 0; i < stateTableLength; i++) {
|
|
|
|
if (i % numCategories == 0) {
|
|
|
|
putchar('\n');
|
|
|
|
if (endStates[i / numCategories])
|
|
|
|
putchar('*');
|
|
|
|
else
|
|
|
|
putchar(' ');
|
|
|
|
if (lookaheadStates[i / numCategories]) {
|
|
|
|
putchar('%');
|
|
|
|
}
|
|
|
|
else
|
|
|
|
putchar(' ');
|
|
|
|
printf("%d:\t", i / numCategories);
|
|
|
|
}
|
|
|
|
if (stateTable[i] == 0) {
|
|
|
|
printf(".\t");
|
|
|
|
} else {
|
|
|
|
printf("%d\t", stateTable[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf("\n\n\n");
|
|
|
|
}
|
|
|
|
#endif // RBBI_DEBUG
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|