2000-01-10 20:26:57 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
2000-01-14 00:13:59 +00:00
|
|
|
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
|
2000-01-10 20:26:57 +00:00
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 12/1/99 rtg Ported from Java
|
2000-01-14 00:13:59 +00:00
|
|
|
* 01/13/2000 helena Added UErrorCode to ctors.
|
2000-01-10 20:26:57 +00:00
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_BREAK_ITERATION
|
|
|
|
|
2004-09-03 15:17:54 +00:00
|
|
|
#include "unicode/ures.h"
|
2000-01-10 20:26:57 +00:00
|
|
|
#include "brkdict.h"
|
2000-06-15 22:32:39 +00:00
|
|
|
#include "cmemory.h"
|
2000-01-10 20:26:57 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2000-01-10 20:26:57 +00:00
|
|
|
//=================================================================================
|
|
|
|
// deserialization
|
|
|
|
//=================================================================================
|
|
|
|
|
2002-07-24 19:07:37 +00:00
|
|
|
/*
 * Builds the dictionary from the binary "BreakDictionaryData" resource of the
 * "th" resource bundle.  The file-name argument is ignored.
 *
 * @param status  In/out error code.  Nothing is done if it already indicates
 *                failure on entry; on return it reflects any failure reported
 *                while opening the bundle or fetching the binary resource.
 *                When it indicates failure, all table pointers stay NULL.
 */
BreakDictionary::BreakDictionary(const char* /*dictionaryFilename*/, UErrorCode& status)
  : columnMap(NULL),
    table(NULL),
    rowIndex(NULL),
    rowIndexFlags(NULL),
    rowIndexFlagsIndex(NULL),
    rowIndexShifts(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    UResourceBundle *th_dict = ures_open(NULL, "th", &status);
    th_dict = ures_getByKey(th_dict, "BreakDictionaryData", th_dict, &status);

    if (!U_FAILURE(status)) {
        int32_t len = 0;
        const uint8_t *data = ures_getBinary(th_dict, &len, &status);
        if (!U_FAILURE(status)) {
            // Parse while the bundle is still open: the ures API documents
            // that pointers obtained from a bundle must be considered invalid
            // once that bundle has been closed.  readDictionaryFile() copies
            // everything it needs into its own buffers.
            readDictionaryFile(data);
        }
    }

    // Release the bundle on every path, including the failure paths that
    // previously returned without closing it.
    if (th_dict != NULL) {
        ures_close(th_dict);
    }
}
|
|
|
|
|
|
|
|
/*
 * Releases everything readDictionaryFile() allocated: the compact column map
 * and the five raw buffers obtained from uprv_malloc().
 */
BreakDictionary::~BreakDictionary()
{
    // character -> column lookup
    ucmp8_close(columnMap);

    // row bookkeeping arrays
    uprv_free(rowIndex);
    uprv_free(rowIndexFlagsIndex);
    uprv_free(rowIndexFlags);
    uprv_free(rowIndexShifts);

    // the state table itself
    uprv_free(table);
}
|
|
|
|
|
|
|
|
// Macros to support readDictionaryFile.  The data files originated from a Java
// program, and Java always writes data out in big-endian format.  These macros
// byte-swap the data on little-endian platforms; on big-endian platforms they
// expand to nothing.
|
2000-01-13 23:41:25 +00:00
|
|
|
|
|
|
|
// SWAP32(x) / SWAP16(x): reverse the byte order of the 32-bit / 16-bit lvalue
// x in place.  On big-endian builds both expand to nothing.
//
// The operand is converted to the matching unsigned type before any shift so
// that negative values and bits moving into the sign position are handled with
// well-defined unsigned arithmetic.  The argument is evaluated more than once,
// so it must not have side effects.
#if U_IS_BIG_ENDIAN
#define SWAP32(x)
#define SWAP16(x)
#else
#define SWAP32(x) (x) = (uint32_t)((((uint32_t)(x) >> 24) & 0xff) | \
                                   (((uint32_t)(x) >> 8) & 0xff00) | \
                                   (((uint32_t)(x) << 8) & 0xff0000) | \
                                   (((uint32_t)(x) << 24) & 0xff000000))
#define SWAP16(x) (x) = (uint16_t)((((uint16_t)(x) << 8) & 0xff00) | \
                                   (((uint16_t)(x) >> 8) & 0xff))
#endif
|
2000-01-10 20:26:57 +00:00
|
|
|
|
2004-09-17 00:09:29 +00:00
|
|
|
// DICTIONARY_READ(source, destAddr, len): copy len bytes from the read cursor
// `source` to destAddr, then advance `source` by len.  Wrapped in
// do { } while (0) so the two steps behave as a single statement (safe under
// an unbraced if/else).  `source` and `len` are each evaluated twice, so they
// must not have side effects.
#define DICTIONARY_READ(source, destAddr, len) \
    do { \
        uprv_memcpy((destAddr), (source), (len)); \
        (source) += (len); \
    } while (0)
|
|
|
|
|
|
|
|
|
2000-01-10 20:26:57 +00:00
|
|
|
void
|
2004-09-17 00:09:29 +00:00
|
|
|
BreakDictionary::readDictionaryFile(const uint8_t * in)
|
2000-01-10 20:26:57 +00:00
|
|
|
{
|
|
|
|
int32_t l;
|
|
|
|
int32_t version;
|
|
|
|
|
2000-08-11 01:27:17 +00:00
|
|
|
int i;
|
2000-01-13 01:23:38 +00:00
|
|
|
|
2000-01-10 20:26:57 +00:00
|
|
|
// read in the version number (right now we just ignore it)
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &version, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
|
|
|
|
// read in the column map (this is serialized in its internal form:
|
|
|
|
// an index array followed by a data array)
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2000-06-15 22:32:39 +00:00
|
|
|
uint16_t* temp = (uint16_t*) uprv_malloc(sizeof(uint16_t)*l);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, temp, l * sizeof (int16_t) );
|
2000-01-13 01:23:38 +00:00
|
|
|
for (i = 0; i < l; i++) {
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP16(temp[i]);
|
|
|
|
}
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2000-06-15 22:32:39 +00:00
|
|
|
int8_t* temp2 = (int8_t*) uprv_malloc(sizeof(int8_t)*l);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, temp2, l);
|
2000-01-10 20:26:57 +00:00
|
|
|
columnMap = ucmp8_openAdopt(temp, temp2, l);
|
|
|
|
|
|
|
|
// read in numCols and numColGroups
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &numCols, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(numCols);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &numColGroups, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(numColGroups);
|
|
|
|
|
|
|
|
// read in the row-number index
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2002-07-16 01:55:55 +00:00
|
|
|
rowIndex = (int16_t *)uprv_malloc(l*2);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, rowIndex, l * sizeof (int16_t) );
|
2000-01-13 01:01:17 +00:00
|
|
|
for (i = 0; i < l; i++) {
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP16(rowIndex[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// load in the populated-cells bitmap: index first, then bitmap list
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2002-07-16 01:55:55 +00:00
|
|
|
rowIndexFlagsIndex = (int16_t *)uprv_malloc(l*2);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, rowIndexFlagsIndex, l * sizeof(int16_t) );
|
2000-01-13 01:01:17 +00:00
|
|
|
for (i = 0; i < l; i++) {
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP16(rowIndexFlagsIndex[i]);
|
|
|
|
}
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2002-07-16 01:55:55 +00:00
|
|
|
rowIndexFlags = (int32_t *)uprv_malloc(l*4);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, rowIndexFlags, l * sizeof(int32_t));
|
2000-01-13 01:01:17 +00:00
|
|
|
for (i = 0; i < l; i++) {
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(rowIndexFlags[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// load in the row-shift index
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2002-07-16 01:55:55 +00:00
|
|
|
rowIndexShifts = (int8_t *)uprv_malloc(l);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, rowIndexShifts, l);
|
2000-01-10 20:26:57 +00:00
|
|
|
|
|
|
|
// finally, load in the actual state table
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, &l, 4);
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP32(l);
|
2002-07-16 01:55:55 +00:00
|
|
|
table = (int16_t *)uprv_malloc(l*2);
|
2004-09-17 00:09:29 +00:00
|
|
|
DICTIONARY_READ(in, table, l * sizeof(int16_t) );
|
2000-01-13 01:01:17 +00:00
|
|
|
for (i = 0; i < l; i++) {
|
2000-01-10 20:26:57 +00:00
|
|
|
SWAP16(table[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// the reverse column map occurs next in the file. In the C/C++ code, for the
|
|
|
|
// time being, we're not going to worry about that.
|
|
|
|
}
|
|
|
|
|
|
|
|
//=================================================================================
|
|
|
|
// access to the words
|
|
|
|
//=================================================================================
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Uses the column map to map the character to a column number, then
|
|
|
|
* passes the row and column number to the other version of at()
|
|
|
|
* @param row The current state
|
|
|
|
* @param ch The character whose column we're interested in
|
|
|
|
* @return The new state to transition to
|
|
|
|
*/
|
|
|
|
int16_t
|
|
|
|
BreakDictionary::at(int32_t row, UChar ch) const
|
|
|
|
{
|
|
|
|
int16_t col = ucmp8_get(columnMap, ch);
|
|
|
|
return at(row, (int32_t)col);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns the value in the cell with the specified (logical) row and
|
|
|
|
* column numbers. In DictionaryBasedBreakIterator, the row number is
|
|
|
|
* a state number, the column number is an input, and the return value
|
|
|
|
* is the row number of the new state to transition to. (0 is the
|
|
|
|
* "error" state, and -1 is the "end of word" state in a dictionary)
|
|
|
|
* @param row The row number of the current state
|
|
|
|
* @param col The column number of the input character (0 means "not a
|
|
|
|
* dictionary character")
|
|
|
|
* @return The row number of the new state to transition to
|
|
|
|
*/
|
|
|
|
int16_t
|
|
|
|
BreakDictionary::at(int32_t row, int32_t col) const
|
|
|
|
{
|
|
|
|
if (cellIsPopulated(row, col)) {
|
|
|
|
// we map from logical to physical row number by looking up the
|
|
|
|
// mapping in rowIndex; we map from logical column number to
|
|
|
|
// physical column number by looking up a shift value for this
|
|
|
|
// logical row and offsetting the logical column number by
|
|
|
|
// the shift amount. Then we can use internalAt() to actually
|
|
|
|
// get the value out of the table.
|
|
|
|
return internalAt(rowIndex[row], col + rowIndexShifts[row]);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//=================================================================================
|
|
|
|
// implementation
|
|
|
|
//=================================================================================
|
|
|
|
/**
|
|
|
|
* Given (logical) row and column numbers, returns true if the
|
|
|
|
* cell in that position is populated
|
|
|
|
*/
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool
|
2000-01-10 20:26:57 +00:00
|
|
|
BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const
|
|
|
|
{
|
|
|
|
// look up the entry in the bitmap index for the specified row.
|
|
|
|
// If it's a negative number, it's the column number of the only
|
|
|
|
// populated cell in the row
|
|
|
|
if (rowIndexFlagsIndex[row] < 0) {
|
|
|
|
return col == -rowIndexFlagsIndex[row];
|
|
|
|
}
|
|
|
|
|
|
|
|
// if it's a positive number, it's the offset of an entry in the bitmap
|
|
|
|
// list. If the table is more than 32 columns wide, the bitmap is stored
|
|
|
|
// successive entries in the bitmap list, so we have to divide the column
|
|
|
|
// number by 32 and offset the number we got out of the index by the result.
|
|
|
|
// Once we have the appropriate piece of the bitmap, test the appropriate
|
|
|
|
// bit and return the result.
|
|
|
|
else {
|
|
|
|
int32_t flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
|
|
|
|
return (flags & (1 << (col & 0x1f))) != 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implementation of at() when we know the specified cell is populated.
|
|
|
|
* @param row The PHYSICAL row number of the cell
|
|
|
|
* @param col The PHYSICAL column number of the cell
|
|
|
|
* @return The value stored in the cell
|
|
|
|
*/
|
|
|
|
int16_t
|
|
|
|
BreakDictionary::internalAt(int32_t row, int32_t col) const
|
|
|
|
{
|
|
|
|
// the table is a one-dimensional array, so this just does the math necessary
|
|
|
|
// to treat it as a two-dimensional array (we don't just use a two-dimensional
|
|
|
|
// array because two-dimensional arrays are inefficient in Java)
|
|
|
|
return table[row * numCols + col];
|
|
|
|
}
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|