/* ********************************************************************** * Copyright (C) 1999-2000 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 12/1/99 rtg Ported from Java * 01/13/2000 helena Added UErrorCode to ctors. ********************************************************************** */ #include "brkdict.h" #include "cmemory.h" #include "unicode/resbund.h" //================================================================================= // deserialization //================================================================================= BreakDictionary::BreakDictionary(char* dictionaryFilename, UErrorCode& status) { if (U_FAILURE(status)) return; ResourceBundle th((char *)0, Locale("th"), status); if (U_FAILURE(status)) return; ResourceBundle th_dict = th.get("BreakDictionaryData", status); if (U_FAILURE(status)) return; int32_t len; const uint8_t * data = th_dict.getBinary(len, status); if (U_FAILURE(status)) return; UMemoryStream* dictionaryStream = uprv_mstrm_openBuffer(data, len); if (dictionaryStream == 0) { status = U_FILE_ACCESS_ERROR; return; } readDictionaryFile(dictionaryStream); uprv_mstrm_close(dictionaryStream); } BreakDictionary::~BreakDictionary() { ucmp8_close(columnMap); delete [] table; delete [] rowIndex; delete [] rowIndexFlags; delete [] rowIndexFlagsIndex; delete [] rowIndexShifts; } // macros to support readDictionaryFile. The data files originated from a Java // program, and Java always writes data out in big-endian format. These macros will // byte-swap the data for appropriate use on Windows. #if U_IS_BIG_ENDIAN #define SWAP32(x) #define SWAP16(x) #else #define SWAP32(x) x = (x >> 24 & 0xff) | (x >> 8 & 0xff00) | (x << 8 & 0xff0000) | (x << 24 & 0xff000000) #define SWAP16(x) x = (x << 8 & 0xff00) | (x >> 8 & 0xff) #endif void BreakDictionary::readDictionaryFile(UMemoryStream* in) { int32_t l; int32_t version; int i; // read in the version number (right now we just ignore it) uprv_mstrm_read(in, &version, 4); // read in the column map (this is serialized in its internal form: // an index array followed by a data array) uprv_mstrm_read(in, &l, 4); SWAP32(l); uint16_t* temp = (uint16_t*) uprv_malloc(sizeof(uint16_t)*l); uprv_mstrm_read(in, temp, l * sizeof (int16_t) ); for (i = 0; i < l; i++) { SWAP16(temp[i]); } uprv_mstrm_read(in, &l, 4); SWAP32(l); int8_t* temp2 = (int8_t*) uprv_malloc(sizeof(int8_t)*l); uprv_mstrm_read(in, temp2, l); columnMap = ucmp8_openAdopt(temp, temp2, l); // read in numCols and numColGroups uprv_mstrm_read(in, &numCols, 4); SWAP32(numCols); uprv_mstrm_read(in, &numColGroups, 4); SWAP32(numColGroups); // read in the row-number index uprv_mstrm_read(in, &l, 4); SWAP32(l); rowIndex = new int16_t[l]; uprv_mstrm_read(in, rowIndex, l * sizeof (int16_t) ); for (i = 0; i < l; i++) { SWAP16(rowIndex[i]); } // load in the populated-cells bitmap: index first, then bitmap list uprv_mstrm_read(in, &l, 4); SWAP32(l); rowIndexFlagsIndex = new int16_t[l]; uprv_mstrm_read(in, rowIndexFlagsIndex, l * sizeof(int16_t) ); for (i = 0; i < l; i++) { SWAP16(rowIndexFlagsIndex[i]); } uprv_mstrm_read(in, &l, 4); SWAP32(l); rowIndexFlags = new int32_t[l]; uprv_mstrm_read(in, rowIndexFlags, l * sizeof(int32_t)); for (i = 0; i < l; i++) { SWAP32(rowIndexFlags[i]); } // load in the row-shift index uprv_mstrm_read(in, &l, 4); SWAP32(l); rowIndexShifts = new int8_t[l]; uprv_mstrm_read(in, rowIndexShifts, l); // finally, load in the actual state table uprv_mstrm_read(in, &l, 4); SWAP32(l); table = new int16_t[l]; uprv_mstrm_read(in, table, l * sizeof(int16_t) ); for (i = 0; i < l; i++) { SWAP16(table[i]); } // the reverse column map occurs next in the file. In the C/C++ code, for the // time being, we're not going to worry about that. } //================================================================================= // access to the words //================================================================================= /** * Uses the column map to map the character to a column number, then * passes the row and column number to the other version of at() * @param row The current state * @param ch The character whose column we're interested in * @return The new state to transition to */ int16_t BreakDictionary::at(int32_t row, UChar ch) const { int16_t col = ucmp8_get(columnMap, ch); return at(row, (int32_t)col); } /** * Returns the value in the cell with the specified (logical) row and * column numbers. In DictionaryBasedBreakIterator, the row number is * a state number, the column number is an input, and the return value * is the row number of the new state to transition to. (0 is the * "error" state, and -1 is the "end of word" state in a dictionary) * @param row The row number of the current state * @param col The column number of the input character (0 means "not a * dictionary character") * @return The row number of the new state to transition to */ int16_t BreakDictionary::at(int32_t row, int32_t col) const { if (cellIsPopulated(row, col)) { // we map from logical to physical row number by looking up the // mapping in rowIndex; we map from logical column number to // physical column number by looking up a shift value for this // logical row and offsetting the logical column number by // the shift amount. Then we can use internalAt() to actually // get the value out of the table. return internalAt(rowIndex[row], col + rowIndexShifts[row]); } else { return 0; } } //================================================================================= // implementation //================================================================================= /** * Given (logical) row and column numbers, returns true if the * cell in that position is populated */ UBool BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const { // look up the entry in the bitmap index for the specified row. // If it's a negative number, it's the column number of the only // populated cell in the row if (rowIndexFlagsIndex[row] < 0) { return col == -rowIndexFlagsIndex[row]; } // if it's a positive number, it's the offset of an entry in the bitmap // list. If the table is more than 32 columns wide, the bitmap is stored // successive entries in the bitmap list, so we have to divide the column // number by 32 and offset the number we got out of the index by the result. // Once we have the appropriate piece of the bitmap, test the appropriate // bit and return the result. else { int32_t flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; return (flags & (1 << (col & 0x1f))) != 0; } } /** * Implementation of at() when we know the specified cell is populated. * @param row The PHYSICAL row number of the cell * @param col The PHYSICAL column number of the cell * @return The value stored in the cell */ int16_t BreakDictionary::internalAt(int32_t row, int32_t col) const { // the table is a one-dimensional array, so this just does the math necessary // to treat it as a two-dimensional array (we don't just use a two-dimensional // array because two-dimensional arrays are inefficient in Java) return table[row * numCols + col]; }