2000-01-10 20:26:57 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
2000-01-14 00:13:59 +00:00
|
|
|
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
|
2000-01-10 20:26:57 +00:00
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 12/1/99 rtg Ported from Java
|
2000-01-14 00:13:59 +00:00
|
|
|
* 01/13/2000 helena Added UErrorCode to ctors.
|
2000-01-10 20:26:57 +00:00
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef BRKDICT_H
|
|
|
|
#define BRKDICT_H
|
|
|
|
|
2002-06-27 01:19:20 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uobject.h"
|
2000-01-10 20:26:57 +00:00
|
|
|
#include "ucmp8.h"
|
2000-07-10 20:16:27 +00:00
|
|
|
#include "umemstrm.h"
|
2000-01-10 20:26:57 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2000-01-10 20:26:57 +00:00
|
|
|
/**
|
|
|
|
* This is the class that represents the list of known words used by
|
|
|
|
* DictionaryBasedBreakIterator. The conceptual data structure used
|
|
|
|
* here is a trie: there is a node hanging off the root node for every
|
|
|
|
* letter that can start a word. Each of these nodes has a node hanging
|
|
|
|
* off of it for every letter that can be the second letter of a word
|
|
|
|
* if this node is the first letter, and so on. The trie is represented
|
|
|
|
* as a two-dimensional array that can be treated as a table of state
|
|
|
|
* transitions. Indexes are used to compress this array, taking
|
|
|
|
* advantage of the fact that this array will always be very sparse.
|
|
|
|
*/
|
2002-10-04 01:23:34 +00:00
|
|
|
class BreakDictionary : public UMemory {
|
2000-01-10 20:26:57 +00:00
|
|
|
//=================================================================================
|
|
|
|
// data members
|
|
|
|
//=================================================================================
|
|
|
|
private:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Maps from characters to column numbers. The main use of this is to
|
|
|
|
* avoid making room in the array for empty columns.
|
|
|
|
*/
|
|
|
|
CompactByteArray* columnMap;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The number of actual columns in the table
|
|
|
|
*/
|
|
|
|
int32_t numCols;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Columns are organized into groups of 32. This says how many
|
|
|
|
* column groups. (We could calculate this, but we store the
|
|
|
|
* value to avoid having to repeatedly calculate it.)
|
|
|
|
*/
|
|
|
|
int32_t numColGroups;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The actual compressed state table. Each conceptual row represents
|
|
|
|
* a state, and the cells in it contain the row numbers of the states
|
|
|
|
* to transition to for each possible letter. 0 is used to indicate
|
|
|
|
* an illegal combination of letters (i.e., the error state). The
|
|
|
|
* table is compressed by eliminating all the unpopulated (i.e., zero)
|
|
|
|
* cells. Multiple conceptual rows can then be doubled up in a single
|
|
|
|
* physical row by sliding them up and possibly shifting them to one
|
|
|
|
* side or the other so the populated cells don't collide. Indexes
|
|
|
|
* are used to identify unpopulated cells and to locate populated cells.
|
|
|
|
*/
|
|
|
|
int16_t* table;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This index maps logical row numbers to physical row numbers
|
|
|
|
*/
|
|
|
|
int16_t* rowIndex;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* A bitmap is used to tell which cells in the comceptual table are
|
|
|
|
* populated. This array contains all the unique bit combinations
|
|
|
|
* in that bitmap. If the table is more than 32 columns wide,
|
|
|
|
* successive entries in this array are used for a single row.
|
|
|
|
*/
|
|
|
|
int32_t* rowIndexFlags;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This index maps from a logical row number into the bitmap table above.
|
|
|
|
* (This keeps us from storing duplicate bitmap combinations.) Since there
|
|
|
|
* are a lot of rows with only one populated cell, instead of wasting space
|
|
|
|
* in the bitmap table, we just store a negative number in this index for
|
|
|
|
* rows with one populated cell. The absolute value of that number is
|
|
|
|
* the column number of the populated cell.
|
|
|
|
*/
|
|
|
|
int16_t* rowIndexFlagsIndex;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* For each logical row, this index contains a constant that is added to
|
|
|
|
* the logical column number to get the physical column number
|
|
|
|
*/
|
|
|
|
int8_t* rowIndexShifts;
|
|
|
|
|
|
|
|
//=================================================================================
|
|
|
|
// deserialization
|
|
|
|
//=================================================================================
|
|
|
|
|
|
|
|
public:
|
|
|
|
/**
|
|
|
|
* Constructor. Creates the BreakDictionary by using readDictionaryFile() to
|
|
|
|
* load the dictionary tables from the disk.
|
2002-07-03 12:05:56 +00:00
|
|
|
* @param dictionaryFilename The name of the dictionary file
|
|
|
|
* @param status for errors if it occurs
|
2000-01-10 20:26:57 +00:00
|
|
|
*/
|
2001-10-09 22:57:29 +00:00
|
|
|
BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
|
2000-01-10 20:26:57 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor.
|
|
|
|
*/
|
|
|
|
~BreakDictionary();
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Reads the dictionary file on the disk and constructs the appropriate in-memory
|
|
|
|
* representation.
|
2002-07-03 12:05:56 +00:00
|
|
|
* @param in The given memory stream
|
2000-01-10 20:26:57 +00:00
|
|
|
*/
|
2000-07-10 20:16:27 +00:00
|
|
|
void readDictionaryFile(UMemoryStream* in);
|
2000-01-10 20:26:57 +00:00
|
|
|
|
|
|
|
//=================================================================================
|
|
|
|
// access to the words
|
|
|
|
//=================================================================================
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Uses the column map to map the character to a column number, then
|
|
|
|
* passes the row and column number to the other version of at()
|
|
|
|
* @param row The current state
|
|
|
|
* @param ch The character whose column we're interested in
|
|
|
|
* @return The new state to transition to
|
|
|
|
*/
|
|
|
|
int16_t at(int32_t row, UChar ch) const;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns the value in the cell with the specified (logical) row and
|
|
|
|
* column numbers. In DictionaryBasedBreakIterator, the row number is
|
|
|
|
* a state number, the column number is an input, and the return value
|
|
|
|
* is the row number of the new state to transition to. (0 is the
|
|
|
|
* "error" state, and -1 is the "end of word" state in a dictionary)
|
|
|
|
* @param row The row number of the current state
|
|
|
|
* @param col The column number of the input character (0 means "not a
|
|
|
|
* dictionary character")
|
|
|
|
* @return The row number of the new state to transition to
|
|
|
|
*/
|
|
|
|
int16_t at(int32_t row, int32_t col) const;
|
|
|
|
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* Given (logical) row and column numbers, returns true if the
|
|
|
|
* cell in that position is populated
|
2002-07-03 12:05:56 +00:00
|
|
|
* @param row The LOGICAL row number of the cell
|
|
|
|
* @param col The PHYSICAL row number of the cell
|
|
|
|
* @return true if the cell in that position is populated
|
2000-01-10 20:26:57 +00:00
|
|
|
*/
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool cellIsPopulated(int32_t row, int32_t col) const;
|
2000-01-10 20:26:57 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Implementation of at() when we know the specified cell is populated.
|
|
|
|
* @param row The PHYSICAL row number of the cell
|
|
|
|
* @param col The PHYSICAL column number of the cell
|
|
|
|
* @return The value stored in the cell
|
|
|
|
*/
|
|
|
|
int16_t internalAt(int32_t row, int32_t col) const;
|
|
|
|
|
|
|
|
// the following methods are never meant to be called and so are not defined
|
|
|
|
// (if you don't declare them, you get default implementations)
|
|
|
|
BreakDictionary(const BreakDictionary& that);
|
|
|
|
BreakDictionary& operator=(const BreakDictionary& that);
|
|
|
|
};
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
2000-01-10 20:26:57 +00:00
|
|
|
|
|
|
|
#endif
|