scuffed-code/icu4c/source/common/brkdict.h

/*
**********************************************************************
*   Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*   12/1/99     rtg         Ported from Java
*   01/13/2000  helena      Added UErrorCode to ctors.
**********************************************************************
*/

#ifndef BRKDICT_H
#define BRKDICT_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "ucmp8.h"
#include "umemstrm.h"

U_NAMESPACE_BEGIN

/**
 * This is the class that represents the list of known words used by
 * DictionaryBasedBreakIterator.  The conceptual data structure used
 * here is a trie: there is a node hanging off the root node for every
 * letter that can start a word.  Each of these nodes has a node hanging
 * off of it for every letter that can be the second letter of a word
 * if this node is the first letter, and so on.  The trie is represented
 * as a two-dimensional array that can be treated as a table of state
 * transitions.  Indexes are used to compress this array, taking
 * advantage of the fact that this array will always be very sparse.
 */
class BreakDictionary : public UMemory {
    //=================================================================================
    // data members
    //=================================================================================
private:

    /**
     * Maps from characters to column numbers.  The main use of this is to
     * avoid making room in the array for empty columns.
     */
    CompactByteArray* columnMap;

    /**
     * The number of actual columns in the table
     */
    int32_t numCols;

    /**
     * Columns are organized into groups of 32.  This says how many
     * column groups.  (We could calculate this, but we store the
     * value to avoid having to repeatedly calculate it.)
     */
    int32_t numColGroups;

    /**
     * The actual compressed state table.  Each conceptual row represents
     * a state, and the cells in it contain the row numbers of the states
     * to transition to for each possible letter.  0 is used to indicate
     * an illegal combination of letters (i.e., the error state).  The
     * table is compressed by eliminating all the unpopulated (i.e., zero)
     * cells.  Multiple conceptual rows can then be doubled up in a single
     * physical row by sliding them up and possibly shifting them to one
     * side or the other so the populated cells don't collide.  Indexes
     * are used to identify unpopulated cells and to locate populated cells.
     */
    int16_t* table;

    /**
     * This index maps logical row numbers to physical row numbers
     */
    int16_t* rowIndex;

    /**
     * A bitmap is used to tell which cells in the conceptual table are
     * populated.  This array contains all the unique bit combinations
     * in that bitmap.  If the table is more than 32 columns wide,
     * successive entries in this array are used for a single row.
     */
    int32_t* rowIndexFlags;

    /**
     * This index maps from a logical row number into the bitmap table above.
     * (This keeps us from storing duplicate bitmap combinations.)  Since there
     * are a lot of rows with only one populated cell, instead of wasting space
     * in the bitmap table, we just store a negative number in this index for
     * rows with one populated cell.  The absolute value of that number is
     * the column number of the populated cell.
     */
    int16_t* rowIndexFlagsIndex;

    /**
     * For each logical row, this index contains a constant that is added to
     * the logical column number to get the physical column number
     */
    int8_t* rowIndexShifts;

    //=================================================================================
    // deserialization
    //=================================================================================

public:
    /**
     * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
     * load the dictionary tables.
     * @param dictionaryFilename The name of the dictionary file
     * @param status Receives the error code if an error occurs
     */
    BreakDictionary(const char* dictionaryFilename, UErrorCode& status);

    /**
     * Destructor.
     */
    ~BreakDictionary();

    /**
     * Reads the serialized dictionary data from the supplied memory stream
     * and constructs the appropriate in-memory representation.
     * @param in The memory stream to read the dictionary data from
     */
    void readDictionaryFile(UMemoryStream* in);

    //=================================================================================
    // access to the words
    //=================================================================================

    /**
     * Uses the column map to map the character to a column number, then
     * passes the row and column number to the other version of at()
     * @param row The current state
     * @param ch The character whose column we're interested in
     * @return The new state to transition to
     */
    int16_t at(int32_t row, UChar ch) const;

    /**
     * Returns the value in the cell with the specified (logical) row and
     * column numbers.  In DictionaryBasedBreakIterator, the row number is
     * a state number, the column number is an input, and the return value
     * is the row number of the new state to transition to.  (0 is the
     * "error" state, and -1 is the "end of word" state in a dictionary)
     * @param row The row number of the current state
     * @param col The column number of the input character (0 means "not a
     * dictionary character")
     * @return The row number of the new state to transition to
     */
    int16_t at(int32_t row, int32_t col) const;

private:
    /**
     * Given (logical) row and column numbers, returns true if the
     * cell in that position is populated
     * @param row The LOGICAL row number of the cell
     * @param col The LOGICAL column number of the cell
     * @return true if the cell in that position is populated
     */
    UBool cellIsPopulated(int32_t row, int32_t col) const;

    /**
     * Implementation of at() when we know the specified cell is populated.
     * @param row The PHYSICAL row number of the cell
     * @param col The PHYSICAL column number of the cell
     * @return The value stored in the cell
     */
    int16_t internalAt(int32_t row, int32_t col) const;

    // Copying is not supported.  The copy constructor and assignment operator
    // below are never meant to be called and so are declared private and left
    // undefined (if you don't declare them, you get default implementations).
    BreakDictionary(const BreakDictionary& that);
    BreakDictionary& operator=(const BreakDictionary& that);
};

U_NAMESPACE_END

#endif
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`/*`
			`**********************************************************************`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`* Copyright (C) 1999-2000 IBM and others. All rights reserved.`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`**********************************************************************`
			`* Date Name Description`
			`* 12/1/99 rtg Ported from Java`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`* 01/13/2000 helena Added UErrorCode to ctors.`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`**********************************************************************`
			`*/`

			`#ifndef BRKDICT_H`
			`#define BRKDICT_H`

ICU-1962 derive all ICU C++ classes from common UObject base class X-SVN-Rev: 8953 2002-06-27 01:19:20 +00:00			`#include "unicode/utypes.h"`
			`#include "unicode/uobject.h"`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`#include "ucmp8.h"`
ICU-489 breakiterator now uses binary type in resb for thai X-SVN-Rev: 1772 2000-07-10 20:16:27 +00:00			`#include "umemstrm.h"`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_BEGIN`

ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`/**`
			`* This is the class that represents the list of known words used by`
			`* DictionaryBasedBreakIterator. The conceptual data structure used`
			`* here is a trie: there is a node hanging off the root node for every`
			`* letter that can start a word. Each of these nodes has a node hanging`
			`* off of it for every letter that can be the second letter of a word`
			`* if this node is the first letter, and so on. The trie is represented`
			`* as a two-dimensional array that can be treated as a table of state`
			`* transitions. Indexes are used to compress this array, taking`
			`* advantage of the fact that this array will always be very sparse.`
			`*/`
ICU-2244 change internal classes to inherit UMemory, not UObject X-SVN-Rev: 9960 2002-10-04 01:23:34 +00:00			`class BreakDictionary : public UMemory {`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`//=================================================================================`
			`// data members`
			`//=================================================================================`
			`private:`

			`/**`
			`* Maps from characters to column numbers. The main use of this is to`
			`* avoid making room in the array for empty columns.`
			`*/`
			`CompactByteArray* columnMap;`

			`/**`
			`* The number of actual columns in the table`
			`*/`
			`int32_t numCols;`

			`/**`
			`* Columns are organized into groups of 32. This says how many`
			`* column groups. (We could calculate this, but we store the`
			`* value to avoid having to repeatedly calculate it.)`
			`*/`
			`int32_t numColGroups;`

			`/**`
			`* The actual compressed state table. Each conceptual row represents`
			`* a state, and the cells in it contain the row numbers of the states`
			`* to transition to for each possible letter. 0 is used to indicate`
			`* an illegal combination of letters (i.e., the error state). The`
			`* table is compressed by eliminating all the unpopulated (i.e., zero)`
			`* cells. Multiple conceptual rows can then be doubled up in a single`
			`* physical row by sliding them up and possibly shifting them to one`
			`* side or the other so the populated cells don't collide. Indexes`
			`* are used to identify unpopulated cells and to locate populated cells.`
			`*/`
			`int16_t* table;`

			`/**`
			`* This index maps logical row numbers to physical row numbers`
			`*/`
			`int16_t* rowIndex;`

			`/**`
			`* A bitmap is used to tell which cells in the conceptual table are`
			`* populated. This array contains all the unique bit combinations`
			`* in that bitmap. If the table is more than 32 columns wide,`
			`* successive entries in this array are used for a single row.`
			`*/`
			`int32_t* rowIndexFlags;`

			`/**`
			`* This index maps from a logical row number into the bitmap table above.`
			`* (This keeps us from storing duplicate bitmap combinations.) Since there`
			`* are a lot of rows with only one populated cell, instead of wasting space`
			`* in the bitmap table, we just store a negative number in this index for`
			`* rows with one populated cell. The absolute value of that number is`
			`* the column number of the populated cell.`
			`*/`
			`int16_t* rowIndexFlagsIndex;`

			`/**`
			`* For each logical row, this index contains a constant that is added to`
			`* the logical column number to get the physical column number`
			`*/`
			`int8_t* rowIndexShifts;`

			`//=================================================================================`
			`// deserialization`
			`//=================================================================================`

			`public:`
			`/**`
			`* Constructor. Creates the BreakDictionary by using readDictionaryFile() to`
			`* load the dictionary tables from the disk.`
ICU-1953 Check APIs comments for @param, @return... X-SVN-Rev: 9024 2002-07-03 12:05:56 +00:00			`* @param dictionaryFilename The name of the dictionary file`
			`* @param status for errors if it occurs`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`*/`
ICU-1099 Make some data a bit more const X-SVN-Rev: 6138 2001-10-09 22:57:29 +00:00			`BreakDictionary(const char* dictionaryFilename, UErrorCode& status);`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00
			`/**`
			`* Destructor.`
			`*/`
			`~BreakDictionary();`

			`/**`
			`* Reads the dictionary file on the disk and constructs the appropriate in-memory`
			`* representation.`
ICU-1953 Check APIs comments for @param, @return... X-SVN-Rev: 9024 2002-07-03 12:05:56 +00:00			`* @param in The given memory stream`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`*/`
ICU-489 breakiterator now uses binary type in resb for thai X-SVN-Rev: 1772 2000-07-10 20:16:27 +00:00			`void readDictionaryFile(UMemoryStream* in);`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00
			`//=================================================================================`
			`// access to the words`
			`//=================================================================================`

			`/**`
			`* Uses the column map to map the character to a column number, then`
			`* passes the row and column number to the other version of at()`
			`* @param row The current state`
			`* @param ch The character whose column we're interested in`
			`* @return The new state to transition to`
			`*/`
			`int16_t at(int32_t row, UChar ch) const;`

			`/**`
			`* Returns the value in the cell with the specified (logical) row and`
			`* column numbers. In DictionaryBasedBreakIterator, the row number is`
			`* a state number, the column number is an input, and the return value`
			`* is the row number of the new state to transition to. (0 is the`
			`* "error" state, and -1 is the "end of word" state in a dictionary)`
			`* @param row The row number of the current state`
			`* @param col The column number of the input character (0 means "not a`
			`* dictionary character")`
			`* @return The row number of the new state to transition to`
			`*/`
			`int16_t at(int32_t row, int32_t col) const;`

			`private:`
			`/**`
			`* Given (logical) row and column numbers, returns true if the`
			`* cell in that position is populated`
ICU-1953 Check APIs comments for @param, @return... X-SVN-Rev: 9024 2002-07-03 12:05:56 +00:00			`* @param row The LOGICAL row number of the cell`
			`* @param col The LOGICAL column number of the cell`
			`* @return true if the cell in that position is populated`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00			`*/`
ICU-351 Define UBool to be used in the APIs. X-SVN-Rev: 1410 2000-05-18 22:08:39 +00:00			`UBool cellIsPopulated(int32_t row, int32_t col) const;`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00
			`/**`
			`* Implementation of at() when we know the specified cell is populated.`
			`* @param row The PHYSICAL row number of the cell`
			`* @param col The PHYSICAL column number of the cell`
			`* @return The value stored in the cell`
			`*/`
			`int16_t internalAt(int32_t row, int32_t col) const;`

			`// the following methods are never meant to be called and so are not defined`
			`// (if you don't declare them, you get default implementations)`
			`BreakDictionary(const BreakDictionary& that);`
			`BreakDictionary& operator=(const BreakDictionary& that);`
			`};`

ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_END`
ICU-45 Check in initial version of the RBBI implementation. X-SVN-Rev: 505 2000-01-10 20:26:57 +00:00
			`#endif`