scuffed-code/icu4c/source/common/unicode/dbbi.h

/*
**********************************************************************
*   Copyright (C) 1999-2001 IBM Corp. All rights reserved.
**********************************************************************
*   Date        Name        Description
*   12/1/99    rgillam     Complete port from Java.
*   01/13/2000 helena      Added UErrorCode to ctors.
**********************************************************************
*/

#ifndef DBBI_H
#define DBBI_H

#include "unicode/rbbi.h"

U_NAMESPACE_BEGIN

/* Forward declaration: this class is named in a friend declaration inside
 * DictionaryBasedBreakIterator, so its name must be known before that point. */
class DictionaryBasedBreakIteratorTables;

/**
 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
 * to further subdivide ranges of text beyond what is possible using just the
 * state-table-based algorithm.  This is necessary, for example, to handle
 * word and line breaking in Thai, which doesn't use spaces between words.  The
 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
 * up text as far as possible, and then contiguous ranges of letters are
 * repeatedly compared against a list of known words (i.e., the dictionary)
 * to divide them up into words.
 *
 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
 * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
 * name is used to identify characters in words in the dictionary.  The idea is that
 * if the iterator passes over a chunk of text that includes two or more characters
 * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
 * derives additional break positions (if possible) using the dictionary.
 *
 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
 * file.  It follows a prescribed search path to locate the dictionary (right now,
 * it looks for it in /com/ibm/text/resources in each directory in the classpath,
 * and won't find it in JAR files, but this location is likely to change).
 * NOTE(review): "classpath" and "JAR files" are Java concepts, so this search-path
 * detail presumably dates from the Java original this class was ported from;
 * verify what the C++ implementation actually does.  The dictionary file is in a
 * serialized binary format.  We have a very primitive (and slow) BuildDictionaryFile
 * utility for creating dictionary files, but aren't currently making it public.
 * Contact us for help.
 * <p>
 * <b> NOTE </b>  The DictionaryBasedBreakIterator class is still under development.
 * Its APIs are not yet stable.
 */
class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {

private:
    /**
     * Scratch counter: the number of dictionary characters seen in the most
     * recent range that next() passed over.
     */
    int32_t dictionaryCharCount;

    /**
     * Cache of the break positions discovered when a range of characters was
     * divided up with the dictionary.  As long as the iterator stays inside
     * that range, neither the dictionary nor the state table has to be
     * consulted again.
     *
     * NOTE(review): this class holds a raw pointer and declares a destructor
     * and operator=, but no copy constructor is declared here -- confirm that
     * clone() does not rely on an implicit member-wise copy of this pointer.
     */
    int32_t *cachedBreakPositions;

    /**
     * Element count of cachedBreakPositions.
     */
    int32_t numCachedBreakPositions;

    /**
     * Index of the cache entry that the current iteration position refers
     * to.  Only meaningful while cachedBreakPositions is not null.
     */
    int32_t positionInCache;

    /**
     * Static member whose address is handed out as this class's ID by
     * getStaticClassID().
     */
    static const char fgClassID;

    /**
     * Constructs a dictionary-based break boundary detection iterator.
     * The constructor is private; only this class and its declared friends
     * can call it.
     *
     * @param tablesImage The location for the dictionary to be loaded into memory
     * @param dictionaryFilename The name of the dictionary file
     * @param status Receives status information about the construction.
     *        Check U_SUCCESS(status) to find out whether construction
     *        succeeded.  More detailed results may also be reported through
     *        it; for example, U_FILE_ACCESS_ERROR will be returned if the
     *        file does not exist.
     *
     * The caller owns the constructed object and is responsible for
     * deleting it.
     */
    DictionaryBasedBreakIterator(UDataMemory *tablesImage, const char *dictionaryFilename, UErrorCode &status);

public:
    // -----------------------------------------------------------------------
    // boilerplate
    // -----------------------------------------------------------------------

    /**
     * Destructor.
     */
    virtual ~DictionaryBasedBreakIterator();

    /**
     * Assignment operator.  After assignment this iterator has the same
     * behavior, and iterates over the same text, as the one passed in.
     * @param that The iterator to copy from.
     * @return A reference to an iterator of this class.
     */
    DictionaryBasedBreakIterator &operator=(const DictionaryBasedBreakIterator &that);

    /**
     * Returns a newly-constructed iterator with the same behavior, and
     * iterating over the same text, as this one.
     * @return The new iterator, typed as a BreakIterator pointer.
     */
    virtual BreakIterator *clone(void) const;

    // -----------------------------------------------------------------------
    // BreakIterator overrides
    // -----------------------------------------------------------------------

    /**
     * Moves the iterator backwards, to the last boundary preceding the
     * current one.
     * @return The position of the last boundary position preceding this one.
     */
    virtual int32_t previous(void);

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     */
    virtual int32_t following(int32_t offset);

    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     */
    virtual int32_t preceding(int32_t offset);

    /**
     * Polymorphic class-ID accessor (override of a pure virtual).  It
     * provides a simple form of RTTI, since not all C++ compilers support
     * genuine RTTI.  Polymorphic operator==() and clone() methods call it.
     *
     * @return          The class ID for this object.  All objects of a
     *                  given class have the same class ID; objects of
     *                  other classes have different class IDs.
     */
    virtual UClassID getDynamicClassID(void) const;

    /**
     * Returns the class ID for this class.  Its only use is for comparison
     * against a value returned by getDynamicClassID().  For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return          The class ID for all objects of this class.
     */
    static UClassID getStaticClassID(void);

protected:
    // -----------------------------------------------------------------------
    // implementation
    // -----------------------------------------------------------------------

    /**
     * The actual implementation behind next(); all iteration is routed
     * through this method.  It starts the state machine in state 1 and
     * advances through the text one character at a time until either the
     * end of the text is reached or the state machine transitions to
     * state 0.  The return value is updated each time the state machine
     * passes through a possible end state.
     * @return The resulting boundary position.
     */
    virtual int32_t handleNext(void);

    /**
     * Dumps the cache of break positions (usually in response to a change
     * in position of some sort).
     */
    virtual void reset(void);

    /**
     * NOTE(review): undocumented in the original header.  Presumably
     * produces a clone of this iterator using the caller-supplied
     * stackBuffer / BufferSize and reports problems through status --
     * confirm against the implementation before relying on this.
     */
    virtual BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status);


private:
    /**
     * Implements the dictionary-based algorithm itself.  Given the two
     * endpoints of a range of text, it uses the dictionary to determine
     * the positions of any boundaries inside that range, and stores every
     * boundary it finds in cachedBreakPositions so the work is done only
     * once each time the range is entered.
     * @param startPos One endpoint of the range of text.
     * @param endPos The other endpoint of the range of text.
     * @param status Error code in/out parameter.
     */
    void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);

    /**
     * Used by the tables object to increment the count of dictionary
     * characters during iteration.
     */
    void bumpDictionaryCharCount(void);

    /*
     * HSYS : Please revisit with Rich -- the constructors of the DBBI class
     * are currently marked private, so construction goes through the
     * friend classes declared below.
     */
    friend class DictionaryBasedBreakIteratorTables;
    friend class BreakIterator;
};

/**
 * Polymorphic class-ID accessor.
 *
 * Reports the ID of DictionaryBasedBreakIterator itself, so that the
 * documented RTTI idiom
 *
 *      ptr->getDynamicClassID() == DictionaryBasedBreakIterator::getStaticClassID()
 *
 * holds for objects of this class.  Returning the base class's
 * RuleBasedBreakIterator::getStaticClassID() here would make objects of
 * this class indistinguishable from plain RuleBasedBreakIterator objects.
 *
 * @return The class ID shared by all DictionaryBasedBreakIterator objects.
 */
inline UClassID DictionaryBasedBreakIterator::getDynamicClassID(void) const {
    return DictionaryBasedBreakIterator::getStaticClassID();
}

/**
 * Class-level ID accessor: yields the address of the private static member
 * fgClassID, converted to UClassID.
 *
 * @return The class ID for all objects of this class.
 */
inline UClassID DictionaryBasedBreakIterator::getStaticClassID(void) {
    const char *classIdAnchor = &fgClassID;
    return (UClassID)classIdAnchor;
}

/**
 * Adds one to dictionaryCharCount, the running count of dictionary
 * characters.
 */
inline void DictionaryBasedBreakIterator::bumpDictionaryCharCount(void) {
    dictionaryCharCount += 1;
}
U_NAMESPACE_END

#endif
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`/*`
			`**********************************************************************`
ICU-903 updated copyright notices X-SVN-Rev: 4237 2001-03-21 20:31:13 +00:00			`* Copyright (C) 1999-2001 IBM Corp. All rights reserved.`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`**********************************************************************`
			`* Date Name Description`
			`* 12/1/99 rgillam Complete port from Java.`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`* 01/13/2000 helena Added UErrorCode to ctors.`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`**********************************************************************`
			`*/`

			`#ifndef DBBI_H`
			`#define DBBI_H`

			`#include "unicode/rbbi.h"`

ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_BEGIN`

ICU-201 add forward declarations before 'friend' statements X-SVN-Rev: 633 2000-01-18 19:57:46 +00:00			`/* forward declaration */`
			`class DictionaryBasedBreakIteratorTables;`

ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`/**`
			`* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary`
			`* to further subdivide ranges of text beyond what is possible using just the`
			`* state-table-based algorithm. This is necessary, for example, to handle`
			`* word and line breaking in Thai, which doesn't use spaces between words. The`
			`* state-table-based algorithm used by RuleBasedBreakIterator is used to divide`
			`* up text as far as possible, and then contiguous ranges of letters are`
			`* repeatedly compared against a list of known words (i.e., the dictionary)`
			`* to divide them up into words.`
			`*`
			`* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,`
			`* but adds one more special substitution name: <dictionary>. This substitution`
			`* name is used to identify characters in words in the dictionary. The idea is that`
			`* if the iterator passes over a chunk of text that includes two or more characters`
			`* in a row that are included in <dictionary>, it goes back through that range and`
			`* derives additional break positions (if possible) using the dictionary.`
			`*`
			`* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary`
			`* file. It follows a prescribed search path to locate the dictionary (right now,`
			`* it looks for it in /com/ibm/text/resources in each directory in the classpath,`
			`* and won't find it in JAR files, but this location is likely to change). The`
			`* dictionary file is in a serialized binary format. We have a very primitive (and`
			`* slow) BuildDictionaryFile utility for creating dictionary files, but aren't`
			`* currently making it public. Contact us for help.`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`* <p>`
			`* <b> NOTE </b> The DictionaryBasedIterator class is still under development. The`
			`* APIs are not in stable condition yet.`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`*/`
ICU-1126 move break iterator code to common library so that titlecasing can use it X-SVN-Rev: 7728 2002-02-21 04:42:32 +00:00			`class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`private:`
			`/**`
			`* a temporary hiding place for the number of dictionary characters in the`
			`* last range passed over by next()`
			`*/`
			`int32_t dictionaryCharCount;`

			`/**`
			`* when a range of characters is divided up using the dictionary, the break`
			`* positions that are discovered are stored here, preventing us from having`
			`* to use either the dictionary or the state table again until the iterator`
			`* leaves this range of text`
			`*/`
			`int32_t* cachedBreakPositions;`

			`/**`
			`* The number of elements in cachedBreakPositions`
			`*/`
			`int32_t numCachedBreakPositions;`

			`/**`
			`* if cachedBreakPositions is not null, this indicates which item in the`
			`* cache the current iteration position refers to`
			`*/`
			`int32_t positionInCache;`

			`/**`
			`* Class ID`
			`*/`
ICU-1099 Make some data a bit more const X-SVN-Rev: 6138 2001-10-09 22:57:29 +00:00			`static const char fgClassID;`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`/**=======================================================================`
			`* Create a dictionary based break boundary detection iterator.`
			`* @param tablesImage The location for the dictionary to be loaded into memory`
			`* @param dictionaryFilename The name of the dictionary file`
			`* @param status the error code status`
			`* @return A dictionary based break detection iterator. The UErrorCode& status`
			`* parameter is used to return status information to the user.`
			`* To check whether the construction succeeded or not, you should check`
			`* the value of U_SUCCESS(err). If you wish more detailed information, you`
			`* can check for informational error results which still indicate success. For example,`
			`* U_FILE_ACCESS_ERROR will be returned if the file does not exist.`
			`* The caller owns the returned object and is responsible for deleting it.`
			`======================================================================= */`
ICU-1099 Make some data a bit more const X-SVN-Rev: 6138 2001-10-09 22:57:29 +00:00			`DictionaryBasedBreakIterator(UDataMemory* tablesImage, const char* dictionaryFilename, UErrorCode& status);`

			`public:`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`//=======================================================================`
			`// boilerplate`
			`//=======================================================================`

			`/**`
			`* Destructor`
			`*/`
			`virtual ~DictionaryBasedBreakIterator();`

			`/**`
			`* Assignment operator. Sets this iterator to have the same behavior,`
			`* and iterate over the same text, as the one passed in.`
			`*/`
			`DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);`

			`/**`
			`* Returns a newly-constructed RuleBasedBreakIterator with the same`
			`* behavior, and iterating over the same text, as this one.`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`virtual BreakIterator* clone(void) const;`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`//=======================================================================`
			`// BreakIterator overrides`
			`//=======================================================================`
			`/**`
			`* Advances the iterator backwards, to the last boundary preceding this one.`
			`* @return The position of the last boundary position preceding this one.`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`virtual int32_t previous(void);`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`/**`
			`* Sets the iterator to refer to the first boundary position following`
			`* the specified position.`
			`* @offset The position from which to begin searching for a break position.`
			`* @return The position of the first break after the current position.`
			`*/`
			`virtual int32_t following(int32_t offset);`

			`/**`
			`* Sets the iterator to refer to the last boundary position before the`
			`* specified position.`
			`* @offset The position to begin searching for a break from.`
			`* @return The position of the last boundary before the starting position.`
			`*/`
			`virtual int32_t preceding(int32_t offset);`

			`/**`
			`* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.`
			`* This method is to implement a simple version of RTTI, since not all`
			`* C++ compilers support genuine RTTI. Polymorphic operator==() and`
			`* clone() methods call this method.`
			`*`
			`* @return The class ID for this object. All objects of a`
			`* given class have the same class ID. Objects of`
			`* other classes have different class IDs.`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`virtual UClassID getDynamicClassID(void) const;`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`/**`
			`* Returns the class ID for this class. This is useful only for`
			`* comparing to a return value from getDynamicClassID(). For example:`
			`*`
			`* Base* polymorphic_pointer = createPolymorphicObject();`
			`* if (polymorphic_pointer->getDynamicClassID() ==`
			`* Derived::getStaticClassID()) ...`
			`*`
			`* @return The class ID for all objects of this class.`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`static UClassID getStaticClassID(void);`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`protected:`
			`//=======================================================================`
			`// implementation`
			`//=======================================================================`
			`/**`
			`* This method is the actual implementation of the next() method. All iteration`
			`* vectors through here. This method initializes the state machine to state 1`
			`* and advances through the text character by character until we reach the end`
			`* of the text or the state machine transitions to state 0. We update our return`
			`* value every time the state machine passes through a possible end state.`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`virtual int32_t handleNext(void);`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`/**`
			`* dumps the cache of break positions (usually in response to a change in`
			`* position of some sort)`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`virtual void reset(void);`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
ICU-1096 Make UVector and UStack use UErrorCode for U_MEMORY_ALLOCATION_ERROR X-SVN-Rev: 5565 2001-08-23 01:06:08 +00:00			`virtual BreakIterator * createBufferClone(void *stackBuffer,`
ICU-853 Implementations for ubrk_safeClone, including C++ API changes needed (createBufferClone & isBufferClone in BreakIterator classes) X-SVN-Rev: 3713 2001-02-21 23:40:41 +00:00			`int32_t &BufferSize,`
			`UErrorCode &status);`


ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`private:`
			`/**`
			`* This is the function that actually implements the dictionary-based`
			`* algorithm. Given the endpoints of a range of text, it uses the`
			`* dictionary to determine the positions of any boundaries in this`
			`* range. It stores all the boundary positions it discovers in`
			`* cachedBreakPositions so that we only have to do this work once`
			`* for each time we enter the range.`
			`*/`
ICU-1096 Make UVector and UStack use UErrorCode for U_MEMORY_ALLOCATION_ERROR X-SVN-Rev: 5565 2001-08-23 01:06:08 +00:00			`void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`/**`
			`* Used by the tables object to increment the count of dictionary characters`
			`* during iteration`
			`*/`
ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`void bumpDictionaryCharCount(void);`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
ICU-45 Updated the dbbi.h and rbbi.h not to expose the ctor APIs. X-SVN-Rev: 693 2000-01-25 17:57:36 +00:00			`/*`
			`* HSYS : Please revisit with Rich, the ctors of the DBBI class is currently`
			`* marked as private.`
			`*/`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`friend class DictionaryBasedBreakIteratorTables;`
ICU-45 Updated the dbbi.h and rbbi.h not to expose the ctor APIs. X-SVN-Rev: 693 2000-01-25 17:57:36 +00:00			`friend class BreakIterator;`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`};`

ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`inline UClassID DictionaryBasedBreakIterator::getDynamicClassID(void) const {`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`return RuleBasedBreakIterator::getStaticClassID();`
			`}`

ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`inline UClassID DictionaryBasedBreakIterator::getStaticClassID(void) {`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`return (UClassID)(&fgClassID);`
			`}`

ICU-45 Added ErrorCode to the constructor APIs. . X-SVN-Rev: 578 2000-01-14 00:13:59 +00:00			`inline void DictionaryBasedBreakIterator::bumpDictionaryCharCount(void) {`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00			`++dictionaryCharCount;`
			`}`
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_END`
ICU-45 Fixed the include path problems. X-SVN-Rev: 507 2000-01-10 21:21:52 +00:00
			`#endif`