2006-03-23 00:54:12 +00:00
|
|
|
/**
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
|
|
|
* All Rights Reserved. *
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef DICTBE_H
|
|
|
|
#define DICTBE_H
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uniset.h"
|
2006-04-22 05:29:27 +00:00
|
|
|
#include "unicode/utext.h"
|
|
|
|
|
2006-03-23 00:54:12 +00:00
|
|
|
#include "brkeng.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
// Forward declaration: this header only refers to TrieWordDictionary through
// a pointer (see ThaiBreakEngine), so the full definition is not needed here.
class TrieWordDictionary;
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* DictionaryBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
|
|
|
|
* dictionary to determine language-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a DictionaryBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
2006-04-08 08:34:52 +00:00
|
|
|
class DictionaryBreakEngine : public LanguageBreakEngine {
|
2006-03-23 00:54:12 +00:00
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fSet;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The set of break types handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
uint32_t fTypes;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
DictionaryBreakEngine();
|
|
|
|
|
2006-05-26 00:57:09 +00:00
|
|
|
public:
|
|
|
|
|
2006-03-23 00:54:12 +00:00
|
|
|
/**
|
|
|
|
* <p>Constructor setting the break types handled.</p>
|
|
|
|
*
|
|
|
|
* @param breakTypes A bitmap of types handled by the engine.
|
|
|
|
*/
|
|
|
|
DictionaryBreakEngine( uint32_t breakTypes );
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~DictionaryBreakEngine();
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Indicate whether this engine handles a particular character for
|
|
|
|
* a particular kind of break.</p>
|
|
|
|
*
|
|
|
|
* @param c A character which begins a run that the engine might handle
|
|
|
|
* @param breakType The type of text break which the caller wants to determine
|
|
|
|
* @return TRUE if this engine handles the particular character and break
|
|
|
|
* type.
|
|
|
|
*/
|
|
|
|
virtual UBool handles( UChar32 c, int32_t breakType ) const;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Find any breaks within a run in the supplied text.</p>
|
|
|
|
*
|
2006-04-22 05:29:27 +00:00
|
|
|
* @param text A UText representing the text. The
|
2006-03-23 00:54:12 +00:00
|
|
|
* iterator is left at the end of the run of characters which the engine
|
|
|
|
* is capable of handling.
|
|
|
|
* @param startPos The start of the run within the supplied text.
|
|
|
|
* @param endPos The end of the run within the supplied text.
|
|
|
|
* @param reverse Whether the caller is looking for breaks in a reverse
|
|
|
|
* direction.
|
|
|
|
* @param breakType The type of break desired, or -1.
|
|
|
|
* @param foundBreaks An allocated C array of the breaks found, if any
|
|
|
|
* @return The number of breaks found.
|
|
|
|
*/
|
2006-04-22 05:29:27 +00:00
|
|
|
virtual int32_t findBreaks( UText *text,
|
2006-03-23 00:54:12 +00:00
|
|
|
int32_t startPos,
|
|
|
|
int32_t endPos,
|
|
|
|
UBool reverse,
|
|
|
|
int32_t breakType,
|
|
|
|
UStack &foundBreaks ) const;
|
|
|
|
|
|
|
|
protected:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Set the character set handled by this engine.</p>
|
|
|
|
*
|
|
|
|
* @param set A UnicodeSet of the set of characters handled by the engine
|
|
|
|
*/
|
|
|
|
virtual void setCharacters( UnicodeSet &set );
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Set the break types handled by this engine.</p>
|
|
|
|
*
|
|
|
|
* @param breakTypes A bitmap of types handled by the engine.
|
|
|
|
*/
|
2006-05-26 00:57:09 +00:00
|
|
|
// virtual void setBreakTypes( uint32_t breakTypes );
|
2006-03-23 00:54:12 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters.</p>
|
|
|
|
*
|
2006-04-22 05:29:27 +00:00
|
|
|
* @param text A UText representing the text
|
2006-03-23 00:54:12 +00:00
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
2006-04-22 05:29:27 +00:00
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
2006-03-23 00:54:12 +00:00
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
|
|
|
UStack &foundBreaks ) const = 0;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* ThaiBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
|
|
|
|
* TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a ThaiBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
2006-04-08 08:34:52 +00:00
|
|
|
class ThaiBreakEngine : public DictionaryBreakEngine {
|
2006-03-23 00:54:12 +00:00
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fThaiWordSet;
|
|
|
|
UnicodeSet fEndWordSet;
|
|
|
|
UnicodeSet fBeginWordSet;
|
|
|
|
UnicodeSet fSuffixSet;
|
|
|
|
UnicodeSet fMarkSet;
|
|
|
|
const TrieWordDictionary *fDictionary;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
* @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
|
|
|
|
* engine is deleted.
|
|
|
|
*/
|
|
|
|
ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~ThaiBreakEngine();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters.</p>
|
|
|
|
*
|
2006-04-22 05:29:27 +00:00
|
|
|
* @param text A UText representing the text
|
2006-03-23 00:54:12 +00:00
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
2006-04-22 05:29:27 +00:00
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
2006-03-23 00:54:12 +00:00
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
|
|
|
UStack &foundBreaks ) const;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
/* DICTBE_H */
|
|
|
|
#endif
|