1999-08-16 21:50:52 +00:00
|
|
|
|
/*
|
1999-11-22 21:47:27 +00:00
|
|
|
|
* Copyright <EFBFBD> {1997-1999}, International Business Machines Corporation and others. All Rights Reserved.
|
1999-08-16 21:50:52 +00:00
|
|
|
|
*****************************************************************************************
|
|
|
|
|
*
|
|
|
|
|
* File TXTBDAT.H
|
|
|
|
|
*
|
|
|
|
|
* Modification History:
|
|
|
|
|
*
|
|
|
|
|
* Date Name Description
|
|
|
|
|
* 02/18/97 aliu Converted from OpenClass.
|
|
|
|
|
* Made static data members const where appropriate.
|
|
|
|
|
* 03/25/97 aliu Removed subclasses, and merged their static data into this
|
|
|
|
|
* class. Instantiated four static instances for character,
|
|
|
|
|
* word, sentence, and line. Made forward(), backward(), and
|
|
|
|
|
* map() methods inline.
|
|
|
|
|
* 04/15/97 aliu Worked around bug in AIX xlC compiler which occurs if static
|
|
|
|
|
* arrays contain const elements.
|
|
|
|
|
* 05/06/97 aliu Made kSI, kStop, and kSI_Stop into #defines to help out
|
|
|
|
|
* non-compliant compilers.
|
|
|
|
|
*****************************************************************************************
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#ifndef TXTBDAT_H
|
|
|
|
|
#define TXTBDAT_H
|
|
|
|
|
|
1999-12-28 23:57:50 +00:00
|
|
|
|
#include "unicode/utypes.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
class WordBreakTable;
|
|
|
|
|
class UnicodeClassMapping;
|
|
|
|
|
class SpecialMapping;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This class wraps up the data tables needed for SimpleTextBoundary.
|
|
|
|
|
* It is statically instantiated for each type of text boundary. This
|
|
|
|
|
* class is not designed to be subclassed.
|
|
|
|
|
*/
|
|
|
|
|
class TextBoundaryData {
|
|
|
|
|
public:
|
|
|
|
|
~TextBoundaryData() {} // Do not subclass
|
|
|
|
|
|
|
|
|
|
// Fast inline accessors
|
|
|
|
|
const WordBreakTable* forward(void) const;
|
|
|
|
|
const WordBreakTable* backward(void) const;
|
|
|
|
|
const UnicodeClassMapping* map(void) const;
|
|
|
|
|
|
|
|
|
|
static const TextBoundaryData kCharacterBreakData;
|
|
|
|
|
static const TextBoundaryData kWordBreakData;
|
|
|
|
|
static const TextBoundaryData kLineBreakData;
|
|
|
|
|
static const TextBoundaryData kSentenceBreakData;
|
|
|
|
|
|
|
|
|
|
typedef uint8_t Node;
|
|
|
|
|
typedef uint8_t Type;
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
static const UChar ASCII_END_OF_TEXT;
|
|
|
|
|
static const UChar ASCII_HORIZONTAL_TABULATION;
|
|
|
|
|
static const UChar ASCII_LINEFEED;
|
|
|
|
|
static const UChar ASCII_VERTICAL_TABULATION;
|
|
|
|
|
static const UChar ASCII_FORM_FEED;
|
|
|
|
|
static const UChar ASCII_CARRIAGE_RETURN;
|
|
|
|
|
static const UChar ASCII_SPACE;
|
|
|
|
|
static const UChar ASCII_EXCLAMATION_MARK;
|
|
|
|
|
static const UChar ASCII_QUOTATION_MARK;
|
|
|
|
|
static const UChar ASCII_NUMBER_SIGN;
|
|
|
|
|
static const UChar ASCII_DOLLAR_SIGN;
|
|
|
|
|
static const UChar ASCII_PERCENT;
|
|
|
|
|
static const UChar ASCII_AMPERSAND;
|
|
|
|
|
static const UChar ASCII_APOSTROPHE;
|
|
|
|
|
static const UChar ASCII_COMMA;
|
|
|
|
|
static const UChar ASCII_FULL_STOP;
|
|
|
|
|
static const UChar ASCII_COLON;
|
|
|
|
|
static const UChar ASCII_SEMICOLON;
|
|
|
|
|
static const UChar ASCII_QUESTION_MARK;
|
|
|
|
|
static const UChar ASCII_NONBREAKING_SPACE;
|
|
|
|
|
static const UChar ASCII_CENT_SIGN;
|
|
|
|
|
static const UChar ASCII_POUND_SIGN;
|
|
|
|
|
static const UChar ASCII_YEN_SIGN;
|
|
|
|
|
static const UChar LATIN1_SOFTHYPHEN;
|
|
|
|
|
static const UChar LATIN1_DEGREE_SIGN;
|
|
|
|
|
static const UChar ARABIC_PERCENT_SIGN;
|
|
|
|
|
static const UChar ARABIC_DECIMAL_SEPARATOR;
|
|
|
|
|
static const UChar HANGUL_CHOSEONG_LOW;
|
|
|
|
|
static const UChar HANGUL_CHOSEONG_HIGH;
|
|
|
|
|
static const UChar HANGUL_JUNGSEONG_LOW;
|
|
|
|
|
static const UChar HANGUL_JUNGSEONG_HIGH;
|
|
|
|
|
static const UChar HANGUL_JONGSEONG_LOW;
|
|
|
|
|
static const UChar HANGUL_JONGSEONG_HIGH;
|
|
|
|
|
static const UChar FIGURE_SPACE;
|
|
|
|
|
static const UChar NONBREAKING_HYPHEN;
|
|
|
|
|
static const UChar PUNCTUATION_HYPHENATION_POINT;
|
|
|
|
|
static const UChar PUNCTUATION_LINE_SEPARATOR;
|
|
|
|
|
static const UChar PUNCTUATION_PARAGRAPH_SEPARATOR;
|
|
|
|
|
static const UChar PER_MILLE_SIGN;
|
|
|
|
|
static const UChar PER_TEN_THOUSAND_SIGN;
|
|
|
|
|
static const UChar PRIME;
|
|
|
|
|
static const UChar DOUBLE_PRIME;
|
|
|
|
|
static const UChar TRIPLE_PRIME;
|
|
|
|
|
static const UChar DEGREE_CELSIUS;
|
|
|
|
|
static const UChar DEGREE_FAHRENHEIT;
|
|
|
|
|
static const UChar PUNCTUATION_IDEOGRAPHIC_COMMA;
|
|
|
|
|
static const UChar PUNCTUATION_IDEOGRAPHIC_FULL_STOP;
|
|
|
|
|
static const UChar IDEOGRAPHIC_ITERATION_MARK;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_A;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_A;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_I;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_I;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_U;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_U;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_E;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_E;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_O;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_O;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_DI;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_TU;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_TU;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_MO;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_YA;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_YA;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_YU;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_YU;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_YO;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_YO;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_RO;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_SMALL_WA;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_WA;
|
|
|
|
|
static const UChar HIRAGANA_LETTER_VU;
|
|
|
|
|
static const UChar COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK;
|
|
|
|
|
static const UChar HIRAGANA_SEMIVOICED_SOUND_MARK;
|
|
|
|
|
static const UChar HIRAGANA_ITERATION_MARK;
|
|
|
|
|
static const UChar HIRAGANA_VOICED_ITERATION_MARK;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_A;
|
|
|
|
|
static const UChar KATAKANA_LETTER_A;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_I;
|
|
|
|
|
static const UChar KATAKANA_LETTER_I;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_U;
|
|
|
|
|
static const UChar KATAKANA_LETTER_U;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_E;
|
|
|
|
|
static const UChar KATAKANA_LETTER_E;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_O;
|
|
|
|
|
static const UChar KATAKANA_LETTER_O;
|
|
|
|
|
static const UChar KATAKANA_LETTER_DI;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_TU;
|
|
|
|
|
static const UChar KATAKANA_LETTER_TU;
|
|
|
|
|
static const UChar KATAKANA_LETTER_MO;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_YA;
|
|
|
|
|
static const UChar KATAKANA_LETTER_YA;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_YU;
|
|
|
|
|
static const UChar KATAKANA_LETTER_YU;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_YO;
|
|
|
|
|
static const UChar KATAKANA_LETTER_YO;
|
|
|
|
|
static const UChar KATAKANA_LETTER_RO;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_WA;
|
|
|
|
|
static const UChar KATAKANA_LETTER_WA;
|
|
|
|
|
static const UChar KATAKANA_LETTER_VU;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_KA;
|
|
|
|
|
static const UChar KATAKANA_LETTER_SMALL_KE;
|
|
|
|
|
static const UChar KATAKANA_LETTER_VA;
|
|
|
|
|
static const UChar KATAKANA_LETTER_VO;
|
|
|
|
|
static const UChar KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK;
|
|
|
|
|
static const UChar KATAKANA_ITERATION_MARK;
|
|
|
|
|
static const UChar KATAKANA_VOICED_ITERATION_MARK;
|
|
|
|
|
static const UChar UNICODE_LOW_BOUND_HAN;
|
|
|
|
|
static const UChar UNICODE_HIGH_BOUND_HAN;
|
|
|
|
|
static const UChar HANGUL_SYL_LOW;
|
|
|
|
|
static const UChar HANGUL_SYL_HIGH;
|
|
|
|
|
static const UChar CJK_COMPATIBILITY_F900;
|
|
|
|
|
static const UChar CJK_COMPATIBILITY_FA2D;
|
|
|
|
|
static const UChar UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE;
|
|
|
|
|
static const UChar FULLWIDTH_EXCLAMATION_MARK;
|
|
|
|
|
static const UChar FULLWIDTH_FULL_STOP;
|
|
|
|
|
static const UChar FULLWIDTH_QUESTION_MARK;
|
|
|
|
|
static const UChar END_OF_STRING;
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
// Character data
|
|
|
|
|
enum CharacterMapping
|
|
|
|
|
{
|
|
|
|
|
// These enum values must occur in this order; do not
|
|
|
|
|
// modify unless you know what you are doing! The forward
|
|
|
|
|
// and backward data tables are indexed by these enums.
|
|
|
|
|
kAccent_diacritic = 0,
|
|
|
|
|
kBaseForm = 1,
|
|
|
|
|
kBaseCR = 2,
|
|
|
|
|
kBaseLF = 3,
|
|
|
|
|
kChoseong = 4, // Korean initial consonant
|
|
|
|
|
kJungseong = 5, // Korean vowel
|
|
|
|
|
kJongseong = 6, // Korean final consonant
|
|
|
|
|
kEOS = 7,
|
|
|
|
|
kCharacterCol_count = 8
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static Node kCharacterForwardData[];
|
|
|
|
|
static const int32_t kCharacterForwardData_length;
|
|
|
|
|
static WordBreakTable* kCharacterForward;
|
|
|
|
|
static Node kCharacterBackwardData[];
|
|
|
|
|
static const int32_t kCharacterBackwardData_length;
|
|
|
|
|
static WordBreakTable* kCharacterBackward;
|
|
|
|
|
static Type kCharacterRawMapping[];
|
|
|
|
|
static const int32_t kCharacterRawMapping_length;
|
|
|
|
|
static SpecialMapping kCharacterExceptionChar[];
|
|
|
|
|
static const int32_t kCharacterExceptionChar_length;
|
|
|
|
|
static const bool_t kCharacterExceptionFlags[];
|
|
|
|
|
static UnicodeClassMapping* kCharacterMap;
|
|
|
|
|
static Type kCharacterAsciiValues[];
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
// Word data
|
|
|
|
|
enum WordMapping
|
|
|
|
|
{
|
|
|
|
|
// These enum values must occur in this order; do not
|
|
|
|
|
// modify unless you know what you are doing! The forward
|
|
|
|
|
// and backward data tables are indexed by these enums.
|
|
|
|
|
kBreak = 0,
|
|
|
|
|
kLetter = 1,
|
|
|
|
|
kNumber = 2,
|
|
|
|
|
kMidLetter = 3,
|
|
|
|
|
kMidLetNum = 4,
|
|
|
|
|
kPreNum = 5,
|
|
|
|
|
kPostNum = 6,
|
|
|
|
|
kMidNum = 7,
|
|
|
|
|
kPreMidNum = 8,
|
|
|
|
|
kBlank = 9,
|
|
|
|
|
kLF = 10,
|
|
|
|
|
kKata = 11,
|
|
|
|
|
kHira = 12,
|
|
|
|
|
kKanji = 13,
|
|
|
|
|
kDiacrit = 14,
|
|
|
|
|
kCR = 15,
|
|
|
|
|
kNsm = 16,
|
|
|
|
|
kwEOS = 17,
|
|
|
|
|
kWordCol_count = 18
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static Node kWordForwardData[];
|
|
|
|
|
static const int32_t kWordForwardData_length;
|
|
|
|
|
static WordBreakTable* kWordForward;
|
|
|
|
|
static Node kWordBackwardData[];
|
|
|
|
|
static const int32_t kWordBackwardData_length;
|
|
|
|
|
static WordBreakTable* kWordBackward;
|
|
|
|
|
static Type kWordRawMapping[];
|
|
|
|
|
static const int32_t kWordRawMapping_length;
|
|
|
|
|
static SpecialMapping kWordExceptionChar[];
|
|
|
|
|
static const int32_t kWordExceptionChar_length;
|
|
|
|
|
static UnicodeClassMapping* kWordMap;
|
|
|
|
|
static Type kWordAsciiValues[];
|
|
|
|
|
static const bool_t kWordExceptionFlags[];
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
// Sentence data
|
|
|
|
|
enum SentenceMapping
|
|
|
|
|
{
|
|
|
|
|
// These enum values must occur in this order; do not
|
|
|
|
|
// modify unless you know what you are doing! The forward
|
|
|
|
|
// and backward data tables are indexed by these enums.
|
|
|
|
|
kOther = 0,
|
|
|
|
|
kSpace = 1,
|
|
|
|
|
kTerminator = 2,
|
|
|
|
|
kAmbiguousTerm = 3,
|
|
|
|
|
kOpenBracket = 4,
|
|
|
|
|
kCloseBracket = 5,
|
|
|
|
|
kCJK = 6,
|
|
|
|
|
kParagraphBreak = 7,
|
|
|
|
|
kLowerCase = 8,
|
|
|
|
|
kUpperCase = 9,
|
|
|
|
|
ksNumber = 10,
|
|
|
|
|
kQuote = 11,
|
|
|
|
|
//ksCR,
|
|
|
|
|
ksNsm = 12,
|
|
|
|
|
ksEOS = 13,
|
|
|
|
|
kSentenceCol_count = 14
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static Node kSentenceForwardData[];
|
|
|
|
|
static const int32_t kSentenceForwardData_length;
|
|
|
|
|
static WordBreakTable* kSentenceForward;
|
|
|
|
|
static Node kSentenceBackwardData[];
|
|
|
|
|
static const int32_t kSentenceBackwardData_length;
|
|
|
|
|
static WordBreakTable* kSentenceBackward;
|
|
|
|
|
static Type kSentenceRawMapping[];
|
|
|
|
|
static const int32_t kSentenceRawMapping_length;
|
|
|
|
|
static SpecialMapping kSentenceExceptionChar[];
|
|
|
|
|
static const int32_t kSentenceExceptionChar_length;
|
|
|
|
|
static UnicodeClassMapping* kSentenceMap;
|
|
|
|
|
static Type kSentenceAsciiValues[];
|
|
|
|
|
static const bool_t kSentenceExceptionFlags[];
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
// Line data
|
|
|
|
|
enum LineMapping
|
|
|
|
|
{
|
|
|
|
|
// These enum values must occur in this order; do not
|
|
|
|
|
// modify unless you know what you are doing! The forward
|
|
|
|
|
// and backward data tables are indexed by these enums.
|
|
|
|
|
kLineBreak,
|
|
|
|
|
//always breaks (must be present as first item)
|
|
|
|
|
kLineBlank,
|
|
|
|
|
//spaces, tabs, nulls.
|
|
|
|
|
kLineCR,
|
|
|
|
|
//carriage return
|
|
|
|
|
kLineNonBlank,
|
|
|
|
|
//everything not included elsewhere
|
|
|
|
|
kLineOp,
|
|
|
|
|
//hyphens....
|
|
|
|
|
kLineJwrd,
|
|
|
|
|
//hiragana, katakana, and kanji
|
|
|
|
|
kLinePreJwrd,
|
|
|
|
|
//characters that bind to the beginning of a Japanese word
|
|
|
|
|
kLinePostJwrd,
|
|
|
|
|
//characters that bind to the end of a Japanese word
|
|
|
|
|
kLineDigit,
|
|
|
|
|
//digits
|
|
|
|
|
kLineNumPunct,
|
|
|
|
|
//punctuation that can appear within a number
|
|
|
|
|
kLineCurrency,
|
|
|
|
|
//currency symbols that can precede a number
|
|
|
|
|
kLineNsm,
|
|
|
|
|
// non-spacing marks
|
|
|
|
|
kLineNbsp,
|
|
|
|
|
// non-breaking characters
|
|
|
|
|
kLineEOS,
|
|
|
|
|
kLineCol_count
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static Node kLineForwardData[];
|
|
|
|
|
static const int32_t kLineForwardData_length;
|
|
|
|
|
static WordBreakTable* kLineForward;
|
|
|
|
|
static Node kLineBackwardData[];
|
|
|
|
|
static const int32_t kLineBackwardData_length;
|
|
|
|
|
static WordBreakTable* kLineBackward;
|
|
|
|
|
static Type kLineRawMapping[];
|
|
|
|
|
static const int32_t kLineRawMapping_length;
|
|
|
|
|
static SpecialMapping kLineExceptionChar[];
|
|
|
|
|
static const int32_t kLineExceptionChar_length;
|
|
|
|
|
static const bool_t kLineExceptionFlags[];
|
|
|
|
|
static UnicodeClassMapping* kLineMap;
|
|
|
|
|
static Type kLineAsciiValues[];
|
|
|
|
|
|
|
|
|
|
protected:
|
|
|
|
|
/**
|
|
|
|
|
* Copy constructor and assignment operator provided to make
|
|
|
|
|
* compiler happy only. DO NOT CALL.
|
|
|
|
|
*/
|
|
|
|
|
TextBoundaryData(const TextBoundaryData&) {}
|
|
|
|
|
TextBoundaryData& operator=(const TextBoundaryData&) { return *this; }
|
|
|
|
|
TextBoundaryData() {} // Do not subclass
|
|
|
|
|
TextBoundaryData(const WordBreakTable* forward,
|
|
|
|
|
const WordBreakTable* backward,
|
|
|
|
|
const UnicodeClassMapping* map)
|
|
|
|
|
: fForward(forward), fBackward(backward), fMap(map) {}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
const WordBreakTable* fForward;
|
|
|
|
|
const WordBreakTable* fBackward;
|
|
|
|
|
const UnicodeClassMapping* fMap;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
inline const WordBreakTable* TextBoundaryData::forward() const
|
|
|
|
|
{
|
|
|
|
|
return fForward;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline const WordBreakTable* TextBoundaryData::backward() const
|
|
|
|
|
{
|
|
|
|
|
return fBackward;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline const UnicodeClassMapping* TextBoundaryData::map() const
|
|
|
|
|
{
|
|
|
|
|
return fMap;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// These used to be static consts in the class, but some compilers didn't like that.
|
|
|
|
|
#define kStop (0)
|
|
|
|
|
#define kSI (0x80)
|
|
|
|
|
#define kSI_Stop (kSI+kStop)
|
|
|
|
|
|
|
|
|
|
#define kSI_1 (kSI+1)
|
|
|
|
|
#define kSI_2 (kSI+2)
|
|
|
|
|
#define kSI_3 (kSI+3)
|
|
|
|
|
#define kSI_4 (kSI+4)
|
|
|
|
|
#define kSI_5 (kSI+5)
|
|
|
|
|
#define kSI_6 (kSI+6)
|
|
|
|
|
#define kSI_7 (kSI+7)
|
|
|
|
|
#define kSI_8 (kSI+8)
|
|
|
|
|
#define kSI_9 (kSI+9)
|
|
|
|
|
#define kSI_10 (kSI+10)
|
|
|
|
|
#define kSI_11 (kSI+11)
|
|
|
|
|
#define kSI_12 (kSI+12)
|
|
|
|
|
#define kSI_13 (kSI+13)
|
|
|
|
|
#define kSI_14 (kSI+14)
|
|
|
|
|
|
|
|
|
|
#endif // _TXTBDAT
|
|
|
|
|
//eof
|