/* * Copyright © {1997-1999}, International Business Machines Corporation and others. All Rights Reserved. ***************************************************************************************** * * File TXTBDAT.H * * Modification History: * * Date Name Description * 02/18/97 aliu Converted from OpenClass. * Made static data members const where appropriate. * 03/25/97 aliu Removed subclasses, and merged their static data into this * class. Instantiated four static instances for character, * word, sentence, and line. Made forward(), backward(), and * map() methods inline. * 04/15/97 aliu Worked around bug in AIX xlC compiler which occurs if static * arrays contain const elements. * 05/06/97 aliu Made kSI, kStop, and kSI_Stop into #defines to help out * non-compliant compilers. ***************************************************************************************** */ #ifndef TXTBDAT_H #define TXTBDAT_H #include "unicode/utypes.h" class WordBreakTable; class UnicodeClassMapping; class SpecialMapping; /** * This class wraps up the data tables needed for SimpleTextBoundary. * It is statically instantiated for each type of text boundary. This * class is not designed to be subclassed. */ class TextBoundaryData { public: ~TextBoundaryData() {} // Do not subclass // Fast inline accessors const WordBreakTable* forward(void) const; const WordBreakTable* backward(void) const; const UnicodeClassMapping* map(void) const; static const TextBoundaryData kCharacterBreakData; static const TextBoundaryData kWordBreakData; static const TextBoundaryData kLineBreakData; static const TextBoundaryData kSentenceBreakData; typedef uint8_t Node; typedef uint8_t Type; private: static const UChar ASCII_END_OF_TEXT; static const UChar ASCII_HORIZONTAL_TABULATION; static const UChar ASCII_LINEFEED; static const UChar ASCII_VERTICAL_TABULATION; static const UChar ASCII_FORM_FEED; static const UChar ASCII_CARRIAGE_RETURN; static const UChar ASCII_SPACE; static const UChar ASCII_EXCLAMATION_MARK; static const UChar ASCII_QUOTATION_MARK; static const UChar ASCII_NUMBER_SIGN; static const UChar ASCII_DOLLAR_SIGN; static const UChar ASCII_PERCENT; static const UChar ASCII_AMPERSAND; static const UChar ASCII_APOSTROPHE; static const UChar ASCII_COMMA; static const UChar ASCII_FULL_STOP; static const UChar ASCII_COLON; static const UChar ASCII_SEMICOLON; static const UChar ASCII_QUESTION_MARK; static const UChar ASCII_NONBREAKING_SPACE; static const UChar ASCII_CENT_SIGN; static const UChar ASCII_POUND_SIGN; static const UChar ASCII_YEN_SIGN; static const UChar LATIN1_SOFTHYPHEN; static const UChar LATIN1_DEGREE_SIGN; static const UChar ARABIC_PERCENT_SIGN; static const UChar ARABIC_DECIMAL_SEPARATOR; static const UChar HANGUL_CHOSEONG_LOW; static const UChar HANGUL_CHOSEONG_HIGH; static const UChar HANGUL_JUNGSEONG_LOW; static const UChar HANGUL_JUNGSEONG_HIGH; static const UChar HANGUL_JONGSEONG_LOW; static const UChar HANGUL_JONGSEONG_HIGH; static const UChar FIGURE_SPACE; static const UChar NONBREAKING_HYPHEN; static const UChar PUNCTUATION_HYPHENATION_POINT; static const UChar PUNCTUATION_LINE_SEPARATOR; static const UChar PUNCTUATION_PARAGRAPH_SEPARATOR; static const UChar PER_MILLE_SIGN; static const UChar PER_TEN_THOUSAND_SIGN; static const UChar PRIME; static const UChar DOUBLE_PRIME; static const UChar TRIPLE_PRIME; static const UChar DEGREE_CELSIUS; static const UChar DEGREE_FAHRENHEIT; static const UChar PUNCTUATION_IDEOGRAPHIC_COMMA; static const UChar PUNCTUATION_IDEOGRAPHIC_FULL_STOP; static const UChar IDEOGRAPHIC_ITERATION_MARK; static const UChar HIRAGANA_LETTER_SMALL_A; static const UChar HIRAGANA_LETTER_A; static const UChar HIRAGANA_LETTER_SMALL_I; static const UChar HIRAGANA_LETTER_I; static const UChar HIRAGANA_LETTER_SMALL_U; static const UChar HIRAGANA_LETTER_U; static const UChar HIRAGANA_LETTER_SMALL_E; static const UChar HIRAGANA_LETTER_E; static const UChar HIRAGANA_LETTER_SMALL_O; static const UChar HIRAGANA_LETTER_O; static const UChar HIRAGANA_LETTER_DI; static const UChar HIRAGANA_LETTER_SMALL_TU; static const UChar HIRAGANA_LETTER_TU; static const UChar HIRAGANA_LETTER_MO; static const UChar HIRAGANA_LETTER_SMALL_YA; static const UChar HIRAGANA_LETTER_YA; static const UChar HIRAGANA_LETTER_SMALL_YU; static const UChar HIRAGANA_LETTER_YU; static const UChar HIRAGANA_LETTER_SMALL_YO; static const UChar HIRAGANA_LETTER_YO; static const UChar HIRAGANA_LETTER_RO; static const UChar HIRAGANA_LETTER_SMALL_WA; static const UChar HIRAGANA_LETTER_WA; static const UChar HIRAGANA_LETTER_VU; static const UChar COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK; static const UChar HIRAGANA_SEMIVOICED_SOUND_MARK; static const UChar HIRAGANA_ITERATION_MARK; static const UChar HIRAGANA_VOICED_ITERATION_MARK; static const UChar KATAKANA_LETTER_SMALL_A; static const UChar KATAKANA_LETTER_A; static const UChar KATAKANA_LETTER_SMALL_I; static const UChar KATAKANA_LETTER_I; static const UChar KATAKANA_LETTER_SMALL_U; static const UChar KATAKANA_LETTER_U; static const UChar KATAKANA_LETTER_SMALL_E; static const UChar KATAKANA_LETTER_E; static const UChar KATAKANA_LETTER_SMALL_O; static const UChar KATAKANA_LETTER_O; static const UChar KATAKANA_LETTER_DI; static const UChar KATAKANA_LETTER_SMALL_TU; static const UChar KATAKANA_LETTER_TU; static const UChar KATAKANA_LETTER_MO; static const UChar KATAKANA_LETTER_SMALL_YA; static const UChar KATAKANA_LETTER_YA; static const UChar KATAKANA_LETTER_SMALL_YU; static const UChar KATAKANA_LETTER_YU; static const UChar KATAKANA_LETTER_SMALL_YO; static const UChar KATAKANA_LETTER_YO; static const UChar KATAKANA_LETTER_RO; static const UChar KATAKANA_LETTER_SMALL_WA; static const UChar KATAKANA_LETTER_WA; static const UChar KATAKANA_LETTER_VU; static const UChar KATAKANA_LETTER_SMALL_KA; static const UChar KATAKANA_LETTER_SMALL_KE; static const UChar KATAKANA_LETTER_VA; static const UChar KATAKANA_LETTER_VO; static const UChar KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK; static const UChar KATAKANA_ITERATION_MARK; static const UChar KATAKANA_VOICED_ITERATION_MARK; static const UChar UNICODE_LOW_BOUND_HAN; static const UChar UNICODE_HIGH_BOUND_HAN; static const UChar HANGUL_SYL_LOW; static const UChar HANGUL_SYL_HIGH; static const UChar CJK_COMPATIBILITY_F900; static const UChar CJK_COMPATIBILITY_FA2D; static const UChar UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE; static const UChar FULLWIDTH_EXCLAMATION_MARK; static const UChar FULLWIDTH_FULL_STOP; static const UChar FULLWIDTH_QUESTION_MARK; static const UChar END_OF_STRING; private: // Character data enum CharacterMapping { // These enum values must occur in this order; do not // modify unless you know what you are doing! The forward // and backward data tables are indexed by these enums. kAccent_diacritic = 0, kBaseForm = 1, kBaseCR = 2, kBaseLF = 3, kChoseong = 4, // Korean initial consonant kJungseong = 5, // Korean vowel kJongseong = 6, // Korean final consonant kEOS = 7, kCharacterCol_count = 8 }; static Node kCharacterForwardData[]; static const int32_t kCharacterForwardData_length; static WordBreakTable* kCharacterForward; static Node kCharacterBackwardData[]; static const int32_t kCharacterBackwardData_length; static WordBreakTable* kCharacterBackward; static Type kCharacterRawMapping[]; static const int32_t kCharacterRawMapping_length; static SpecialMapping kCharacterExceptionChar[]; static const int32_t kCharacterExceptionChar_length; static const UBool kCharacterExceptionFlags[]; static UnicodeClassMapping* kCharacterMap; static Type kCharacterAsciiValues[]; private: // Word data enum WordMapping { // These enum values must occur in this order; do not // modify unless you know what you are doing! The forward // and backward data tables are indexed by these enums. kBreak = 0, kLetter = 1, kNumber = 2, kMidLetter = 3, kMidLetNum = 4, kPreNum = 5, kPostNum = 6, kMidNum = 7, kPreMidNum = 8, kBlank = 9, kLF = 10, kKata = 11, kHira = 12, kKanji = 13, kDiacrit = 14, kCR = 15, kNsm = 16, kwEOS = 17, kWordCol_count = 18 }; static Node kWordForwardData[]; static const int32_t kWordForwardData_length; static WordBreakTable* kWordForward; static Node kWordBackwardData[]; static const int32_t kWordBackwardData_length; static WordBreakTable* kWordBackward; static Type kWordRawMapping[]; static const int32_t kWordRawMapping_length; static SpecialMapping kWordExceptionChar[]; static const int32_t kWordExceptionChar_length; static UnicodeClassMapping* kWordMap; static Type kWordAsciiValues[]; static const UBool kWordExceptionFlags[]; private: // Sentence data enum SentenceMapping { // These enum values must occur in this order; do not // modify unless you know what you are doing! The forward // and backward data tables are indexed by these enums. kOther = 0, kSpace = 1, kTerminator = 2, kAmbiguousTerm = 3, kOpenBracket = 4, kCloseBracket = 5, kCJK = 6, kParagraphBreak = 7, kLowerCase = 8, kUpperCase = 9, ksNumber = 10, kQuote = 11, //ksCR, ksNsm = 12, ksEOS = 13, kSentenceCol_count = 14 }; static Node kSentenceForwardData[]; static const int32_t kSentenceForwardData_length; static WordBreakTable* kSentenceForward; static Node kSentenceBackwardData[]; static const int32_t kSentenceBackwardData_length; static WordBreakTable* kSentenceBackward; static Type kSentenceRawMapping[]; static const int32_t kSentenceRawMapping_length; static SpecialMapping kSentenceExceptionChar[]; static const int32_t kSentenceExceptionChar_length; static UnicodeClassMapping* kSentenceMap; static Type kSentenceAsciiValues[]; static const UBool kSentenceExceptionFlags[]; private: // Line data enum LineMapping { // These enum values must occur in this order; do not // modify unless you know what you are doing! The forward // and backward data tables are indexed by these enums. kLineBreak, //always breaks (must be present as first item) kLineBlank, //spaces, tabs, nulls. kLineCR, //carriage return kLineNonBlank, //everything not included elsewhere kLineOp, //hyphens.... kLineJwrd, //hiragana, katakana, and kanji kLinePreJwrd, //characters that bind to the beginning of a Japanese word kLinePostJwrd, //characters that bind to the end of a Japanese word kLineDigit, //digits kLineNumPunct, //punctuation that can appear within a number kLineCurrency, //currency symbols that can precede a number kLineNsm, // non-spacing marks kLineNbsp, // non-breaking characters kLineEOS, kLineCol_count }; static Node kLineForwardData[]; static const int32_t kLineForwardData_length; static WordBreakTable* kLineForward; static Node kLineBackwardData[]; static const int32_t kLineBackwardData_length; static WordBreakTable* kLineBackward; static Type kLineRawMapping[]; static const int32_t kLineRawMapping_length; static SpecialMapping kLineExceptionChar[]; static const int32_t kLineExceptionChar_length; static const UBool kLineExceptionFlags[]; static UnicodeClassMapping* kLineMap; static Type kLineAsciiValues[]; protected: /** * Copy constructor and assignment operator provided to make * compiler happy only. DO NOT CALL. */ TextBoundaryData(const TextBoundaryData&) {} TextBoundaryData& operator=(const TextBoundaryData&) { return *this; } TextBoundaryData() {} // Do not subclass TextBoundaryData(const WordBreakTable* forward, const WordBreakTable* backward, const UnicodeClassMapping* map) : fForward(forward), fBackward(backward), fMap(map) {} private: const WordBreakTable* fForward; const WordBreakTable* fBackward; const UnicodeClassMapping* fMap; }; inline const WordBreakTable* TextBoundaryData::forward() const { return fForward; } inline const WordBreakTable* TextBoundaryData::backward() const { return fBackward; } inline const UnicodeClassMapping* TextBoundaryData::map() const { return fMap; } // These used to be static consts in the class, but some compilers didn't like that. #define kStop (0) #define kSI (0x80) #define kSI_Stop (kSI+kStop) #define kSI_1 (kSI+1) #define kSI_2 (kSI+2) #define kSI_3 (kSI+3) #define kSI_4 (kSI+4) #define kSI_5 (kSI+5) #define kSI_6 (kSI+6) #define kSI_7 (kSI+7) #define kSI_8 (kSI+8) #define kSI_9 (kSI+9) #define kSI_10 (kSI+10) #define kSI_11 (kSI+11) #define kSI_12 (kSI+12) #define kSI_13 (kSI+13) #define kSI_14 (kSI+14) #endif // _TXTBDAT //eof