/* ******************************************************************************* * * Copyright (C) 2009-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: normalizer2impl.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2009nov22 * created by: Markus W. Scherer */ #ifndef __NORMALIZER2IMPL_H__ #define __NORMALIZER2IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" #include "unicode/udata.h" #include "unicode/unistr.h" #include "unicode/unorm.h" #include "unicode/utf16.h" #include "mutex.h" #include "uset_imp.h" #include "utrie2.h" U_NAMESPACE_BEGIN struct CanonIterData; class Hangul { public: /* Korean Hangul and Jamo constants */ enum { JAMO_L_BASE=0x1100, /* "lead" jamo */ JAMO_V_BASE=0x1161, /* "vowel" jamo */ JAMO_T_BASE=0x11a7, /* "trail" jamo */ HANGUL_BASE=0xac00, JAMO_L_COUNT=19, JAMO_V_COUNT=21, JAMO_T_COUNT=28, JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT }; static inline UBool isHangul(UChar32 c) { return HANGUL_BASE<=c && c=MIN_NORMAL_MAYBE_YES) { return (uint8_t)norm16; } if(norm16=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ uint16_t getFCD16(UChar32 c) const { if(c<0) { return 0; } else if(c<0x180) { return tccc180[c]; } else if(c<=0xffff) { if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } } return getFCD16FromNormData(c); } /** * Returns the FCD data for the next code point (post-increment). * Might skip only a lead surrogate rather than the whole surrogate pair if none of * the supplementary code points associated with the lead surrogate have non-zero FCD data. * @param s A valid pointer into a string. Requires s!=limit. * @param limit The end of the string, or NULL. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { UChar32 c=*s++; if(c<0x180) { return tccc180[c]; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } UChar c2; if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { c=U16_GET_SUPPLEMENTARY(c, c2); ++s; } return getFCD16FromNormData(c); } /** * Returns the FCD data for the previous code point (pre-decrement). * @param start The start of the string. * @param s A valid pointer into a string. Requires start>8]; if(bits==0) { return false; } return (UBool)((bits>>((lead>>5)&7))&1); } /** Returns the FCD value from the regular normalization data. */ uint16_t getFCD16FromNormData(UChar32 c) const; void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const; /** * Gets the decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; /** * Gets the raw decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; UChar32 composePair(UChar32 a, UChar32 b) const; UBool isCanonSegmentStarter(UChar32 c) const; UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; enum { MIN_CCC_LCCC_CP=0x300 }; enum { MIN_YES_YES_WITH_CC=0xff01, JAMO_VT=0xff00, MIN_NORMAL_MAYBE_YES=0xfe00, JAMO_L=1, MAX_DELTA=0x40 }; enum { // Byte offsets from the start of the data, after the generic header. IX_NORM_TRIE_OFFSET, IX_EXTRA_DATA_OFFSET, IX_SMALL_FCD_OFFSET, IX_RESERVED3_OFFSET, IX_RESERVED4_OFFSET, IX_RESERVED5_OFFSET, IX_RESERVED6_OFFSET, IX_TOTAL_SIZE, // Code point thresholds for quick check codes. IX_MIN_DECOMP_NO_CP, IX_MIN_COMP_NO_MAYBE_CP, // Norm16 value thresholds for quick check combinations and types of extra data. IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. IX_MIN_NO_NO, IX_LIMIT_NO_NO, IX_MIN_MAYBE_YES, IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. IX_RESERVED15, IX_COUNT }; enum { MAPPING_HAS_CCC_LCCC_WORD=0x80, MAPPING_HAS_RAW_MAPPING=0x40, MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, MAPPING_LENGTH_MASK=0x1f }; enum { COMP_1_LAST_TUPLE=0x8000, COMP_1_TRIPLE=1, COMP_1_TRAIL_LIMIT=0x3400, COMP_1_TRAIL_MASK=0x7ffe, COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit COMP_2_TRAIL_SHIFT=6, COMP_2_TRAIL_MASK=0xffc0 }; // higher-level functionality ------------------------------------------ *** const UChar *decompose(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void decomposeAndAppend(const UChar *src, const UChar *limit, UBool doDecompose, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool compose(const UChar *src, const UChar *limit, UBool onlyContiguous, UBool doCompose, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const UChar *composeQuickCheck(const UChar *src, const UChar *limit, UBool onlyContiguous, UNormalizationCheckResult *pQCResult) const; void composeAndAppend(const UChar *src, const UChar *limit, UBool doCompose, UBool onlyContiguous, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const UChar *makeFCD(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void makeFCDAndAppend(const UChar *src, const UChar *limit, UBool doMakeFCD, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool hasDecompBoundary(UChar32 c, UBool before) const; UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } UBool hasCompBoundaryBefore(UChar32 c) const { return c=minMaybeYes; } static UBool isInert(uint16_t norm16) { return norm16==0; } static UBool isJamoL(uint16_t norm16) { return norm16==1; } static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; // } uint8_t getCCFromNoNo(uint16_t norm16) const { const uint16_t *mapping=getMapping(norm16); if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { return (uint8_t)*(mapping-1); } else { return 0; } } // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; // Requires algorithmic-NoNo. UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { return c+norm16-(minMaybeYes-MAX_DELTA-1); } // Requires minYesNo