/* ******************************************************************** * COPYRIGHT: * Copyright (c) 1996-1999, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** */ #ifndef NORMLZR_H #define NORMLZR_H #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/chariter.h" /* forward declaration */ class ComposedCharIter; /** * Normalizer transforms Unicode text into an equivalent composed or * decomposed form, allowing for easier sorting and searching of text. * Normalizer supports the standard normalization forms described in * * Unicode Technical Report #15. *

* Characters with accents or other adornments can be encoded in * several different ways in Unicode. For example, take the character "Á" * (A-acute). In Unicode, this can be encoded as a single character (the * "composed" form): *

 *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
* or as two separate characters (the "decomposed" form): *
 *      0041    LATIN CAPITAL LETTER A
 *      0301    COMBINING ACUTE ACCENT
*

* To a user of your program, however, both of these sequences should be * treated as the same "user-level" character "Á". When you are searching or * comparing text, you must ensure that these two sequences are treated * equivalently. In addition, you must handle characters with more than one * accent. Sometimes the order of a character's combining accents is * significant, while in other cases accent sequences in different orders are * really equivalent. *

* Similarly, the string "ffi" can be encoded as three separate letters: *

 *      0066    LATIN SMALL LETTER F
 *      0066    LATIN SMALL LETTER F
 *      0069    LATIN SMALL LETTER I
* or as the single character *
 *      FB03    LATIN SMALL LIGATURE FFI
*

* The ffi ligature is not a distinct semantic character, and strictly speaking * it shouldn't be in Unicode at all, but it was included for compatibility * with existing character sets that already provided it. The Unicode standard * identifies such characters by giving them "compatibility" decompositions * into the corresponding semantic characters. When sorting and searching, you * will often want to use these mappings. *

* Normalizer helps solve these problems by transforming text into the * canonical composed and decomposed forms as shown in the first example above. * In addition, you can have it perform compatibility decompositions so that * you can treat compatibility characters the same as their equivalents. * Finally, Normalizer rearranges accents into the proper canonical * order, so that you do not have to worry about accent rearrangement on your * own. *

* Normalizer adds one optional behavior, {@link #IGNORE_HANGUL}, * that differs from * the standard Unicode Normalization Forms. This option can be passed * to the {@link #Normalizer constructors} and to the static * {@link #compose compose} and {@link #decompose decompose} methods. This * option, and any that are added in the future, will be turned off by default. *

* There are three common usage models for Normalizer. In the first, * the static {@link #normalize normalize()} method is used to process an * entire input string at once. Second, you can create a Normalizer * object and use it to iterate through the normalized form of a string by * calling {@link #first} and {@link #next}. Finally, you can use the * {@link #setIndex setIndex()} and {@link #getIndex} methods to perform * random-access iteration, which is very useful for searching. *

* Note: Normalizer objects behave like iterators and have
* methods such as setIndex, next, previous, etc.
* You should note that while the setIndex and getIndex refer
* to indices in the underlying input text being processed, the
* next and previous methods iterate through characters
* in the normalized output. This means that there is not
* necessarily a one-to-one correspondence between characters returned
* by next and previous and the indices passed to and
* returned from setIndex and getIndex. It is for this
* reason that Normalizer does not implement the
* {@link CharacterIterator} interface.
*

* Note: Normalizer is currently based on version 2.1.8
* of the Unicode Standard.
* It will be updated as later versions of Unicode are released. If you are
* using this class on a JDK that supports an earlier version of Unicode, it
* is possible that Normalizer may generate composed or decomposed
* characters for which your JDK's {@link java.lang.Character} class does not
* have any data.
*

* @author Laura Werner, Mark Davis
*/
class U_COMMON_API Normalizer {
public:
    // This tells us what the bits in the "mode" mean.
    // The EMode values below are built by OR-ing these bits together.
    enum {
        COMPAT_BIT = 1,
        DECOMP_BIT = 2,
        COMPOSE_BIT = 4
    };

    /** If DONE is returned, then there are no more normalization results available. */
    enum {
        DONE=0xffff
    };

    /** The mode of a Normalizer object */
    enum EMode {
        /**
         * Null operation for use with the {@link #Normalizer constructors}
         * and the static {@link #normalize normalize} method. This value tells
         * the Normalizer to do nothing but return unprocessed characters
         * from the underlying String or CharacterIterator. If you have code which
         * requires raw text at some times and normalized text at others, you can
         * use NO_OP for the cases where you want raw text, rather
         * than having a separate code path that bypasses Normalizer
         * altogether.
         *
         * @see #setMode
         */
        NO_OP = 0,

        /**
         * Canonical decomposition followed by canonical composition. Used with
         * the {@link #Normalizer constructors} and the static
         * {@link #normalize normalize}
         * method to determine the operation to be performed.
         *
         * If all optional features (e.g. {@link #IGNORE_HANGUL}) are turned
         * off, this operation produces output that is in
         * Unicode Canonical Form C.
         *
         * @see #setMode
         */
        COMPOSE = COMPOSE_BIT,

        /**
         * Compatibility decomposition followed by canonical composition.
         * Used with the {@link #Normalizer constructors} and the static
         * {@link #normalize normalize} method to determine the operation to be
         * performed.
         *
         * If all optional features (e.g. {@link #IGNORE_HANGUL}) are turned
         * off, this operation produces output that is in
         * Unicode Canonical Form KC.
         *
         * @see #setMode
         */
        COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,

        /**
         * Canonical decomposition. This value is passed to the
         * {@link #Normalizer constructors} and the static
         * {@link #normalize normalize}
         * method to determine the operation to be performed.
         *
         * If all optional features (e.g. {@link #IGNORE_HANGUL}) are turned
         * off, this operation produces output that is in
         * Unicode Canonical Form D.
         *
         * @see #setMode
         */
        DECOMP = DECOMP_BIT,

        /**
         * Compatibility decomposition. This value is passed to the
         * {@link #Normalizer constructors} and the static
         * {@link #normalize normalize}
         * method to determine the operation to be performed.
         *
         * If all optional features (e.g. {@link #IGNORE_HANGUL}) are turned
         * off, this operation produces output that is in
         * Unicode Canonical Form KD.
         *
         * @see #setMode
         */
        DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT
    };

    /** The options for a Normalizer object */
    enum {
        /**
         * Option to disable Hangul/Jamo composition and decomposition.
         * This option applies to Korean text,
         * which can be represented either in the Jamo alphabet or in Hangul
         * characters, which are really just two or three Jamo combined
         * into one visual glyph. Since Jamo takes up more storage space than
         * Hangul, applications that process only Hangul text may wish to turn
         * this option on when decomposing text.
         *
         * The Unicode standard treats Hangul to Jamo conversion as a
         * canonical decomposition, so this option must be turned off if you
         * wish to transform strings into one of the standard
         * Unicode Normalization Forms.
         *
         * @see #setOption
         */
        IGNORE_HANGUL = 0x001
    };

    // Constructors

    /**
     * Creates a new Normalizer object for iterating over the
     * normalized form of a given string.
     *
     * @param str   The string to be normalized. The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     * @stable
     */
    Normalizer(const UnicodeString& str,
               EMode mode);

    /**
     * Creates a new Normalizer object for iterating over the
     * normalized form of a given string.
     *
     * The options parameter specifies which optional
     * Normalizer features are to be enabled for this object.
     *
     * @param str   The string to be normalized. The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     *
     * @param opt   Any optional features to be enabled.
     *              Currently the only available option is {@link #IGNORE_HANGUL}.
     *              If you want the default behavior corresponding to one of the
     *              standard Unicode Normalization Forms, use 0 for this argument.
     * @stable
     */
    Normalizer(const UnicodeString& str,
               EMode mode,
               int32_t opt);

    /**
     * Creates a new Normalizer object for iterating over the
     * normalized form of a given UChar string.
     *
     * @param str     The string to be normalized. The normalization
     *                will start at the beginning of the string.
     *
     * @param length  Length of the string.
     * @param mode    The normalization mode.
     * @stable
     */
    Normalizer(const UChar* str,
               int32_t length,
               EMode mode);

    /**
     * Creates a new Normalizer object for iterating over the
     * normalized form of a given UChar string.
     *
     * @param str     The string to be normalized. The normalization
     *                will start at the beginning of the string.
     *
     * @param length  Length of the string.
     * @param mode    The normalization mode.
     * @param option  Any optional features to be enabled.
     *                Currently the only available option is {@link #IGNORE_HANGUL}.
     *                If you want the default behavior corresponding to one of the
     *                standard Unicode Normalization Forms, use 0 for this argument.
     * @add
     */
    Normalizer(const UChar* str,
               int32_t length,
               EMode mode,
               int32_t option);

    /**
     * Creates a new Normalizer object for iterating over the
     * normalized form of the given text.
     *
     * @param iter  The input text to be normalized. The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     * @stable
     */
    Normalizer(const CharacterIterator& iter,
               EMode mode);

    /**
     * Creates a new Normalizer object for iterating over the
     * normalized form of the given text.
     *
     * @param iter  The input text to be normalized. The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     *
     * @param opt   Any optional features to be enabled.
     *              Currently the only available option is {@link #IGNORE_HANGUL}.
     *              If you want the default behavior corresponding to one of the
     *              standard Unicode Normalization Forms, use 0 for this argument.
     * @stable
     */
    Normalizer(const CharacterIterator& iter,
               EMode mode,
               int32_t opt);

    /**
     * Copy constructor.
     *
     * NOTE(review): a copy constructor and a destructor are declared, but no
     * copy-assignment operator is. The class holds a raw CharacterIterator*
     * (see "text" below), so confirm against the implementation whether
     * assigning one Normalizer to another is safe.
     * @stable
     */
    Normalizer(const Normalizer& copy);

    /**
     * Destructor
     * @stable
     */
    ~Normalizer();

    //-------------------------------------------------------------------------
    // Static utility methods
    //-------------------------------------------------------------------------

    /**
     * Normalizes a String using the given normalization operation.
     *
     * The options parameter specifies which optional
     * Normalizer features are to be enabled for this operation.
     * Currently the only available option is {@link #IGNORE_HANGUL}.
     * If you want the default behavior corresponding to one of the standard
     * Unicode Normalization Forms, use 0 for this argument.
     *
     * @param source   the input string to be normalized.
     *
     * @param mode     the normalization mode
     *
     * @param options  the optional features to be enabled.
     *
     * @param result   The normalized string (on output).
     *
     * @param status   The error code.
     * @stable
     */
    static void normalize(const UnicodeString& source,
                          EMode mode,
                          int32_t options,
                          UnicodeString& result,
                          UErrorCode &status);

    /**
     * Compose a String.
     *
     * The options parameter specifies which optional
     * Normalizer features are to be enabled for this operation.
     * Currently the only available option is {@link #IGNORE_HANGUL}.
     * If you want the default behavior corresponding
     * to Unicode Normalization Form C or KC,
     * use 0 for this argument.
     *
     * @param source   the string to be composed.
     *
     * @param compat   Perform compatibility decomposition before composition.
     *                 If this argument is false, only canonical
     *                 decomposition will be performed.
     *
     * @param options  the optional features to be enabled.
     *
     * @param result   The composed string (on output).
     *
     * @param status   The error code.
     * @stable
     */
    static void compose(const UnicodeString& source,
                        UBool compat,
                        int32_t options,
                        UnicodeString& result,
                        UErrorCode &status);

    /**
     * Static method to decompose a String.
     *
     * The options parameter specifies which optional
     * Normalizer features are to be enabled for this operation.
     * Currently the only available option is {@link #IGNORE_HANGUL}.
     * The desired options should be OR'ed together to determine the value
     * of this argument. If you want the default behavior corresponding
     * to Unicode Normalization Form D or KD,
     * use 0 for this argument.
     *
     * @param source   the string to be decomposed.
     *
     * @param compat   Perform compatibility decomposition.
     *                 If this argument is false, only canonical
     *                 decomposition will be performed.
     *
     * @param options  the optional features to be enabled.
     *
     * @param result   The decomposed string (on output). The method itself
     *                 returns void; this is the only way the result is delivered.
     *
     * @param status   The error code.
     * @stable
     */
    static void decompose(const UnicodeString& source,
                          UBool compat,
                          int32_t options,
                          UnicodeString& result,
                          UErrorCode &status);

    //-------------------------------------------------------------------------
    // CharacterIterator overrides
    // (CharacterIterator-style methods; this class declares no base class, so
    // nothing here actually overrides a virtual function.)
    //-------------------------------------------------------------------------

    /**
     * Return the current character in the normalized text.
     * @draft
     */
    UChar32 current(void) const;

    /**
     * Return the first character in the normalized text. This resets
     * the Normalizer's position to the beginning of the text.
     * @draft
     */
    UChar32 first(void);

    /**
     * Return the last character in the normalized text. This resets
     * the Normalizer's position to be just before the
     * input text corresponding to that normalized character.
     * @draft
     */
    UChar32 last(void);

    /**
     * Return the next character in the normalized text and advance
     * the iteration position by one. If the end
     * of the text has already been reached, {@link #DONE} is returned.
     * @draft
     */
    UChar32 next(void);

    /**
     * Return the previous character in the normalized text and decrement
     * the iteration position by one. If the beginning
     * of the text has already been reached, {@link #DONE} is returned.
     * @draft
     */
    UChar32 previous(void);

    /**
     * Set the iteration position in the input text that is being normalized
     * and return the first normalized character at that position.
     *
     * Note: This method sets the position in the input text,
     * while {@link #next} and {@link #previous} iterate through characters
     * in the normalized output. This means that there is not
     * necessarily a one-to-one correspondence between characters returned
     * by next and previous and the indices passed to and
     * returned from setIndex and {@link #getIndex}.
     *
     * @param index  the desired index in the input text.
     *
     * @return the first normalized character that is the result of iterating
     *         forward starting at the given index.
     * @draft
     */
    UChar32 setIndex(UTextOffset index);

    /**
     * Reset the iterator so that it is in the same state that it was just after
     * it was constructed. A subsequent call to next will return the first
     * character in the normalized text. In contrast, calling setIndex(0) followed
     * by next will return the second character in the normalized text,
     * because setIndex itself returns the first character.
     * @stable
     */
    void reset(void);

    /**
     * Retrieve the current iteration position in the input text that is
     * being normalized. This method is useful in applications such as
     * searching, where you need to be able to determine the position in
     * the input text that corresponds to a given normalized output character.
     *
     * Note: This method returns the position in the input, while
     * {@link #next} and {@link #previous} iterate through characters in the
     * output. This means that there is not necessarily a one-to-one
     * correspondence between characters returned by next and
     * previous and the indices passed to and returned from
     * setIndex and {@link #getIndex}.
     * @stable
     */
    UTextOffset getIndex(void) const;

    /**
     * Retrieve the index of the start of the input text. This is the begin index
     * of the CharacterIterator or the start (i.e. 0) of the String
     * over which this Normalizer is iterating.
     * @stable
     */
    UTextOffset startIndex(void) const;

    /**
     * Retrieve the index of the end of the input text. This is the end index
     * of the CharacterIterator or the length of the String
     * over which this Normalizer is iterating.
     * @stable
     */
    UTextOffset endIndex(void) const;

    /**
     * Returns true when both iterators refer to the same character in the same
     * character-storage object.
     * @stable
     */
    //  virtual UBool operator==(const CharacterIterator& that) const;
    UBool operator==(const Normalizer& that) const;

    /**
     * Inequality test; defined inline after the class as the negation of
     * operator==.
     */
    inline UBool operator!=(const Normalizer& that) const;

    /**
     * Returns a pointer to a new Normalizer that is a clone of this one.
     * The caller is responsible for deleting the new clone.
     * @stable
     */
    Normalizer* clone(void) const;

    /**
     * Generates a hash code for this iterator.
     * @stable
     */
    int32_t hashCode(void) const;

    //-------------------------------------------------------------------------
    // Property access methods
    //-------------------------------------------------------------------------

    /**
     * Set the normalization mode for this object.
     *
     * Note: If the normalization mode is changed while iterating
     * over a string, calls to {@link #next} and {@link #previous} may
     * return previously buffered characters in the old normalization mode
     * until the iteration is able to re-sync at the next base character.
     * It is safest to call {@link #setText setText()}, {@link #first},
     * {@link #last}, etc. after calling setMode.
     *
     * @param newMode  the new mode for this Normalizer.
     *                 The supported modes are the EMode values:
     *                   - {@link #NO_OP}
     *                   - {@link #COMPOSE}
     *                   - {@link #COMPOSE_COMPAT}
     *                   - {@link #DECOMP}
     *                   - {@link #DECOMP_COMPAT}
     *
     * @see #getMode
     * @stable
     */
    void setMode(EMode newMode);

    /**
     * Return the basic operation performed by this Normalizer
     *
     * @see #setMode
     * @stable
     */
    EMode getMode(void) const;

    /**
     * Set options that affect this Normalizer's operation.
     * Options do not change the basic composition or decomposition operation
     * that is being performed, but they control whether
     * certain optional portions of the operation are done.
     * Currently the only available option is:
     *   - {@link #IGNORE_HANGUL}
     *
     * @param option  the option whose value is to be set.
     * @param value   the new setting for the option. Use true to
     *                turn the option on and false to turn it off.
     *
     * @see #getOption
     * @stable
     */
    void setOption(int32_t option,
                   UBool value);

    /**
     * Determine whether an option is turned on or off.
     *
     * @see #setOption
     * @stable
     */
    UBool getOption(int32_t option) const;

    /**
     * Set the input text over which this Normalizer will iterate.
     * The iteration position is set to the beginning.
     * @stable
     */
    void setText(const UnicodeString& newText,
                 UErrorCode &status);

    /**
     * Set the input text over which this Normalizer will iterate.
     * The iteration position is set to the beginning.
     * @stable
     */
    void setText(const CharacterIterator& newText,
                 UErrorCode &status);

    /**
     * Set the input text over which this Normalizer will iterate.
     * The iteration position is set to the beginning.
     * @stable
     */
    void setText(const UChar* newText,
                 int32_t length,
                 UErrorCode &status);

    /**
     * Copies the text under iteration into the UnicodeString referred to by
     * "result".
     * @param result  Receives a copy of the text under iteration.
     * @draft should also return the result UnicodeString &
     */
    void getText(UnicodeString& result);

    /**
     * Returns the text under iteration as a UChar* buffer pointer.
     * @param count  NOTE(review): presumably receives the number of UChars in
     *               the returned buffer -- confirm against the implementation.
     * @return pointer to the text. NOTE(review): earlier documentation called
     *         this a copy of the text under iteration; ownership and lifetime
     *         of the buffer are not visible from this header -- confirm.
     * @add
     */
    const UChar* getText(int32_t& count);

private:
    // Private utility methods for iteration
    // For documentation, see the source code
    UChar nextCompose(void);
    UChar prevCompose(void);
    UChar nextDecomp(void);
    UChar prevDecomp(void);

    UChar curForward(void);
    UChar curBackward(void);

    void init(CharacterIterator* iter,
              EMode mode,
              int32_t option);
    void initBuffer(void);
    void clearBuffer(void);

    // Utilities used by Compose
    static void bubbleAppend(UnicodeString& target,
                             UChar ch,
                             uint32_t cclass);
    static uint32_t getComposeClass(UChar ch);
    static uint16_t composeLookup(UChar ch);
    static uint16_t composeAction(uint16_t baseIndex,
                                  uint16_t comIndex);
    static void explode(UnicodeString& target,
                        uint16_t index);
    static UChar pairExplode(UnicodeString& target,
                             uint16_t action);

    // Utilities used by Decompose
    static void fixCanonical(UnicodeString& result);  // Reorders combining marks
    static uint8_t getClass(UChar ch);                // Gets char's combining class

    // Other static utility methods
    static void doAppend(const UChar source[],
                         uint16_t offset,
                         UnicodeString& dest);
    static void doInsert(const UChar source[],
                         uint16_t offset,
                         UnicodeString& dest,
                         UTextOffset pos);

    static void hangulToJamo(UChar ch,
                             UnicodeString& result,
                             uint16_t decompLimit);
    static void jamoAppend(UChar ch,
                           uint16_t decompLimit,
                           UnicodeString& dest);
    static void jamoToHangul(UnicodeString& buffer,
                             UTextOffset start);

    //-------------------------------------------------------------------------
    // Private data
    //-------------------------------------------------------------------------

    EMode fMode;        // presumably the current mode (see setMode/getMode) -- confirm
    int32_t fOptions;   // presumably the current option bits (see setOption/getOption) -- confirm
    int16_t minDecomp;

    // The input text and our position in it
    CharacterIterator* text;

    // A buffer for holding intermediate results
    UnicodeString buffer;
    UTextOffset bufferPos;
    UTextOffset bufferLimit;
    UChar currentChar;

    // Another buffer for use during iterative composition
    UnicodeString explodeBuf;

    enum {
        EMPTY = -1,
        STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder
        STR_LENGTH_MASK = 0x0003
    };

    // Code point bases, limit and counts for the Hangul/Jamo helpers above.
    // NOTE(review): the names suggest the standard Unicode Hangul syllable
    // arithmetic -- confirm against the implementation.
    enum {
        HANGUL_BASE = 0xac00,
        HANGUL_LIMIT = 0xd7a4,
        JAMO_LBASE = 0x1100,
        JAMO_VBASE = 0x1161,
        JAMO_TBASE = 0x11a7,
        JAMO_LCOUNT = 19,
        JAMO_VCOUNT = 21,
        JAMO_TCOUNT = 28,
        JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
    };

    // ComposedCharIter is granted access to the private members of this class.
    friend class ComposedCharIter;
};

/**
 * Inequality test: returns the logical negation of operator==.
 */
inline UBool
Normalizer::operator!= (const Normalizer& other) const
{ return ! operator==(other); }

#endif // NORMLZR_H