e5970bccc3
X-SVN-Rev: 973
777 lines
27 KiB
C++
777 lines
27 KiB
C++
/*
|
|
********************************************************************
|
|
* COPYRIGHT:
|
|
* Copyright (c) 1996-1999, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
********************************************************************
|
|
*/
|
|
|
|
#ifndef NORMLZR_H
|
|
#define NORMLZR_H
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/chariter.h"
|
|
|
|
/* forward declaration */
|
|
class ComposedCharIter;
|
|
|
|
/**
|
|
* <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or
|
|
* decomposed form, allowing for easier sorting and searching of text.
|
|
* <tt>Normalizer</tt> supports the standard normalization forms described in
|
|
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
|
|
* Unicode Technical Report #15</a>.
|
|
* <p>
|
|
* Characters with accents or other adornments can be encoded in
|
|
* several different ways in Unicode. For example, take the character "Á"
|
|
* (A-acute). In Unicode, this can be encoded as a single character (the
|
|
* "composed" form):
|
|
* <pre>
|
|
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre>
|
|
* or as two separate characters (the "decomposed" form):
|
|
* <pre>
|
|
* 0041 LATIN CAPITAL LETTER A
|
|
* 0301 COMBINING ACUTE ACCENT</pre>
|
|
* <p>
|
|
* To a user of your program, however, both of these sequences should be
|
|
* treated as the same "user-level" character "Á". When you are searching or
|
|
* comparing text, you must ensure that these two sequences are treated
|
|
* equivalently. In addition, you must handle characters with more than one
|
|
* accent. Sometimes the order of a character's combining accents is
|
|
* significant, while in other cases accent sequences in different orders are
|
|
* really equivalent.
|
|
* <p>
|
|
* Similarly, the string "ffi" can be encoded as three separate letters:
|
|
* <pre>
|
|
* 0066 LATIN SMALL LETTER F
|
|
* 0066 LATIN SMALL LETTER F
|
|
* 0069 LATIN SMALL LETTER I</pre>
|
|
* or as the single character
|
|
* <pre>
|
|
* FB03 LATIN SMALL LIGATURE FFI</pre>
|
|
* <p>
|
|
* The ffi ligature is not a distinct semantic character, and strictly speaking
|
|
* it shouldn't be in Unicode at all, but it was included for compatibility
|
|
* with existing character sets that already provided it. The Unicode standard
|
|
* identifies such characters by giving them "compatibility" decompositions
|
|
* into the corresponding semantic characters. When sorting and searching, you
|
|
* will often want to use these mappings.
|
|
* <p>
|
|
* <tt>Normalizer</tt> helps solve these problems by transforming text into the
|
|
* canonical composed and decomposed forms as shown in the first example above.
|
|
* In addition, you can have it perform compatibility decompositions so that
|
|
* you can treat compatibility characters the same as their equivalents.
|
|
* Finally, <tt>Normalizer</tt> rearranges accents into the proper canonical
|
|
* order, so that you do not have to worry about accent rearrangement on your
|
|
* own.
|
|
* <p>
|
|
* <tt>Normalizer</tt> adds one optional behavior, {@link #IGNORE_HANGUL},
|
|
* that differs from
|
|
* the standard Unicode Normalization Forms. This option can be passed
|
|
* to the {@link #Normalizer constructors} and to the static
|
|
* {@link #compose compose} and {@link #decompose decompose} methods. This
|
|
* option, and any that are added in the future, will be turned off by default.
|
|
* <p>
|
|
* There are three common usage models for <tt>Normalizer</tt>. In the first,
|
|
* the static {@link #normalize normalize()} method is used to process an
|
|
* entire input string at once. Second, you can create a <tt>Normalizer</tt>
|
|
* object and use it to iterate through the normalized form of a string by
|
|
* calling {@link #first} and {@link #next}. Finally, you can use the
|
|
* {@link #setIndex setIndex()} and {@link #getIndex} methods to perform
|
|
* random-access iteration, which is very useful for searching.
|
|
* <p>
|
|
* <b>Note:</b> <tt>Normalizer</tt> objects behave like iterators and have
|
|
* methods such as <tt>setIndex</tt>, <tt>next</tt>, <tt>previous</tt>, etc.
|
|
* You should note that while the <tt>setIndex</tt> and <tt>getIndex</tt> refer
|
|
* to indices in the underlying <em>input</em> text being processed, the
|
|
* <tt>next</tt> and <tt>previous</tt> methods it iterate through characters
|
|
* in the normalized <em>output</em>. This means that there is not
|
|
* necessarily a one-to-one correspondence between characters returned
|
|
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
|
|
* returned from <tt>setIndex</tt> and <tt>getIndex</tt>. It is for this
|
|
* reason that <tt>Normalizer</tt> does not implement the
|
|
* {@link CharacterIterator} interface.
|
|
* <p>
|
|
* <b>Note:</b> <tt>Normalizer</tt> is currently based on version 2.1.8
|
|
* of the <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
|
|
* It will be updated as later versions of Unicode are released. If you are
|
|
* using this class on a JDK that supports an earlier version of Unicode, it
|
|
* is possible that <tt>Normalizer</tt> may generate composed or dedecomposed
|
|
* characters for which your JDK's {@link java.lang.Character} class does not
|
|
* have any data.
|
|
* <p>
|
|
* @author Laura Werner, Mark Davis
|
|
*/
|
|
class U_COMMON_API Normalizer
|
|
{
|
|
|
|
public:
|
|
// This tells us what the bits in the "mode" mean.
|
|
enum {
|
|
COMPAT_BIT = 1,
|
|
DECOMP_BIT = 2,
|
|
COMPOSE_BIT = 4
|
|
};
|
|
|
|
|
|
|
|
/** */
|
|
static const UChar DONE;
|
|
|
|
/** The mode of a Normalizer object */
|
|
enum EMode {
|
|
|
|
/**
|
|
* Null operation for use with the {@link #Normalizer constructors}
|
|
* and the static {@link #normalize normalize} method. This value tells
|
|
* the <tt>Normalizer</tt> to do nothing but return unprocessed characters
|
|
* from the underlying String or CharacterIterator. If you have code which
|
|
* requires raw text at some times and normalized text at others, you can
|
|
* use <tt>NO_OP</tt> for the cases where you want raw text, rather
|
|
* than having a separate code path that bypasses <tt>Normalizer</tt>
|
|
* altogether.
|
|
* <p>
|
|
* @see #setMode
|
|
*/
|
|
NO_OP = 0,
|
|
|
|
/**
|
|
* Canonical decomposition followed by canonical composition. Used with
|
|
* the {@link #Normalizer constructors} and the static
|
|
* {@link #normalize normalize}
|
|
* method to determine the operation to be performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
|
|
* Form</a>
|
|
* <b>C</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
*/
|
|
COMPOSE = COMPOSE_BIT,
|
|
|
|
/**
|
|
* Compatibility decomposition followed by canonical composition.
|
|
* Used with the {@link #Normalizer constructors} and the static
|
|
* {@link #normalize normalize} method to determine the operation to be
|
|
* performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
|
|
* Form</a>
|
|
* <b>KC</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
*/
|
|
COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
|
|
|
|
/**
|
|
* Canonical decomposition. This value is passed to the
|
|
* {@link #Normalizer constructors} and the static
|
|
* {@link #normalize normalize}
|
|
* method to determine the operation to be performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
|
|
* Form</a>
|
|
* <b>D</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
*/
|
|
DECOMP = DECOMP_BIT,
|
|
|
|
/**
|
|
* Compatibility decomposition. This value is passed to the
|
|
* {@link #Normalizer constructors} and the static
|
|
* {@link #normalize normalize}
|
|
* method to determine the operation to be performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
|
|
* Form</a>
|
|
* <b>KD</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
*/
|
|
DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT
|
|
};
|
|
|
|
/** The options for a Normalizer object */
|
|
enum {
|
|
|
|
/**
|
|
* Option to disable Hangul/Jamo composition and decomposition.
|
|
* This option applies to Korean text,
|
|
* which can be represented either in the Jamo alphabet or in Hangul
|
|
* characters, which are really just two or three Jamo combined
|
|
* into one visual glyph. Since Jamo takes up more storage space than
|
|
* Hangul, applications that process only Hangul text may wish to turn
|
|
* this option on when decomposing text.
|
|
* <p>
|
|
* The Unicode standard treates Hangul to Jamo conversion as a
|
|
* canonical decomposition, so this option must be turned <b>off</b> if you
|
|
* wish to transform strings into one of the standard
|
|
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
|
|
* Unicode Normalization Forms</a>.
|
|
* <p>
|
|
* @see #setOption
|
|
*/
|
|
IGNORE_HANGUL = 0x001
|
|
};
|
|
|
|
// Constructors
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of a given string.
|
|
* <p>
|
|
* @param str The string to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param mode The normalization mode.
|
|
* @stable
|
|
*/
|
|
Normalizer(const UnicodeString& str,
|
|
EMode mode);
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of a given string.
|
|
* <p>
|
|
* The <tt>options</tt> parameter specifies which optional
|
|
* <tt>Normalizer</tt> features are to be enabled for this object.
|
|
* <p>
|
|
* @param str The string to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param mode The normalization mode.
|
|
*
|
|
* @param opt Any optional features to be enabled.
|
|
* Currently the only available option is {@link #IGNORE_HANGUL}
|
|
* If you want the default behavior corresponding to one of the
|
|
* standard Unicode Normalization Forms, use 0 for this argument
|
|
* @stable
|
|
*/
|
|
Normalizer(const UnicodeString& str,
|
|
EMode mode,
|
|
int32_t opt);
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of a given UChar string.
|
|
* <p>
|
|
* @param str The string to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param length Lenght of the string
|
|
* @param mode The normalization mode.
|
|
* @stable
|
|
*
|
|
*/
|
|
Normalizer(const UChar* str,
|
|
int32_t length,
|
|
EMode mode);
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of a given UChar string.
|
|
* <p>
|
|
* @param str The string to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param length Lenght of the string
|
|
* @param mode The normalization mode.
|
|
* @param opt Any optional features to be enabled.
|
|
* Currently the only available option is {@link #IGNORE_HANGUL}
|
|
* If you want the default behavior corresponding to one of the
|
|
* standard Unicode Normalization Forms, use 0 for this argument
|
|
* @add
|
|
*
|
|
*/
|
|
Normalizer(const UChar* str,
|
|
int32_t length,
|
|
EMode mode,
|
|
int32_t option);
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of the given text.
|
|
* <p>
|
|
* @param iter The input text to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param mode The normalization mode.
|
|
* @stable
|
|
*
|
|
*/
|
|
Normalizer(const CharacterIterator& iter,
|
|
EMode mode);
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of the given text.
|
|
* <p>
|
|
* @param iter The input text to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param mode The normalization mode.
|
|
*
|
|
* @param opt Any optional features to be enabled.
|
|
* Currently the only available option is {@link #IGNORE_HANGUL}
|
|
* If you want the default behavior corresponding to one of the
|
|
* standard Unicode Normalization Forms, use 0 for this argument
|
|
* @stable
|
|
*/
|
|
Normalizer(const CharacterIterator& iter,
|
|
EMode mode,
|
|
int32_t opt);
|
|
|
|
/**
|
|
* Copy constructor.
|
|
* @stable
|
|
*/
|
|
Normalizer(const Normalizer& copy);
|
|
|
|
/**
|
|
* Destructor
|
|
* @stable
|
|
*/
|
|
~Normalizer();
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
// Static utility methods
|
|
//-------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Normalizes a <tt>String</tt> using the given normalization operation.
|
|
* <p>
|
|
* The <tt>options</tt> parameter specifies which optional
|
|
* <tt>Normalizer</tt> features are to be enabled for this operation.
|
|
* Currently the only available option is {@link #IGNORE_HANGUL}.
|
|
* If you want the default behavior corresponding to one of the standard
|
|
* Unicode Normalization Forms, use 0 for this argument.
|
|
* <p>
|
|
* @param source the input string to be normalized.
|
|
*
|
|
* @param aMode the normalization mode
|
|
*
|
|
* @param options the optional features to be enabled.
|
|
*
|
|
* @param result The normalized string (on output).
|
|
*
|
|
* @param status The error code.
|
|
* @stable
|
|
*/
|
|
static void normalize(const UnicodeString& source,
|
|
EMode mode,
|
|
int32_t options,
|
|
UnicodeString& result,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Compose a <tt>String</tt>.
|
|
* <p>
|
|
* The <tt>options</tt> parameter specifies which optional
|
|
* <tt>Normalizer</tt> features are to be enabled for this operation.
|
|
* Currently the only available option is {@link #IGNORE_HANGUL}.
|
|
* If you want the default behavior corresponding
|
|
* to Unicode Normalization Form <b>C</b> or <b>KC</b>,
|
|
* use 0 for this argument.
|
|
* <p>
|
|
* @param source the string to be composed.
|
|
*
|
|
* @param compat Perform compatibility decomposition before composition.
|
|
* If this argument is <tt>false</tt>, only canonical
|
|
* decomposition will be performed.
|
|
*
|
|
* @param options the optional features to be enabled.
|
|
*
|
|
* @param result The composed string (on output).
|
|
*
|
|
* @param status The error code.
|
|
* @stable
|
|
*/
|
|
static void compose(const UnicodeString& source,
|
|
bool_t compat,
|
|
int32_t options,
|
|
UnicodeString& result,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Static method to decompose a <tt>String</tt>.
|
|
* <p>
|
|
* The <tt>options</tt> parameter specifies which optional
|
|
* <tt>Normalizer</tt> features are to be enabled for this operation.
|
|
* Currently the only available option is {@link #IGNORE_HANGUL}.
|
|
* The desired options should be OR'ed together to determine the value
|
|
* of this argument. If you want the default behavior corresponding
|
|
* to Unicode Normalization Form <b>D</b> or <b>KD</b>,
|
|
* use 0 for this argument.
|
|
* <p>
|
|
* @param str the string to be decomposed.
|
|
*
|
|
* @param compat Perform compatibility decomposition.
|
|
* If this argument is <tt>false</tt>, only canonical
|
|
* decomposition will be performed.
|
|
*
|
|
* @param options the optional features to be enabled.
|
|
*
|
|
* @param result The composed string (on output).
|
|
*
|
|
* @param status The error code.
|
|
*
|
|
* @return the decomposed string.
|
|
* @stable
|
|
*/
|
|
static void decompose(const UnicodeString& source,
|
|
bool_t compat,
|
|
int32_t options,
|
|
UnicodeString& result,
|
|
UErrorCode &status);
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
// CharacterIterator overrides
|
|
//-------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Return the current character in the normalized text.
|
|
* @draft
|
|
*/
|
|
UChar current(void) const;
|
|
|
|
/**
|
|
* Return the first character in the normalized text. This resets
|
|
* the <tt>Normalizer's</tt> position to the beginning of the text.
|
|
* @draft
|
|
*/
|
|
UChar first(void);
|
|
|
|
/**
|
|
* Return the last character in the normalized text. This resets
|
|
* the <tt>Normalizer's</tt> position to be just before the
|
|
* the input text corresponding to that normalized character.
|
|
* @draft
|
|
*/
|
|
UChar last(void);
|
|
|
|
/**
|
|
* Return the next character in the normalized text and advance
|
|
* the iteration position by one. If the end
|
|
* of the text has already been reached, {@link #DONE} is returned.
|
|
* @draft
|
|
*/
|
|
UChar next(void);
|
|
|
|
/**
|
|
* Return the previous character in the normalized text and decrement
|
|
* the iteration position by one. If the beginning
|
|
* of the text has already been reached, {@link #DONE} is returned.
|
|
* @draft
|
|
*/
|
|
UChar previous(void);
|
|
|
|
/**
|
|
* Set the iteration position in the input text that is being normalized
|
|
* and return the first normalized character at that position.
|
|
* <p>
|
|
* <b>Note:</b> This method sets the position in the <em>input</em> text,
|
|
* while {@link #next} and {@link #previous} iterate through characters
|
|
* in the normalized <em>output</em>. This means that there is not
|
|
* necessarily a one-to-one correspondence between characters returned
|
|
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
|
|
* returned from <tt>setIndex</tt> and {@link #getIndex}.
|
|
* <p>
|
|
* @param index the desired index in the input text.
|
|
*
|
|
* @return the first normalized character that is the result of iterating
|
|
* forward starting at the given index.
|
|
* @draft
|
|
*/
|
|
UChar setIndex(UTextOffset index);
|
|
|
|
/**
|
|
* Reset the iterator so that it is in the same state that it was just after
|
|
* it was constructed. A subsequent call to <tt>next</tt> will return the first
|
|
* character in the normalized text. In contrast, calling <tt>setIndex(0)</tt> followed
|
|
* by <tt>next</tt> will return the <em>second</em> character in the normalized text,
|
|
* because <tt>setIndex</tt> itself returns the first character
|
|
* @stable
|
|
*/
|
|
void reset(void);
|
|
|
|
/**
|
|
* Retrieve the current iteration position in the input text that is
|
|
* being normalized. This method is useful in applications such as
|
|
* searching, where you need to be able to determine the position in
|
|
* the input text that corresponds to a given normalized output character.
|
|
* <p>
|
|
* <b>Note:</b> This method sets the position in the <em>input</em>, while
|
|
* {@link #next} and {@link #previous} iterate through characters in the
|
|
* <em>output</em>. This means that there is not necessarily a one-to-one
|
|
* correspondence between characters returned by <tt>next</tt> and
|
|
* <tt>previous</tt> and the indices passed to and returned from
|
|
* <tt>setIndex</tt> and {@link #getIndex}.
|
|
* @stable
|
|
*/
|
|
UTextOffset getIndex(void) const;
|
|
|
|
/**
|
|
* Retrieve the index of the start of the input text. This is the begin index
|
|
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
|
|
* over which this <tt>Normalizer</tt> is iterating
|
|
* @stable
|
|
*/
|
|
UTextOffset startIndex(void) const;
|
|
|
|
/**
|
|
* Retrieve the index of the end of the input text. This is the end index
|
|
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
|
* over which this <tt>Normalizer</tt> is iterating
|
|
* @stable
|
|
*/
|
|
UTextOffset endIndex(void) const;
|
|
|
|
|
|
/**
|
|
* Returns true when both iterators refer to the same character in the same
|
|
* character-storage object.
|
|
* @stable
|
|
*/
|
|
// virtual bool_t operator==(const CharacterIterator& that) const;
|
|
bool_t operator==(const Normalizer& that) const;
|
|
inline bool_t operator!=(const Normalizer& that) const;
|
|
|
|
/**
|
|
* Returns a pointer to a new Normalizer that is a clone of this one.
|
|
* The caller is responsible for deleting the new clone.
|
|
* @stable
|
|
*/
|
|
Normalizer* clone(void) const;
|
|
|
|
/**
|
|
* Generates a hash code for this iterator.
|
|
* @stable
|
|
*/
|
|
int32_t hashCode(void) const;
|
|
|
|
//-------------------------------------------------------------------------
|
|
// Property access methods
|
|
//-------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Set the normalization mode for this object.
|
|
* <p>
|
|
* <b>Note:</b>If the normalization mode is changed while iterating
|
|
* over a string, calls to {@link #next} and {@link #previous} may
|
|
* return previously buffers characters in the old normalization mode
|
|
* until the iteration is able to re-sync at the next base character.
|
|
* It is safest to call {@link #setText setText()}, {@link #first},
|
|
* {@link #last}, etc. after calling <tt>setMode</tt>.
|
|
* <p>
|
|
* @param newMode the new mode for this <tt>Normalizer</tt>.
|
|
* The supported modes are:
|
|
* <ul>
|
|
* <li>{@link #COMPOSE} - Unicode canonical decompositiion
|
|
* followed by canonical composition.
|
|
* <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion
|
|
* follwed by canonical composition.
|
|
* <li>{@link #DECOMP} - Unicode canonical decomposition
|
|
* <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
|
|
* <li>{@link #NO_OP} - Do nothing but return characters
|
|
* from the underlying input text.
|
|
* </ul>
|
|
*
|
|
* @see #getMode
|
|
* @stable
|
|
*/
|
|
void setMode(EMode newMode);
|
|
|
|
/**
|
|
* Return the basic operation performed by this <tt>Normalizer</tt>
|
|
*
|
|
* @see #setMode
|
|
* @stable
|
|
*/
|
|
EMode getMode(void) const;
|
|
|
|
/**
|
|
* Set options that affect this <tt>Normalizer</tt>'s operation.
|
|
* Options do not change the basic composition or decomposition operation
|
|
* that is being performed , but they control whether
|
|
* certain optional portions of the operation are done.
|
|
* Currently the only available option is:
|
|
* <p>
|
|
* <ul>
|
|
* <li>{@link #IGNORE_HANGUL} - Do not decompose Hangul syllables into the
|
|
* Jamo alphabet and vice-versa. This option is off by default
|
|
* (<i>i.e.</i> Hangul processing is enabled) since the Unicode
|
|
* standard specifies that Hangul to Jamo is a canonical decomposition.
|
|
* For any of the standard Unicode Normalization
|
|
* Forms, you should leave this option off.
|
|
* </ul>
|
|
* <p>
|
|
* @param option the option whose value is to be set.
|
|
* @param value the new setting for the option. Use <tt>true</tt> to
|
|
* turn the option on and <tt>false</tt> to turn it off.
|
|
*
|
|
* @see #getOption
|
|
* @stable
|
|
*/
|
|
void setOption(int32_t option,
|
|
bool_t value);
|
|
|
|
/**
|
|
* Determine whether an option is turned on or off.
|
|
* <p>
|
|
* @see #setOption
|
|
* @stable
|
|
*/
|
|
bool_t getOption(int32_t option) const;
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning.
|
|
* @stable
|
|
*/
|
|
void setText(const UnicodeString& newText,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning.
|
|
* @stable
|
|
*/
|
|
void setText(const CharacterIterator& newText,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning.
|
|
* @stable
|
|
*/
|
|
void setText(const UChar* newText,
|
|
int32_t length,
|
|
UErrorCode &status);
|
|
/**
|
|
* Copies the text under iteration into the UnicodeString referred to by
|
|
* "result".
|
|
* @param result Receives a copy of the text under iteration.
|
|
* @draft should also return the result UnicodeString &
|
|
*/
|
|
void getText(UnicodeString& result);
|
|
|
|
/**
|
|
* Returns the text under iteration into the UChar* buffer pointer.
|
|
* @param result Receives a copy of the text under iteration.
|
|
* @add
|
|
*/
|
|
const UChar* getText(int32_t& count);
|
|
|
|
private:
|
|
// Private utility methods for iteration
|
|
// For documentation, see the source code
|
|
UChar nextCompose(void);
|
|
UChar prevCompose(void);
|
|
UChar nextDecomp(void);
|
|
UChar prevDecomp(void);
|
|
|
|
UChar curForward(void);
|
|
UChar curBackward(void);
|
|
|
|
void init(CharacterIterator* iter,
|
|
EMode mode,
|
|
int32_t option);
|
|
void initBuffer(void);
|
|
void clearBuffer(void);
|
|
|
|
// Utilities used by Compose
|
|
static void bubbleAppend(UnicodeString& target,
|
|
UChar ch,
|
|
uint32_t cclass);
|
|
static uint32_t getComposeClass(UChar ch);
|
|
static uint16_t composeLookup(UChar ch);
|
|
static uint16_t composeAction(uint16_t baseIndex,
|
|
uint16_t comIndex);
|
|
static void explode(UnicodeString& target,
|
|
uint16_t index);
|
|
static UChar pairExplode(UnicodeString& target,
|
|
uint16_t action);
|
|
|
|
// Utilities used by Decompose
|
|
static void fixCanonical(UnicodeString& result); // Reorders combining marks
|
|
static uint8_t getClass(UChar ch); // Gets char's combining class
|
|
|
|
// Other static utility methods
|
|
static void doAppend(const UChar source[],
|
|
uint16_t offset,
|
|
UnicodeString& dest);
|
|
static void doInsert(const UChar source[],
|
|
uint16_t offset,
|
|
UnicodeString& dest,
|
|
UTextOffset pos);
|
|
|
|
static void hangulToJamo(UChar ch,
|
|
UnicodeString& result,
|
|
uint16_t decompLimit);
|
|
static void jamoAppend(UChar ch,
|
|
uint16_t decompLimit,
|
|
UnicodeString& dest);
|
|
static void jamoToHangul(UnicodeString& buffer,
|
|
UTextOffset start);
|
|
|
|
//-------------------------------------------------------------------------
|
|
// Private data
|
|
//-------------------------------------------------------------------------
|
|
|
|
EMode fMode;
|
|
int32_t fOptions;
|
|
int16_t minDecomp;
|
|
|
|
// The input text and our position in it
|
|
CharacterIterator* text;
|
|
|
|
// A buffer for holding intermediate results
|
|
UnicodeString buffer;
|
|
UTextOffset bufferPos;
|
|
UTextOffset bufferLimit;
|
|
UChar currentChar;
|
|
|
|
// Another buffer for use during iterative composition
|
|
UnicodeString explodeBuf;
|
|
|
|
enum {
|
|
EMPTY = -1,
|
|
STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder
|
|
STR_LENGTH_MASK = 0x0003
|
|
};
|
|
|
|
static const UChar HANGUL_BASE;
|
|
static const UChar HANGUL_LIMIT;
|
|
static const UChar JAMO_LBASE;
|
|
static const UChar JAMO_VBASE;
|
|
static const UChar JAMO_TBASE;
|
|
static const int16_t JAMO_LCOUNT;
|
|
static const int16_t JAMO_VCOUNT;
|
|
static const int16_t JAMO_TCOUNT;
|
|
static const int16_t JAMO_NCOUNT;
|
|
|
|
friend class ComposedCharIter;
|
|
};
|
|
|
|
inline bool_t
|
|
Normalizer::operator!= (const Normalizer& other) const
|
|
{ return ! operator==(other); }
|
|
|
|
#endif // _NORMLZR
|
|
|
|
|
|
|
|
|