ICU-96 more collation cleanup, plus moving normalization C API
X-SVN-Rev: 3146
This commit is contained in:
parent
82e011125e
commit
eb03d8dab2
@ -1579,14 +1579,16 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
UChar *uTarget = uTstart;
|
||||
uint32_t sourceLen = source.length();
|
||||
uint32_t targetLen = target.length();
|
||||
if(sourceLen > tblcoll_StackBufferLen) {
|
||||
uSource = new UChar[sourceLen];
|
||||
if(sourceLen >= tblcoll_StackBufferLen) {
|
||||
uSource = new UChar[sourceLen+1];
|
||||
}
|
||||
if(targetLen > tblcoll_StackBufferLen) {
|
||||
uTarget = new UChar[targetLen];
|
||||
if(targetLen >= tblcoll_StackBufferLen) {
|
||||
uTarget = new UChar[targetLen+1];
|
||||
}
|
||||
source.extract(0, sourceLen, uSource);
|
||||
uSource[sourceLen] = 0;
|
||||
target.extract(0, targetLen, uTarget);
|
||||
uTarget[targetLen] = 0;
|
||||
Collator::EComparisonResult result = compare(uSource, sourceLen, uTarget, targetLen);
|
||||
|
||||
if(uSstart != uSource) {
|
||||
@ -1639,10 +1641,11 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
UChar sStart[tblcoll_StackBufferLen];
|
||||
UChar *uSource = sStart;
|
||||
uint32_t sourceLen = source.length();
|
||||
if(sourceLen > tblcoll_StackBufferLen) {
|
||||
uSource = new UChar[sourceLen];
|
||||
if(sourceLen >= tblcoll_StackBufferLen) {
|
||||
uSource = new UChar[sourceLen+1];
|
||||
}
|
||||
source.extract(0, sourceLen, uSource);
|
||||
uSource[sourceLen] = 0;
|
||||
CollationKey& result = RuleBasedCollator::getCollationKey(uSource, sourceLen, sortkey, status);
|
||||
if(sStart != uSource) {
|
||||
delete[] uSource;
|
||||
@ -2992,10 +2995,11 @@ int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
|
||||
UChar sStart[tblcoll_StackBufferLen];
|
||||
UChar *uSource = sStart;
|
||||
uint32_t sourceLen = source.length();
|
||||
if(sourceLen > tblcoll_StackBufferLen) {
|
||||
uSource = new UChar[sourceLen];
|
||||
if(sourceLen >= tblcoll_StackBufferLen) {
|
||||
uSource = new UChar[sourceLen+1];
|
||||
}
|
||||
source.extract(0, sourceLen, uSource);
|
||||
uSource[sourceLen] = 0;
|
||||
int32_t resLen = ucol_getSortKey((UCollator *)this, uSource, sourceLen, result, resultLength);
|
||||
if(sStart != uSource) {
|
||||
delete[] uSource;
|
||||
|
@ -14,6 +14,8 @@
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "cpputils.h"
|
||||
|
||||
|
||||
static uint8_t utf16fixup[32] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
@ -46,7 +48,7 @@ struct collIterate {
|
||||
|
||||
#define UCOL_UNMAPPEDCHARVALUE 0x7fff0000 // from coleiterator
|
||||
|
||||
#define UCOL_LEVELTERMINATOR 0
|
||||
#define UCOL_LEVELTERMINATOR 1
|
||||
#define UCOL_IGNORABLE 0x0000
|
||||
#define UCOL_CHARINDEX 0x70000000 // need look up in .commit()
|
||||
#define UCOL_EXPANDCHARINDEX 0x7E000000 // Expand index follows
|
||||
@ -65,7 +67,7 @@ struct collIterate {
|
||||
#define UCOL_SECONDARYDIFFERENCEONLY 0xffffff00 // use only the primary and secondary difference
|
||||
#define UCOL_PRIMARYORDERSHIFT 16 // primary order shift
|
||||
#define UCOL_SECONDARYORDERSHIFT 8 // secondary order shift
|
||||
#define UCOL_SORTKEYOFFSET 1 // minimum sort key offset
|
||||
#define UCOL_SORTKEYOFFSET 2 // minimum sort key offset
|
||||
#define UCOL_CONTRACTCHAROVERFLOW 0x7FFFFFFF // Indicates the char is a contract char
|
||||
|
||||
#define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT)
|
||||
@ -83,48 +85,6 @@ struct collIterate {
|
||||
*/
|
||||
#define UCOL_ISTHAIBASECONSONANT(ch) ((uint32_t)(ch) - 0xe01) <= (0xe2e - 0xe01)
|
||||
|
||||
/*
 * u_normalize: C API wrapper that normalizes a UChar string by delegating
 * to Normalizer::normalize.
 *
 * source        - text to normalize.
 * sourceLength  - length of source; -1 means the length is computed with
 *                 u_strlen(source).
 * mode          - UCOL_* normalization mode; translated to a
 *                 Normalizer::EMode by the switch below.
 * option        - passed through unchanged to Normalizer::normalize.
 * result        - caller-supplied output buffer.
 * resultLength  - size of result, as handed to the UnicodeString wrapper
 *                 and to T_fillOutputParams.
 * status        - in/out error code; dereferenced unconditionally, so it
 *                 must not be NULL.
 *
 * Returns -1 if *status already indicates failure on entry, or if mode is
 * not one of the handled values (in which case *status is set to
 * U_ILLEGAL_ARGUMENT_ERROR). Otherwise returns the length reported by
 * T_fillOutputParams.
 */
U_CAPI int32_t
|
||||
u_normalize(const UChar* source,
|
||||
int32_t sourceLength,
|
||||
UNormalizationMode mode,
|
||||
int32_t option,
|
||||
UChar* result,
|
||||
int32_t resultLength,
|
||||
UErrorCode* status)
|
||||
{
|
||||
// Do nothing if the caller already has an error pending.
if(U_FAILURE(*status)) return -1;
|
||||
|
||||
// Map the C-level mode constant onto the C++ Normalizer mode.
Normalizer::EMode normMode;
|
||||
switch(mode) {
|
||||
case UCOL_NO_NORMALIZATION:
|
||||
normMode = Normalizer::NO_OP;
|
||||
break;
|
||||
case UCOL_DECOMP_CAN:
|
||||
normMode = Normalizer::DECOMP;
|
||||
break;
|
||||
case UCOL_DECOMP_COMPAT:
|
||||
normMode = Normalizer::DECOMP_COMPAT;
|
||||
break;
|
||||
case UCOL_DECOMP_CAN_COMP_COMPAT:
|
||||
normMode = Normalizer::COMPOSE;
|
||||
break;
|
||||
case UCOL_DECOMP_COMPAT_COMP_CAN:
|
||||
normMode = Normalizer::COMPOSE_COMPAT;
|
||||
break;
|
||||
default:
|
||||
// Any other value is rejected as an illegal argument.
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// -1 means "NUL-terminated": measure the source here.
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
|
||||
// NOTE(review): presumably the buffer-wrapping UnicodeString constructor
// (buffer, length, capacity), hence the const cast -- confirm.
const UnicodeString src((UChar*)source, len, len);
|
||||
// Destination string built over the caller's buffer, initial length 0.
UnicodeString dst(result, 0, resultLength);
|
||||
Normalizer::normalize(src, normMode, option, dst, *status);
|
||||
int32_t actualLen;
|
||||
// NOTE(review): T_fillOutputParams is defined elsewhere; presumably it
// copies dst into result and reports the length via actualLen -- confirm.
T_fillOutputParams(&dst, result, resultLength, &actualLen, status);
|
||||
return actualLen;
|
||||
}
|
||||
|
||||
U_CAPI UCollator*
|
||||
ucol_open( const char *loc,
|
||||
UErrorCode *status)
|
||||
@ -420,23 +380,24 @@ int32_t getComplicatedCE(const UCollator *coll, collIterate *source, UErrorCode
|
||||
EntryPair *pair = (EntryPair *)list->at(0); // Taking out the first one.
|
||||
int32_t order = pair->value; // This got us mapping for just the first element - the one that signalled a contraction.
|
||||
|
||||
key[posKey++] = *(source->pos);
|
||||
key[posKey++] = *(source->pos++);
|
||||
// This tries to find the longest common match for the data in contraction table...
|
||||
// and needs to be rewritten, especially the test down there!
|
||||
int32_t i;
|
||||
int32_t listSize = list->size();
|
||||
UBool foundSmaller = TRUE;
|
||||
while(source->pos<source->len && foundSmaller) {
|
||||
|
||||
key[posKey++] = *(++source->pos);
|
||||
key[posKey++] = *source->pos;
|
||||
|
||||
foundSmaller = FALSE;
|
||||
i = 0;
|
||||
while(i<listSize && !foundSmaller) {
|
||||
pair = list->at(i);
|
||||
if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
|
||||
order = pair->value;
|
||||
foundSmaller = TRUE;
|
||||
if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
|
||||
/* Found a matching contraction sequence */
|
||||
order = pair->value; /* change the CE value */
|
||||
source->pos++; /* consume another char from the source */
|
||||
foundSmaller = TRUE;
|
||||
}
|
||||
i++;
|
||||
|
||||
@ -520,7 +481,7 @@ struct incrementalContext {
|
||||
};
|
||||
|
||||
|
||||
void init_incrementalContext(UCharForwardIterator *source, void *sourceContext, incrementalContext *s, UBool isWritable) {
|
||||
void init_incrementalContext(UCharForwardIterator *source, void *sourceContext, incrementalContext *s) {
|
||||
s->len = s->pos = s->string ;
|
||||
s->CEpos = s->toReturn = s->CEs;
|
||||
s->source = source;
|
||||
@ -588,9 +549,9 @@ int32_t ucol_getIncrementalCE(const UCollator *coll, incrementalContext *ctx, UE
|
||||
int32_t listSize = list->size();
|
||||
UBool foundSmaller = TRUE;
|
||||
UBool endOfString = FALSE;
|
||||
*(ctx->len++) = ctx->lastChar;
|
||||
while(!endOfString && foundSmaller) {
|
||||
endOfString = ((ctx->lastChar = ctx->source(ctx->sourceContext)) == 0xFFFF);
|
||||
*(ctx->len++) = ctx->lastChar;
|
||||
key[posKey++] = ctx->lastChar;
|
||||
|
||||
foundSmaller = FALSE;
|
||||
@ -599,13 +560,13 @@ int32_t ucol_getIncrementalCE(const UCollator *coll, incrementalContext *ctx, UE
|
||||
pair = list->at(i);
|
||||
if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
|
||||
order = pair->value;
|
||||
*(ctx->len++) = ctx->lastChar;
|
||||
foundSmaller = TRUE;
|
||||
}
|
||||
i++;
|
||||
|
||||
}
|
||||
}
|
||||
//*(ctx->CEpos) = order;
|
||||
}
|
||||
}
|
||||
// Expansion sequence start...
|
||||
@ -654,8 +615,8 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
||||
|
||||
incrementalContext sColl, tColl;
|
||||
|
||||
init_incrementalContext(source, sourceContext, &sColl, FALSE);
|
||||
init_incrementalContext(target, targetContext, &tColl, FALSE);
|
||||
init_incrementalContext(source, sourceContext, &sColl);
|
||||
init_incrementalContext(target, targetContext, &tColl);
|
||||
|
||||
if(cppColl->getDecomposition() != Normalizer::NO_OP) { // run away screaming!!!!
|
||||
return alternateIncrementalProcessing(coll, &sColl, &tColl);
|
||||
@ -667,7 +628,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
||||
}
|
||||
|
||||
UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
|
||||
int32_t sOrder, tOrder;
|
||||
uint32_t sOrder=UCOL_NULLORDER, tOrder=UCOL_NULLORDER;
|
||||
uint32_t pSOrder, pTOrder;
|
||||
UBool gets = TRUE, gett = TRUE;
|
||||
UBool initialCheckSecTer = strength >= UCOL_SECONDARY;
|
||||
@ -881,7 +842,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
||||
sOrder = ucol_getIncrementalCE(coll, &sColl, &status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
}
|
||||
|
||||
|
||||
gets = TRUE;
|
||||
|
||||
if (gett)
|
||||
@ -1072,7 +1033,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// For IDENTICAL comparisons, we use a bitwise character comparison
|
||||
// as a tiebreaker if all else is equal
|
||||
// NOTE: The java code compares result with 0, and
|
||||
@ -1150,7 +1111,7 @@ ucol_strcoll( const UCollator *coll,
|
||||
}
|
||||
|
||||
UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
|
||||
int32_t sOrder, tOrder;
|
||||
uint32_t sOrder=UCOL_NULLORDER, tOrder=UCOL_NULLORDER;
|
||||
uint32_t pSOrder, pTOrder;
|
||||
UBool gets = TRUE, gett = TRUE;
|
||||
UBool initialCheckSecTer = strength >= UCOL_SECONDARY;
|
||||
@ -1625,16 +1586,6 @@ ucol_getSortKey(const UCollator *coll,
|
||||
int32_t resultLength)
|
||||
{
|
||||
|
||||
/*
|
||||
Still problems in:
|
||||
SUMMARY:
|
||||
******* [Total error count: 213]
|
||||
Errors in
|
||||
[tscoll/capitst/TestSortKey] // this is normal, since we are changing binary keys
|
||||
[tscoll/cfrtst/TestSecondary] // this is also OK, ICU original implementation was messed up
|
||||
[tscoll/cfrtst/TestTertiary] // probably the same as above
|
||||
*/
|
||||
|
||||
uint32_t i = 0; // general purpose counter
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@ -1647,6 +1598,15 @@ ucol_getSortKey(const UCollator *coll,
|
||||
UChar *normSource = normBuffer;
|
||||
int32_t normSourceLen = 2048;
|
||||
|
||||
for(i = 0; i<UCOL_MAX_BUFFER; i++) {
|
||||
prim[i]=second[i]=tert[i]='\0';
|
||||
}
|
||||
|
||||
for(i = UCOL_MAX_BUFFER; i<2*UCOL_MAX_BUFFER; i++) {
|
||||
prim[i]=normBuffer[i]='\0';
|
||||
}
|
||||
|
||||
|
||||
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
|
||||
|
||||
UBool compareSec = (((RuleBasedCollator *)coll)->getStrength() >= Collator::SECONDARY);
|
||||
@ -1667,7 +1627,7 @@ ucol_getSortKey(const UCollator *coll,
|
||||
uint8_t *secstart = secondaries;
|
||||
uint8_t *terstart = tertiaries;
|
||||
|
||||
collIterate s;
|
||||
collIterate s;
|
||||
init_collIterate((UChar *)source, len, &s, FALSE);
|
||||
|
||||
// If we need to normalize, we'll do it all at once at the beginning!
|
||||
@ -1687,7 +1647,7 @@ ucol_getSortKey(const UCollator *coll,
|
||||
s.len = normSource+normSourceLen;
|
||||
}
|
||||
|
||||
int32_t order = 0;
|
||||
uint32_t order = 0;
|
||||
|
||||
uint16_t primary = 0;
|
||||
uint8_t secondary = 0;
|
||||
@ -1700,8 +1660,8 @@ ucol_getSortKey(const UCollator *coll,
|
||||
tertiary = (order & UCOL_TERTIARYORDERMASK);
|
||||
|
||||
if(primary != UCOL_IGNORABLE) {
|
||||
*(primaries++) = (primary+UCOL_SORTKEYOFFSET)>>8;
|
||||
*(primaries++) = (primary+UCOL_SORTKEYOFFSET)&0xFF;
|
||||
*(primaries++) = (primary>>8)+UCOL_SORTKEYOFFSET;
|
||||
*(primaries++) = (primary&0xFF)+UCOL_SORTKEYOFFSET;
|
||||
if(compareSec) {
|
||||
*(secondaries++) = secondary+UCOL_SORTKEYOFFSET;
|
||||
}
|
||||
@ -1719,11 +1679,10 @@ ucol_getSortKey(const UCollator *coll,
|
||||
UCOL_GETNEXTCE(order, coll, s, status);
|
||||
}
|
||||
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
|
||||
|
||||
if(compareSec) {
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
uint32_t secsize = secondaries-secstart;
|
||||
if(ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, &status) == UCOL_ON) { // do the reverse copy
|
||||
for(i = 0; i<secsize; i++) {
|
||||
@ -1734,27 +1693,28 @@ ucol_getSortKey(const UCollator *coll,
|
||||
primaries += secsize;
|
||||
}
|
||||
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
}
|
||||
|
||||
if(compareTer) {
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
uint32_t tersize = tertiaries - terstart;
|
||||
uprv_memcpy(primaries, terstart, tersize);
|
||||
primaries += tersize;
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
}
|
||||
|
||||
|
||||
if(compareIdent) {
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
UChar *ident = s.string;
|
||||
while(ident < s.len) {
|
||||
*(primaries++) = (*(ident) >> 8) + utf16fixup[*(ident) >> 11];
|
||||
*(primaries++) = (*(ident) & 0xFF);
|
||||
ident++;
|
||||
}
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
}
|
||||
|
||||
*(primaries++) = '\0';
|
||||
|
||||
uprv_memcpy(result, primstart, uprv_min(resultLength, (primaries-primstart)));
|
||||
|
||||
if(terstart != tert) {
|
||||
|
@ -8,6 +8,7 @@
|
||||
#define UCOL_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unorm.h"
|
||||
/**
|
||||
* @name Collator C API
|
||||
*
|
||||
@ -105,20 +106,20 @@ typedef void* UCollator;
|
||||
* @see u_strcoll()
|
||||
**/
|
||||
/** Possible values for a comparison result */
|
||||
enum UCollationResult {
|
||||
typedef enum {
|
||||
/** string a == string b */
|
||||
UCOL_EQUAL = 0,
|
||||
/** string a > string b */
|
||||
UCOL_GREATER = 1,
|
||||
/** string a < string b */
|
||||
UCOL_LESS = -1
|
||||
};
|
||||
typedef enum UCollationResult UCollationResult;
|
||||
} UCollationResult ;
|
||||
|
||||
|
||||
typedef enum {
|
||||
/* accepted by most attributes */
|
||||
UCOL_DEFAULT = -1,
|
||||
|
||||
/* for UCOL_STRENGTH */
|
||||
/** Primary collation strength */
|
||||
UCOL_PRIMARY = 0,
|
||||
@ -126,61 +127,33 @@ typedef enum {
|
||||
UCOL_SECONDARY = 1,
|
||||
/** Tertiary collation strength */
|
||||
UCOL_TERTIARY = 2,
|
||||
/** Default collation strength */
|
||||
UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
|
||||
/** Quaternary collation strength */
|
||||
UCOL_QUATERNARY=3,
|
||||
/** Identical collation strength */
|
||||
UCOL_IDENTICAL=15,
|
||||
|
||||
/* for UCOL_FRENCH_COLLATION & UCOL_CASE_LEVEL*/
|
||||
/* for UCOL_FRENCH_COLLATION, UCOL_CASE_LEVEL & UCOL_DECOMPOSITION_MODE*/
|
||||
UCOL_OFF = 16,
|
||||
UCOL_ON = 17,
|
||||
|
||||
/* for UCOL_ALTERNATE_HANDLING */
|
||||
UCOL_SHIFTED = 0,
|
||||
UCOL_NON_IGNORABLE = 1,
|
||||
UCOL_SHIFTED = 20,
|
||||
UCOL_NON_IGNORABLE = 21,
|
||||
|
||||
/* for UCOL_CASE_FIRST */
|
||||
UCOL_LOWER_FIRST = 0,
|
||||
UCOL_UPPER_FIRST = 1,
|
||||
UCOL_LOWER_FIRST = 24,
|
||||
UCOL_UPPER_FIRST = 25,
|
||||
|
||||
/* for UCOL_NORMALIZATION_MODE */
|
||||
/** No decomposition/composition */
|
||||
UCOL_NO_NORMALIZATION = 1,
|
||||
/** Canonical decomposition */
|
||||
UCOL_DECOMP_CAN = 2,
|
||||
/** Compatibility decomposition */
|
||||
UCOL_DECOMP_COMPAT = 3,
|
||||
/** Default normalization */
|
||||
UCOL_DEFAULT_NORMALIZATION = UCOL_DECOMP_COMPAT,
|
||||
/** Canonical decomposition followed by canonical composition */
|
||||
UCOL_DECOMP_CAN_COMP_COMPAT = 4,
|
||||
/** Compatibility decomposition followed by canonical composition */
|
||||
UCOL_DECOMP_COMPAT_COMP_CAN =5,
|
||||
/** Default collation strength */
|
||||
UCOL_ON_WITHOUT_HANGUL = 28,
|
||||
|
||||
/** No more attribute values after this*/
|
||||
UCOL_ATTRIBUTE_VALUE_COUNT
|
||||
|
||||
} UColAttributeValue;
|
||||
|
||||
/**
|
||||
* UCOL_NO_NORMALIZATION : Accented characters will not be decomposed for sorting.
|
||||
* UCOL_DECOMP_CAN : Characters that are canonical variants according
|
||||
* to Unicode 2.0 will be decomposed for sorting.
|
||||
* UCOL_DECOMP_COMPAT : Characters that are compatibility variants will be
|
||||
* decomposed for sorting. This is the default normalization mode used.
|
||||
* UCOL_DECOMP_CAN_COMP_COMPAT : Canonical decomposition followed by canonical composition
|
||||
* UCOL_DECOMP_COMPAT_COMP_CAN : Compatibility decomposition followed by canonical composition
|
||||
*
|
||||
**/
|
||||
/** Possible collation normalization modes - see UColAttributeValue for the enum */
|
||||
typedef UColAttributeValue UNormalizationMode;
|
||||
|
||||
/**
 * Possible normalization options. These are the values accepted, ORed
 * together, by the <tt>options</tt> argument of <tt>u_normalize</tt>.
 */
|
||||
typedef enum {
|
||||
/** Do not normalize Hangul */
|
||||
UCOL_IGNORE_HANGUL = 1
|
||||
} UNormalizationOption;
|
||||
|
||||
/**
|
||||
* Base letter represents a primary difference. Set comparison
|
||||
* level to UCOL_PRIMARY to ignore secondary and tertiary differences.
|
||||
@ -218,91 +191,6 @@ typedef enum {
|
||||
UCOL_ATTRIBUTE_COUNT
|
||||
} UColAttribute;
|
||||
|
||||
/**
|
||||
* @name Unicode normalization API
|
||||
*
|
||||
* <tt>u_normalize</tt> transforms Unicode text into an equivalent composed or
|
||||
* decomposed form, allowing for easier sorting and searching of text.
|
||||
* <tt>u_normalize</tt> supports the standard normalization forms described in
|
||||
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
|
||||
* Unicode Technical Report #15</a>.
|
||||
* <p>
|
||||
* Characters with accents or other adornments can be encoded in
|
||||
* several different ways in Unicode. For example, take the character "Á"
|
||||
* (A-acute). In Unicode, this can be encoded as a single character (the
|
||||
* "composed" form):
|
||||
* <pre>
|
||||
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre>
|
||||
* or as two separate characters (the "decomposed" form):
|
||||
* <pre>
|
||||
* 0041 LATIN CAPITAL LETTER A
|
||||
* 0301 COMBINING ACUTE ACCENT</pre>
|
||||
* <p>
|
||||
* To a user of your program, however, both of these sequences should be
|
||||
* treated as the same "user-level" character "Á". When you are searching or
|
||||
* comparing text, you must ensure that these two sequences are treated
|
||||
* equivalently. In addition, you must handle characters with more than one
|
||||
* accent. Sometimes the order of a character's combining accents is
|
||||
* significant, while in other cases accent sequences in different orders are
|
||||
* really equivalent.
|
||||
* <p>
|
||||
* Similarly, the string "ffi" can be encoded as three separate letters:
|
||||
* <pre>
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0069 LATIN SMALL LETTER I</pre>
|
||||
* or as the single character
|
||||
* <pre>
|
||||
* FB03 LATIN SMALL LIGATURE FFI</pre>
|
||||
* <p>
|
||||
* The ffi ligature is not a distinct semantic character, and strictly speaking
|
||||
* it shouldn't be in Unicode at all, but it was included for compatibility
|
||||
* with existing character sets that already provided it. The Unicode standard
|
||||
* identifies such characters by giving them "compatibility" decompositions
|
||||
* into the corresponding semantic characters. When sorting and searching, you
|
||||
* will often want to use these mappings.
|
||||
* <p>
|
||||
* <tt>u_normalize</tt> helps solve these problems by transforming text into the
|
||||
* canonical composed and decomposed forms as shown in the first example above.
|
||||
* In addition, you can have it perform compatibility decompositions so that
|
||||
* you can treat compatibility characters the same as their equivalents.
|
||||
* Finally, <tt>u_normalize</tt> rearranges accents into the proper canonical
|
||||
* order, so that you do not have to worry about accent rearrangement on your
|
||||
* own.
|
||||
* <p>
|
||||
* <tt>u_normalize</tt> adds one optional behavior, {@link #UCOL_IGNORE_HANGUL},
|
||||
* that differs from
|
||||
* the standard Unicode Normalization Forms.
|
||||
**/
|
||||
|
||||
|
||||
/**
|
||||
* Normalize a string.
|
||||
* The string will be normalized according to the specified normalization mode
|
||||
* and options.
|
||||
* @param source The string to normalize.
|
||||
* @param sourceLength The length of source, or -1 if null-terminated.
|
||||
* @param mode The normalization mode; one of UCOL_NO_NORMALIZATION,
|
||||
* UCOL_DECOMP_CAN, UCOL_DECOMP_COMPAT, UCOL_DECOMP_CAN_COMP_COMPAT,
|
||||
* UCOL_DECOMP_COMPAT_COMP_CAN, UCOL_DEFAULT_NORMALIZATION
|
||||
* @param options The normalization options, ORed together; possible values
|
||||
* are UCOL_IGNORE_HANGUL
|
||||
* @param result A pointer to a buffer to receive the attribute.
|
||||
* @param resultLength The maximum size of result.
|
||||
* @param status A pointer to an UErrorCode to receive any errors
|
||||
* @return The total buffer size needed; if greater than resultLength,
|
||||
* the output was truncated.
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI int32_t
|
||||
u_normalize(const UChar* source,
|
||||
int32_t sourceLength,
|
||||
UNormalizationMode mode,
|
||||
int32_t options,
|
||||
UChar* result,
|
||||
int32_t resultLength,
|
||||
UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Open a UCollator for comparing strings.
|
||||
* The UCollator may be used in calls to \Ref{ucol_strcoll}.
|
||||
|
Loading…
Reference in New Issue
Block a user