ICU-96 more collation cleanup, plus moving normalization C API

X-SVN-Rev: 3146
This commit is contained in:
Vladimir Weinstein 2000-12-06 00:53:48 +00:00
parent 82e011125e
commit eb03d8dab2
3 changed files with 63 additions and 211 deletions

View File

@ -1579,14 +1579,16 @@ RuleBasedCollator::compare(const UnicodeString& source,
UChar *uTarget = uTstart;
uint32_t sourceLen = source.length();
uint32_t targetLen = target.length();
if(sourceLen > tblcoll_StackBufferLen) {
uSource = new UChar[sourceLen];
if(sourceLen >= tblcoll_StackBufferLen) {
uSource = new UChar[sourceLen+1];
}
if(targetLen > tblcoll_StackBufferLen) {
uTarget = new UChar[targetLen];
if(targetLen >= tblcoll_StackBufferLen) {
uTarget = new UChar[targetLen+1];
}
source.extract(0, sourceLen, uSource);
uSource[sourceLen] = 0;
target.extract(0, targetLen, uTarget);
uTarget[targetLen] = 0;
Collator::EComparisonResult result = compare(uSource, sourceLen, uTarget, targetLen);
if(uSstart != uSource) {
@ -1639,10 +1641,11 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
UChar sStart[tblcoll_StackBufferLen];
UChar *uSource = sStart;
uint32_t sourceLen = source.length();
if(sourceLen > tblcoll_StackBufferLen) {
uSource = new UChar[sourceLen];
if(sourceLen >= tblcoll_StackBufferLen) {
uSource = new UChar[sourceLen+1];
}
source.extract(0, sourceLen, uSource);
uSource[sourceLen] = 0;
CollationKey& result = RuleBasedCollator::getCollationKey(uSource, sourceLen, sortkey, status);
if(sStart != uSource) {
delete[] uSource;
@ -2992,10 +2995,11 @@ int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
UChar sStart[tblcoll_StackBufferLen];
UChar *uSource = sStart;
uint32_t sourceLen = source.length();
if(sourceLen > tblcoll_StackBufferLen) {
uSource = new UChar[sourceLen];
if(sourceLen >= tblcoll_StackBufferLen) {
uSource = new UChar[sourceLen+1];
}
source.extract(0, sourceLen, uSource);
uSource[sourceLen] = 0;
int32_t resLen = ucol_getSortKey((UCollator *)this, uSource, sourceLen, result, resultLength);
if(sStart != uSource) {
delete[] uSource;

View File

@ -14,6 +14,8 @@
#include "unicode/ustring.h"
#include "unicode/normlzr.h"
#include "cpputils.h"
static uint8_t utf16fixup[32] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@ -46,7 +48,7 @@ struct collIterate {
#define UCOL_UNMAPPEDCHARVALUE 0x7fff0000 // from coleiterator
#define UCOL_LEVELTERMINATOR 0
#define UCOL_LEVELTERMINATOR 1
#define UCOL_IGNORABLE 0x0000
#define UCOL_CHARINDEX 0x70000000 // need look up in .commit()
#define UCOL_EXPANDCHARINDEX 0x7E000000 // Expand index follows
@ -65,7 +67,7 @@ struct collIterate {
#define UCOL_SECONDARYDIFFERENCEONLY 0xffffff00 // use only the primary and secondary difference
#define UCOL_PRIMARYORDERSHIFT 16 // primary order shift
#define UCOL_SECONDARYORDERSHIFT 8 // secondary order shift
#define UCOL_SORTKEYOFFSET 1 // minimum sort key offset
#define UCOL_SORTKEYOFFSET 2 // minimum sort key offset
#define UCOL_CONTRACTCHAROVERFLOW 0x7FFFFFFF // Indicates the char is a contract char
#define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT)
@ -83,48 +85,6 @@ struct collIterate {
*/
#define UCOL_ISTHAIBASECONSONANT(ch) ((uint32_t)(ch) - 0xe01) <= (0xe2e - 0xe01)
/*
 * C API wrapper around Normalizer::normalize().
 *
 * Translates the C-level normalization mode into a Normalizer::EMode,
 * wraps the caller's buffers in UnicodeString objects, runs the
 * normalization and hands the result back through T_fillOutputParams().
 *
 * source        text to normalize
 * sourceLength  number of UChars in source, or -1 to have the length
 *               computed with u_strlen()
 * mode          one of the five UCOL_* normalization values handled by the
 *               switch below; any other value is rejected
 * option        passed through unchanged to Normalizer::normalize()
 * result        caller-supplied output buffer
 * resultLength  size of result, passed as the capacity of the output string
 * status        in/out error code; dereferenced unconditionally, so it must
 *               not be NULL
 *
 * Returns the length reported by T_fillOutputParams(), or -1 when *status
 * already indicates failure on entry or when mode is not recognized (in the
 * latter case *status is set to U_ILLEGAL_ARGUMENT_ERROR).
 */
U_CAPI int32_t
u_normalize(const UChar* source,
int32_t sourceLength,
UNormalizationMode mode,
int32_t option,
UChar* result,
int32_t resultLength,
UErrorCode* status)
{
// Do nothing if the caller arrives with an error already set.
if(U_FAILURE(*status)) return -1;
Normalizer::EMode normMode;
// Map the C enum value onto the C++ Normalizer mode.
switch(mode) {
case UCOL_NO_NORMALIZATION:
normMode = Normalizer::NO_OP;
break;
case UCOL_DECOMP_CAN:
normMode = Normalizer::DECOMP;
break;
case UCOL_DECOMP_COMPAT:
normMode = Normalizer::DECOMP_COMPAT;
break;
case UCOL_DECOMP_CAN_COMP_COMPAT:
normMode = Normalizer::COMPOSE;
break;
case UCOL_DECOMP_COMPAT_COMP_CAN:
normMode = Normalizer::COMPOSE_COMPAT;
break;
default:
// Unknown mode: report it and bail out without touching result.
*status = U_ILLEGAL_ARGUMENT_ERROR;
return -1;
}
// Only -1 is treated as "NUL-terminated"; any other value is used as given.
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
// The cast drops const from source so it can be handed to this constructor.
// NOTE(review): presumably this constructor aliases the buffer rather than
// copying it - confirm against the UnicodeString documentation.
const UnicodeString src((UChar*)source, len, len);
// Output string built over the caller's buffer: length 0, capacity resultLength.
UnicodeString dst(result, 0, resultLength);
Normalizer::normalize(src, normMode, option, dst, *status);
int32_t actualLen;
// NOTE(review): T_fillOutputParams is defined elsewhere; it appears to copy
// dst into result and store the output length in actualLen - confirm.
T_fillOutputParams(&dst, result, resultLength, &actualLen, status);
return actualLen;
}
U_CAPI UCollator*
ucol_open( const char *loc,
UErrorCode *status)
@ -420,23 +380,24 @@ int32_t getComplicatedCE(const UCollator *coll, collIterate *source, UErrorCode
EntryPair *pair = (EntryPair *)list->at(0); // Taking out the first one.
int32_t order = pair->value; // This got us mapping for just the first element - the one that signalled a contraction.
key[posKey++] = *(source->pos);
key[posKey++] = *(source->pos++);
// This tries to find the longest common match for the data in contraction table...
// and needs to be rewritten, especially the test down there!
int32_t i;
int32_t listSize = list->size();
UBool foundSmaller = TRUE;
while(source->pos<source->len && foundSmaller) {
key[posKey++] = *(++source->pos);
key[posKey++] = *source->pos;
foundSmaller = FALSE;
i = 0;
while(i<listSize && !foundSmaller) {
pair = list->at(i);
if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
order = pair->value;
foundSmaller = TRUE;
if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
/* Found a matching contraction sequence */
order = pair->value; /* change the CE value */
source->pos++; /* consume another char from the source */
foundSmaller = TRUE;
}
i++;
@ -520,7 +481,7 @@ struct incrementalContext {
};
void init_incrementalContext(UCharForwardIterator *source, void *sourceContext, incrementalContext *s, UBool isWritable) {
void init_incrementalContext(UCharForwardIterator *source, void *sourceContext, incrementalContext *s) {
s->len = s->pos = s->string ;
s->CEpos = s->toReturn = s->CEs;
s->source = source;
@ -588,9 +549,9 @@ int32_t ucol_getIncrementalCE(const UCollator *coll, incrementalContext *ctx, UE
int32_t listSize = list->size();
UBool foundSmaller = TRUE;
UBool endOfString = FALSE;
*(ctx->len++) = ctx->lastChar;
while(!endOfString && foundSmaller) {
endOfString = ((ctx->lastChar = ctx->source(ctx->sourceContext)) == 0xFFFF);
*(ctx->len++) = ctx->lastChar;
key[posKey++] = ctx->lastChar;
foundSmaller = FALSE;
@ -599,13 +560,13 @@ int32_t ucol_getIncrementalCE(const UCollator *coll, incrementalContext *ctx, UE
pair = list->at(i);
if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
order = pair->value;
*(ctx->len++) = ctx->lastChar;
foundSmaller = TRUE;
}
i++;
}
}
//*(ctx->CEpos) = order;
}
}
// Expansion sequence start...
@ -654,8 +615,8 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
incrementalContext sColl, tColl;
init_incrementalContext(source, sourceContext, &sColl, FALSE);
init_incrementalContext(target, targetContext, &tColl, FALSE);
init_incrementalContext(source, sourceContext, &sColl);
init_incrementalContext(target, targetContext, &tColl);
if(cppColl->getDecomposition() != Normalizer::NO_OP) { // run away screaming!!!!
return alternateIncrementalProcessing(coll, &sColl, &tColl);
@ -667,7 +628,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
}
UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
int32_t sOrder, tOrder;
uint32_t sOrder=UCOL_NULLORDER, tOrder=UCOL_NULLORDER;
uint32_t pSOrder, pTOrder;
UBool gets = TRUE, gett = TRUE;
UBool initialCheckSecTer = strength >= UCOL_SECONDARY;
@ -881,7 +842,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
sOrder = ucol_getIncrementalCE(coll, &sColl, &status);
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
}
gets = TRUE;
if (gett)
@ -1072,7 +1033,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
}
}
// For IDENTICAL comparisons, we use a bitwise character comparison
// as a tiebreaker if all else is equal
// NOTE: The java code compares result with 0, and
@ -1150,7 +1111,7 @@ ucol_strcoll( const UCollator *coll,
}
UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
int32_t sOrder, tOrder;
uint32_t sOrder=UCOL_NULLORDER, tOrder=UCOL_NULLORDER;
uint32_t pSOrder, pTOrder;
UBool gets = TRUE, gett = TRUE;
UBool initialCheckSecTer = strength >= UCOL_SECONDARY;
@ -1625,16 +1586,6 @@ ucol_getSortKey(const UCollator *coll,
int32_t resultLength)
{
/*
Still problems in:
SUMMARY:
******* [Total error count: 213]
Errors in
[tscoll/capitst/TestSortKey] // this is normal, since we are changing binary keys
[tscoll/cfrtst/TestSecondary] // this is also OK, ICU original implementation was messed up
[tscoll/cfrtst/TestTertiary] // probably the same as above
*/
uint32_t i = 0; // general purpose counter
UErrorCode status = U_ZERO_ERROR;
@ -1647,6 +1598,15 @@ ucol_getSortKey(const UCollator *coll,
UChar *normSource = normBuffer;
int32_t normSourceLen = 2048;
for(i = 0; i<UCOL_MAX_BUFFER; i++) {
prim[i]=second[i]=tert[i]='\0';
}
for(i = UCOL_MAX_BUFFER; i<2*UCOL_MAX_BUFFER; i++) {
prim[i]=normBuffer[i]='\0';
}
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
UBool compareSec = (((RuleBasedCollator *)coll)->getStrength() >= Collator::SECONDARY);
@ -1667,7 +1627,7 @@ ucol_getSortKey(const UCollator *coll,
uint8_t *secstart = secondaries;
uint8_t *terstart = tertiaries;
collIterate s;
collIterate s;
init_collIterate((UChar *)source, len, &s, FALSE);
// If we need to normalize, we'll do it all at once at the beginning!
@ -1687,7 +1647,7 @@ ucol_getSortKey(const UCollator *coll,
s.len = normSource+normSourceLen;
}
int32_t order = 0;
uint32_t order = 0;
uint16_t primary = 0;
uint8_t secondary = 0;
@ -1700,8 +1660,8 @@ ucol_getSortKey(const UCollator *coll,
tertiary = (order & UCOL_TERTIARYORDERMASK);
if(primary != UCOL_IGNORABLE) {
*(primaries++) = (primary+UCOL_SORTKEYOFFSET)>>8;
*(primaries++) = (primary+UCOL_SORTKEYOFFSET)&0xFF;
*(primaries++) = (primary>>8)+UCOL_SORTKEYOFFSET;
*(primaries++) = (primary&0xFF)+UCOL_SORTKEYOFFSET;
if(compareSec) {
*(secondaries++) = secondary+UCOL_SORTKEYOFFSET;
}
@ -1719,11 +1679,10 @@ ucol_getSortKey(const UCollator *coll,
UCOL_GETNEXTCE(order, coll, s, status);
}
*(primaries++) = UCOL_LEVELTERMINATOR;
*(primaries++) = UCOL_LEVELTERMINATOR;
if(compareSec) {
*(primaries++) = UCOL_LEVELTERMINATOR;
uint32_t secsize = secondaries-secstart;
if(ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, &status) == UCOL_ON) { // do the reverse copy
for(i = 0; i<secsize; i++) {
@ -1734,27 +1693,28 @@ ucol_getSortKey(const UCollator *coll,
primaries += secsize;
}
*(primaries++) = UCOL_LEVELTERMINATOR;
}
if(compareTer) {
*(primaries++) = UCOL_LEVELTERMINATOR;
uint32_t tersize = tertiaries - terstart;
uprv_memcpy(primaries, terstart, tersize);
primaries += tersize;
*(primaries++) = UCOL_LEVELTERMINATOR;
}
if(compareIdent) {
*(primaries++) = UCOL_LEVELTERMINATOR;
UChar *ident = s.string;
while(ident < s.len) {
*(primaries++) = (*(ident) >> 8) + utf16fixup[*(ident) >> 11];
*(primaries++) = (*(ident) & 0xFF);
ident++;
}
*(primaries++) = UCOL_LEVELTERMINATOR;
}
*(primaries++) = '\0';
uprv_memcpy(result, primstart, uprv_min(resultLength, (primaries-primstart)));
if(terstart != tert) {

View File

@ -8,6 +8,7 @@
#define UCOL_H
#include "unicode/utypes.h"
#include "unicode/unorm.h"
/**
* @name Collator C API
*
@ -105,20 +106,20 @@ typedef void* UCollator;
* @see u_strcoll()
**/
/** Possible values for a comparison result */
enum UCollationResult {
typedef enum {
/** string a == string b */
UCOL_EQUAL = 0,
/** string a > string b */
UCOL_GREATER = 1,
/** string a < string b */
UCOL_LESS = -1
};
typedef enum UCollationResult UCollationResult;
} UCollationResult ;
typedef enum {
/* accepted by most attributes */
UCOL_DEFAULT = -1,
/* for UCOL_STRENGTH */
/** Primary collation strength */
UCOL_PRIMARY = 0,
@ -126,61 +127,33 @@ typedef enum {
UCOL_SECONDARY = 1,
/** Tertiary collation strength */
UCOL_TERTIARY = 2,
/** Default collation strength */
UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
/** Quaternary collation strength */
UCOL_QUATERNARY=3,
/** Identical collation strength */
UCOL_IDENTICAL=15,
/* for UCOL_FRENCH_COLLATION & UCOL_CASE_LEVEL*/
/* for UCOL_FRENCH_COLLATION, UCOL_CASE_LEVEL & UCOL_DECOMPOSITION_MODE*/
UCOL_OFF = 16,
UCOL_ON = 17,
/* for UCOL_ALTERNATE_HANDLING */
UCOL_SHIFTED = 0,
UCOL_NON_IGNORABLE = 1,
UCOL_SHIFTED = 20,
UCOL_NON_IGNORABLE = 21,
/* for UCOL_CASE_FIRST */
UCOL_LOWER_FIRST = 0,
UCOL_UPPER_FIRST = 1,
UCOL_LOWER_FIRST = 24,
UCOL_UPPER_FIRST = 25,
/* for UCOL_NORMALIZATION_MODE */
/** No decomposition/composition */
UCOL_NO_NORMALIZATION = 1,
/** Canonical decomposition */
UCOL_DECOMP_CAN = 2,
/** Compatibility decomposition */
UCOL_DECOMP_COMPAT = 3,
/** Default normalization */
UCOL_DEFAULT_NORMALIZATION = UCOL_DECOMP_COMPAT,
/** Canonical decomposition followed by canonical composition */
UCOL_DECOMP_CAN_COMP_COMPAT = 4,
/** Compatibility decomposition followed by canonical composition */
UCOL_DECOMP_COMPAT_COMP_CAN =5,
/** Default collation strength */
UCOL_ON_WITHOUT_HANGUL = 28,
/** No more attribute values after this*/
UCOL_ATTRIBUTE_VALUE_COUNT
} UColAttributeValue;
/**
* UCOL_NO_NORMALIZATION : Accented characters will not be decomposed for sorting.
* UCOL_DECOMP_CAN : Characters that are canonical variants according
* to Unicode 2.0 will be decomposed for sorting.
* UCOL_DECOMP_COMPAT : Characters that are compatibility variants will be
* decomposed for sorting. This is the default normalization mode used.
* UCOL_DECOMP_CAN_COMP_COMPAT : Canonical decomposition followed by canonical composition
* UCOL_DECOMP_COMPAT_COMP_CAN : Compatibility decomposition followed by canonical composition
*
**/
/** Possible collation normalization modes - see UColAttributeValue for the enum */
typedef UColAttributeValue UNormalizationMode;
/** Possible normalization options */
typedef enum {
/** Do not normalize Hangul */
UCOL_IGNORE_HANGUL = 1
} UNormalizationOption;
/**
* Base letter represents a primary difference. Set comparison
* level to UCOL_PRIMARY to ignore secondary and tertiary differences.
@ -218,91 +191,6 @@ typedef enum {
UCOL_ATTRIBUTE_COUNT
} UColAttribute;
/**
* @name Unicode normalization API
*
* <tt>u_normalize</tt> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <tt>u_normalize</tt> supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Technical Report #15</a>.
* <p>
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character "Á"
* (A-acute). In Unicode, this can be encoded as a single character (the
* "composed" form):
* <pre>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre>
* or as two separate characters (the "decomposed" form):
* <pre>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT</pre>
* <p>
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "Á". When you are searching or
* comparing text, you must ensure that these two sequences are treated
* equivalently. In addition, you must handle characters with more than one
* accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
* <p>
* Similarly, the string "ffi" can be encoded as three separate letters:
* <pre>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I</pre>
* or as the single character
* <pre>
* FB03 LATIN SMALL LIGATURE FFI</pre>
* <p>
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
* into the corresponding semantic characters. When sorting and searching, you
* will often want to use these mappings.
* <p>
* <tt>u_normalize</tt> helps solve these problems by transforming text into the
* canonical composed and decomposed forms as shown in the first example above.
* In addition, you can have it perform compatibility decompositions so that
* you can treat compatibility characters the same as their equivalents.
* Finally, <tt>u_normalize</tt> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
* <p>
* <tt>u_normalize</tt> adds one optional behavior, {@link #UCOL_IGNORE_HANGUL},
* that differs from
* the standard Unicode Normalization Forms.
**/
/**
* Normalize a string.
* The string will be normalized according to the specified normalization mode
* and options.
* @param source The string to normalize.
* @param sourceLength The length of source, or -1 if null-terminated.
* @param mode The normalization mode; one of UCOL_NO_NORMALIZATION,
* UCOL_DECOMP_CAN, UCOL_DECOMP_COMPAT, UCOL_DECOMP_CAN_COMP_COMPAT,
* UCOL_DECOMP_COMPAT_COMP_CAN, UCOL_DEFAULT_NORMALIZATION
* @param options The normalization options, ORed together; possible values
* are UCOL_IGNORE_HANGUL
* @param result A pointer to a buffer to receive the normalized text.
* @param resultLength The maximum size of result.
* @param status A pointer to an UErrorCode to receive any errors
* @return The total buffer size needed; if greater than resultLength,
* the output was truncated.
* @stable
*/
U_CAPI int32_t
u_normalize(const UChar* source,
int32_t sourceLength,
UNormalizationMode mode,
int32_t options,
UChar* result,
int32_t resultLength,
UErrorCode* status);
/**
* Open a UCollator for comparing strings.
* The UCollator may be used in calls to \Ref{ucol_strcoll}.