ICU-96 more collation cleanup, plus moving normalization C API

X-SVN-Rev: 3146
2000-12-06 00:53:48 +00:00 · 2000-12-06 00:53:48 +00:00 · eb03d8dab2
commit eb03d8dab2
parent 82e011125e
3 changed files with 63 additions and 211 deletions
--- a/icu4c/source/i18n/tblcoll.cpp
+++ b/icu4c/source/i18n/tblcoll.cpp
@ -1579,14 +1579,16 @@ RuleBasedCollator::compare(const UnicodeString& source,
 	UChar *uTarget = uTstart;
 	uint32_t sourceLen = source.length();
 	uint32_t targetLen = target.length();
-	if(sourceLen > tblcoll_StackBufferLen) {
-		uSource = new UChar[sourceLen];
+	if(sourceLen >= tblcoll_StackBufferLen) {
+		uSource = new UChar[sourceLen+1];
 	}
-	if(targetLen > tblcoll_StackBufferLen) {
-		uTarget = new UChar[targetLen];
+	if(targetLen >= tblcoll_StackBufferLen) {
+		uTarget = new UChar[targetLen+1];
 	}
    source.extract(0, sourceLen, uSource);
+    uSource[sourceLen] = 0;
    target.extract(0, targetLen, uTarget);
+    uTarget[targetLen] = 0;
 	Collator::EComparisonResult result = compare(uSource, sourceLen, uTarget, targetLen);

 	if(uSstart != uSource) {
@ -1639,10 +1641,11 @@ RuleBasedCollator::getCollationKey( const   UnicodeString&  source,
 	UChar sStart[tblcoll_StackBufferLen];
 	UChar *uSource = sStart;
 	uint32_t sourceLen = source.length();
-	if(sourceLen > tblcoll_StackBufferLen) {
-		uSource = new UChar[sourceLen];
+	if(sourceLen >= tblcoll_StackBufferLen) {
+		uSource = new UChar[sourceLen+1];
 	}
    source.extract(0, sourceLen, uSource);
+    uSource[sourceLen] = 0;
 	CollationKey& result = RuleBasedCollator::getCollationKey(uSource, sourceLen, sortkey, status);
 	if(sStart != uSource) {
 		delete[] uSource;
@ -2992,10 +2995,11 @@ int32_t RuleBasedCollator::getSortKey(const   UnicodeString&  source,
 	UChar sStart[tblcoll_StackBufferLen];
 	UChar *uSource = sStart;
 	uint32_t sourceLen = source.length();
-	if(sourceLen > tblcoll_StackBufferLen) {
-		uSource = new UChar[sourceLen];
+	if(sourceLen >= tblcoll_StackBufferLen) {
+		uSource = new UChar[sourceLen+1];
 	}
    source.extract(0, sourceLen, uSource);
+    uSource[sourceLen] = 0;
 	int32_t resLen = ucol_getSortKey((UCollator *)this, uSource, sourceLen, result, resultLength);
 	if(sStart != uSource) {
 		delete[] uSource;
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -14,6 +14,8 @@
 #include "unicode/ustring.h"
 #include "unicode/normlzr.h"
 #include "cpputils.h"
+
+
 static uint8_t utf16fixup[32] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
@ -46,7 +48,7 @@ struct collIterate {

 #define UCOL_UNMAPPEDCHARVALUE 0x7fff0000     // from coleiterator

-#define UCOL_LEVELTERMINATOR 0
+#define UCOL_LEVELTERMINATOR 1
 #define UCOL_IGNORABLE 0x0000
 #define UCOL_CHARINDEX 0x70000000             // need look up in .commit()
 #define UCOL_EXPANDCHARINDEX 0x7E000000       // Expand index follows
@ -65,7 +67,7 @@ struct collIterate {
 #define UCOL_SECONDARYDIFFERENCEONLY 0xffffff00  // use only the primary and secondary difference
 #define UCOL_PRIMARYORDERSHIFT 16             // primary order shift
 #define UCOL_SECONDARYORDERSHIFT 8            // secondary order shift
-#define UCOL_SORTKEYOFFSET 1                  // minimum sort key offset
+#define UCOL_SORTKEYOFFSET 2                  // minimum sort key offset
 #define UCOL_CONTRACTCHAROVERFLOW 0x7FFFFFFF  // Indicates the char is a contract char

 #define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT)
@ -83,48 +85,6 @@ struct collIterate {
 */
 #define UCOL_ISTHAIBASECONSONANT(ch) ((uint32_t)(ch) - 0xe01) <= (0xe2e - 0xe01)

-U_CAPI int32_t
-u_normalize(const UChar*            source,
-        int32_t                 sourceLength, 
-        UNormalizationMode      mode, 
-        int32_t                 option,
-        UChar*                  result,
-        int32_t                 resultLength,
-        UErrorCode*             status)
-{
-  if(U_FAILURE(*status)) return -1;
-
-  Normalizer::EMode normMode;
-  switch(mode) {
-  case UCOL_NO_NORMALIZATION:
-    normMode = Normalizer::NO_OP;
-    break;
-  case UCOL_DECOMP_CAN:
-    normMode = Normalizer::DECOMP;
-    break;
-  case UCOL_DECOMP_COMPAT:
-    normMode = Normalizer::DECOMP_COMPAT;
-    break;
-  case UCOL_DECOMP_CAN_COMP_COMPAT:
-    normMode = Normalizer::COMPOSE;
-    break;
-  case UCOL_DECOMP_COMPAT_COMP_CAN:
-    normMode = Normalizer::COMPOSE_COMPAT;
-    break;
-  default:
-    *status = U_ILLEGAL_ARGUMENT_ERROR;
-    return -1;
-  }
-
-  int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
-  const UnicodeString src((UChar*)source, len, len);
-  UnicodeString dst(result, 0, resultLength);
-  Normalizer::normalize(src, normMode, option, dst, *status);
-  int32_t actualLen;
-  T_fillOutputParams(&dst, result, resultLength, &actualLen, status);
-  return actualLen;
-}
-
 U_CAPI UCollator*
 ucol_open(    const    char         *loc,
        UErrorCode      *status)
@ -420,23 +380,24 @@ int32_t getComplicatedCE(const UCollator *coll, collIterate *source, UErrorCode
 				EntryPair *pair = (EntryPair *)list->at(0); // Taking out the first one.
 				int32_t order = pair->value; // This got us mapping for just the first element - the one that signalled a contraction.

-				key[posKey++] = *(source->pos);
+				key[posKey++] = *(source->pos++);
 				// This tries to find the longes common match for the data in contraction table...
 				// and needs to be rewritten, especially the test down there!
 				int32_t i;
                int32_t listSize = list->size();
 				UBool foundSmaller = TRUE;
 				while(source->pos<source->len && foundSmaller) {
-
-					key[posKey++] = *(++source->pos);
+					key[posKey++] = *source->pos;

 					foundSmaller = FALSE;
 					i = 0;
 					while(i<listSize && !foundSmaller) {
 						pair = list->at(i);
-						if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
-							order = pair->value;
-							foundSmaller = TRUE;
+                        if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) { 
+                            /* Found a matching contraction sequence */
+                            order = pair->value; /* change the CE value */
+                            source->pos++;       /* consume another char from the source */
+							foundSmaller = TRUE; 
 						}
 						i++;

@ -520,7 +481,7 @@ struct incrementalContext {
 };


-void init_incrementalContext(UCharForwardIterator *source, void *sourceContext, incrementalContext *s, UBool isWritable) {
+void init_incrementalContext(UCharForwardIterator *source, void *sourceContext, incrementalContext *s) {
    s->len = s->pos = s->string ;
    s->CEpos = s->toReturn = s->CEs;
    s->source = source;
@ -588,9 +549,9 @@ int32_t ucol_getIncrementalCE(const UCollator *coll, incrementalContext *ctx, UE
                int32_t listSize = list->size();
 				UBool foundSmaller = TRUE;
                UBool endOfString = FALSE;
+                *(ctx->len++) = ctx->lastChar;
 				while(!endOfString && foundSmaller) {
                    endOfString = ((ctx->lastChar = ctx->source(ctx->sourceContext)) == 0xFFFF);
-                    *(ctx->len++) = ctx->lastChar;
 					key[posKey++] = ctx->lastChar;

 					foundSmaller = FALSE;
@ -599,13 +560,13 @@ int32_t ucol_getIncrementalCE(const UCollator *coll, incrementalContext *ctx, UE
 						pair = list->at(i);
 						if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->equalTo(key, posKey))) {
 							order = pair->value;
+                            *(ctx->len++) = ctx->lastChar;
 							foundSmaller = TRUE;
 						}
 						i++;

 					}
 				}
-				//*(ctx->CEpos) = order;
 			}
    }
 	// Expansion sequence start...
@ -654,8 +615,8 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,

    incrementalContext sColl, tColl;

-    init_incrementalContext(source, sourceContext, &sColl, FALSE);
-    init_incrementalContext(target, targetContext, &tColl, FALSE);
+    init_incrementalContext(source, sourceContext, &sColl);
+    init_incrementalContext(target, targetContext, &tColl);

    if(cppColl->getDecomposition() != Normalizer::NO_OP) { // run away screaming!!!!
        return alternateIncrementalProcessing(coll, &sColl, &tColl);
@ -667,7 +628,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
    }

    UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
-    int32_t sOrder, tOrder;
+    uint32_t sOrder=UCOL_NULLORDER, tOrder=UCOL_NULLORDER;
    uint32_t pSOrder, pTOrder;
    UBool gets = TRUE, gett = TRUE;
    UBool initialCheckSecTer = strength  >= UCOL_SECONDARY;
@ -881,7 +842,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
                sOrder = ucol_getIncrementalCE(coll, &sColl, &status);
                *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
            }
-
+ 
            gets = TRUE;

            if (gett)
@ -1072,7 +1033,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
        }
    }

-
+ 
    // For IDENTICAL comparisons, we use a bitwise character comparison
    // as a tiebreaker if all else is equal
    // NOTE: The java code compares result with 0, and 
@ -1150,7 +1111,7 @@ ucol_strcoll(    const    UCollator    *coll,
    }

    UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
-    int32_t sOrder, tOrder;
+    uint32_t sOrder=UCOL_NULLORDER, tOrder=UCOL_NULLORDER;
    uint32_t pSOrder, pTOrder;
    UBool gets = TRUE, gett = TRUE;
    UBool initialCheckSecTer = strength  >= UCOL_SECONDARY;
@ -1625,16 +1586,6 @@ ucol_getSortKey(const    UCollator    *coll,
        int32_t        resultLength)
 {

-    /* 
-    Still problems in:
-    SUMMARY:
-        ******* [Total error count:     213]
-         Errors in
-           [tscoll/capitst/TestSortKey]  // this is normal, since we are changing binary keys
-           [tscoll/cfrtst/TestSecondary] // this is also OK, ICU original implementation was messed up
-           [tscoll/cfrtst/TestTertiary]  // probably the same as above
-    */
-
    uint32_t i = 0; // general purpose counter

 	UErrorCode status = U_ZERO_ERROR;
@ -1647,6 +1598,15 @@ ucol_getSortKey(const    UCollator    *coll,
    UChar *normSource = normBuffer;
    int32_t normSourceLen = 2048;

+    for(i = 0; i<UCOL_MAX_BUFFER; i++) {
+        prim[i]=second[i]=tert[i]='\0';
+    }
+
+    for(i = UCOL_MAX_BUFFER; i<2*UCOL_MAX_BUFFER; i++) {
+        prim[i]=normBuffer[i]='\0';
+    }
+
+
 	int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);

    UBool  compareSec   = (((RuleBasedCollator *)coll)->getStrength() >= Collator::SECONDARY);
@ -1667,7 +1627,7 @@ ucol_getSortKey(const    UCollator    *coll,
    uint8_t *secstart = secondaries;
    uint8_t *terstart = tertiaries;

-	collIterate s;
+   collIterate s;
   init_collIterate((UChar *)source, len, &s, FALSE);

    // If we need to normalize, we'll do it all at once at the beggining!
@ -1687,7 +1647,7 @@ ucol_getSortKey(const    UCollator    *coll,
 		s.len = normSource+normSourceLen;
 	}

-    int32_t order = 0;
+    uint32_t order = 0;

    uint16_t primary = 0;
    uint8_t secondary = 0;
@ -1700,8 +1660,8 @@ ucol_getSortKey(const    UCollator    *coll,
        tertiary = (order & UCOL_TERTIARYORDERMASK);

        if(primary != UCOL_IGNORABLE) {
-            *(primaries++) = (primary+UCOL_SORTKEYOFFSET)>>8;
-            *(primaries++) = (primary+UCOL_SORTKEYOFFSET)&0xFF;
+            *(primaries++) = (primary>>8)+UCOL_SORTKEYOFFSET;
+            *(primaries++) = (primary&0xFF)+UCOL_SORTKEYOFFSET;
            if(compareSec) {
                *(secondaries++) = secondary+UCOL_SORTKEYOFFSET;
            }
@ -1719,11 +1679,10 @@ ucol_getSortKey(const    UCollator    *coll,
        UCOL_GETNEXTCE(order, coll, s, status);
    }

-    *(primaries++) = UCOL_LEVELTERMINATOR;
-    *(primaries++) = UCOL_LEVELTERMINATOR;


    if(compareSec) {
+    *(primaries++) = UCOL_LEVELTERMINATOR;
      uint32_t secsize = secondaries-secstart;
      if(ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, &status) == UCOL_ON) { // do the reverse copy
          for(i = 0; i<secsize; i++) {
@ -1734,27 +1693,28 @@ ucol_getSortKey(const    UCollator    *coll,
            primaries += secsize;
        }

-        *(primaries++) = UCOL_LEVELTERMINATOR;
    }

    if(compareTer) {
+        *(primaries++) = UCOL_LEVELTERMINATOR;
      uint32_t tersize = tertiaries - terstart;
      uprv_memcpy(primaries, terstart, tersize);
      primaries += tersize;
-      *(primaries++) = UCOL_LEVELTERMINATOR;
    }


    if(compareIdent) {
+      *(primaries++) = UCOL_LEVELTERMINATOR;
 		UChar *ident = s.string;
 		while(ident < s.len) {
          *(primaries++) = (*(ident) >> 8) + utf16fixup[*(ident) >> 11];
          *(primaries++) = (*(ident) & 0xFF);
 		  ident++;
      }
-      *(primaries++) = UCOL_LEVELTERMINATOR;
    }

+    *(primaries++) = '\0';
+
    uprv_memcpy(result, primstart, uprv_min(resultLength, (primaries-primstart)));

    if(terstart != tert) {
--- a/icu4c/source/i18n/unicode/ucol.h
+++ b/icu4c/source/i18n/unicode/ucol.h
@ -8,6 +8,7 @@
 #define UCOL_H

 #include "unicode/utypes.h"
+#include "unicode/unorm.h"
 /**
 * @name Collator C API
 *
@ -105,20 +106,20 @@ typedef void* UCollator;
     * @see u_strcoll()
     **/
 /** Possible values for a comparison result */
-enum UCollationResult {
+typedef enum {
  /** string a == string b */
  UCOL_EQUAL    = 0,
  /** string a > string b */
  UCOL_GREATER    = 1,
  /** string a < string b */
  UCOL_LESS    = -1
-};
-typedef enum UCollationResult UCollationResult;
+} UCollationResult ;


 typedef enum {
  /* accepted by most attributes */
  UCOL_DEFAULT = -1,
+
  /* for UCOL_STRENGTH */
  /** Primary collation strength */
  UCOL_PRIMARY = 0,
@ -126,61 +127,33 @@ typedef enum {
  UCOL_SECONDARY = 1,
  /** Tertiary collation strength */
  UCOL_TERTIARY = 2,
+  /** Default collation strength */
  UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
  /** Quaternary collation strength */
  UCOL_QUATERNARY=3,
  /** Identical collation strength */
  UCOL_IDENTICAL=15,

-  /* for UCOL_FRENCH_COLLATION & UCOL_CASE_LEVEL*/
+  /* for UCOL_FRENCH_COLLATION, UCOL_CASE_LEVEL & UCOL_DECOMPOSITION_MODE */
  UCOL_OFF = 16,
  UCOL_ON = 17,
  
  /* for UCOL_ALTERNATE_HANDLING */
-  UCOL_SHIFTED = 0,
-  UCOL_NON_IGNORABLE = 1,
+  UCOL_SHIFTED = 20,
+  UCOL_NON_IGNORABLE = 21,

  /* for UCOL_CASE_FIRST */
-  UCOL_LOWER_FIRST = 0,
-  UCOL_UPPER_FIRST = 1,
+  UCOL_LOWER_FIRST = 24,
+  UCOL_UPPER_FIRST = 25,

  /* for UCOL_NORMALIZATION_MODE */
-  /** No decomposition/composition */
-  UCOL_NO_NORMALIZATION = 1,
-  /** Canonical decomposition */
-  UCOL_DECOMP_CAN = 2,
-  /** Compatibility decomposition */
-  UCOL_DECOMP_COMPAT = 3,
-  /** Default normalization */
-  UCOL_DEFAULT_NORMALIZATION = UCOL_DECOMP_COMPAT, 
-  /** Canonical decomposition followed by canonical composition */
-  UCOL_DECOMP_CAN_COMP_COMPAT = 4,
-  /** Compatibility decomposition followed by canonical composition */
-  UCOL_DECOMP_COMPAT_COMP_CAN =5,
-  /** Default collation strength */
+  UCOL_ON_WITHOUT_HANGUL = 28,
+
+  /** Sentinel: must remain the last enumerator; add no attribute values after it */
  UCOL_ATTRIBUTE_VALUE_COUNT

 } UColAttributeValue;

-  /**
-    * UCOL_NO_NORMALIZATION : Accented characters will not be decomposed for sorting.  
-    * UCOL_DECOM_CAN          : Characters that are canonical variants according 
-    * to Unicode 2.0 will be decomposed for sorting. 
-    * UCOL_DECOMP_COMPAT    : Characters that are compatibility variants will be
-    * decomposed for sorting. This is the default normalization mode used.
-    * UCOL_DECOMP_CAN_COMP_COMPAT : Canonical decomposition followed by canonical composition 
-    * UCOL_DECOMP_COMPAT_COMP_CAN : Compatibility decomposition followed by canonical composition
-    *
-    **/
-/** Possible collation normalization modes  - see UColAttributeValue for the enum */
-typedef UColAttributeValue UNormalizationMode;
-
-/** Possible normalization options */
-typedef enum {
-  /** Do not normalize Hangul */
-  UCOL_IGNORE_HANGUL    = 1
-} UNormalizationOption;
-
    /**
     * Base letter represents a primary difference.  Set comparison
     * level to UCOL_PRIMARY to ignore secondary and tertiary differences.
@ -218,91 +191,6 @@ typedef enum {
     UCOL_ATTRIBUTE_COUNT
 } UColAttribute;

-/**
- * @name Unicode normalization API
- *
- * <tt>u_normalize</tt> transforms Unicode text into an equivalent composed or
- * decomposed form, allowing for easier sorting and searching of text.
- * <tt>u_normalize</tt> supports the standard normalization forms described in
- * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
- * Unicode Technical Report #15</a>.
- * <p>
- * Characters with accents or other adornments can be encoded in
- * several different ways in Unicode.  For example, take the character "Á"
- * (A-acute).   In Unicode, this can be encoded as a single character (the
- * "composed" form):
- * <pre>
- *      00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
- * or as two separate characters (the "decomposed" form):
- * <pre>
- *      0041    LATIN CAPITAL LETTER A
- *      0301    COMBINING ACUTE ACCENT</pre>
- * <p>
- * To a user of your program, however, both of these sequences should be
- * treated as the same "user-level" character "Á".  When you are searching or
- * comparing text, you must ensure that these two sequences are treated 
- * equivalently.  In addition, you must handle characters with more than one
- * accent.  Sometimes the order of a character's combining accents is
- * significant, while in other cases accent sequences in different orders are
- * really equivalent.
- * <p>
- * Similarly, the string "ffi" can be encoded as three separate letters:
- * <pre>
- *      0066    LATIN SMALL LETTER F
- *      0066    LATIN SMALL LETTER F
- *      0069    LATIN SMALL LETTER I</pre>
- * or as the single character
- * <pre>
- *      FB03    LATIN SMALL LIGATURE FFI</pre>
- * <p>
- * The ffi ligature is not a distinct semantic character, and strictly speaking
- * it shouldn't be in Unicode at all, but it was included for compatibility
- * with existing character sets that already provided it.  The Unicode standard
- * identifies such characters by giving them "compatibility" decompositions
- * into the corresponding semantic characters.  When sorting and searching, you
- * will often want to use these mappings.
- * <p>
- * <tt>u_normalize</tt> helps solve these problems by transforming text into the
- * canonical composed and decomposed forms as shown in the first example above.  
- * In addition, you can have it perform compatibility decompositions so that 
- * you can treat compatibility characters the same as their equivalents.
- * Finally, <tt>u_normalize</tt> rearranges accents into the proper canonical
- * order, so that you do not have to worry about accent rearrangement on your
- * own.
- * <p>
- * <tt>u_normalize</tt> adds one optional behavior, {@link #UCOL_IGNORE_HANGUL},
- * that differs from
- * the standard Unicode Normalization Forms. 
- **/
- 
- 
-/**
- * Normalize a string.
- * The string will be normalized according the the specified normalization mode
- * and options.
- * @param source The string to normalize.
- * @param sourceLength The length of source, or -1 if null-terminated.
- * @param mode The normalization mode; one of UCOL_NO_NORMALIZATION, 
- * UCOL_CAN_DECOMP, UCOL_COMPAT_DECOMP, UCOL_CAN_DECOMP_COMPAT_COMP, 
- * UCOL_COMPAT_DECOMP_CAN_COMP, UCOL_DEFAULT_NORMALIZATION
- * @param options The normalization options, ORed together; possible values
- * are UCOL_IGNORE_HANGUL
- * @param result A pointer to a buffer to receive the attribute.
- * @param resultLength The maximum size of result.
- * @param status A pointer to an UErrorCode to receive any errors
- * @return The total buffer size needed; if greater than resultLength,
- * the output was truncated.
- * @stable
- */
-U_CAPI int32_t
-u_normalize(const UChar*           source,
-        int32_t                 sourceLength, 
-        UNormalizationMode      mode, 
-        int32_t            options,
-        UChar*                  result,
-        int32_t                 resultLength,
-        UErrorCode*             status);    
-
 /**
 * Open a UCollator for comparing strings.
 * The UCollator may be used in calls to \Ref{ucol_strcoll}.