ICU-96 sortkey algorithm is now in... just compiles for now...

X-SVN-Rev: 3387
2001-01-08 06:51:18 +00:00 · 2001-01-08 06:51:18 +00:00 · 8eb80185f6
commit 8eb80185f6
parent e8272cf29b
3 changed files with 179 additions and 31 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -148,7 +148,7 @@ ucol_openNew(    const    char         *loc,
  /* Do we need a name or other stuff?                                  */
  UCollatorNew *result = NULL;
  UResourceBundle *b = ures_open(NULL, loc, status);
-  UResourceBundle *binary = ures_getByKey(b, "%%Collation", NULL, status);
+  UResourceBundle *binary = ures_getByKey(b, "%%CollationNew", NULL, status);
  if(*status = U_MISSING_RESOURCE_ERROR) { /* if we don't find tailoring, we'll fallback to UCA */
    result = UCA;
@ -409,6 +409,20 @@ uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *sec
  return newStart;
 }
 #define MIN_VALUE 0x02
 #define UNMARKED 0x03
 #define UCOL_VARIABLE_MAX 0x20
 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
  uint8_t temp;
  while(start<end) {
    temp = *start;
    *start++ = *end;
    *end-- = temp;
  }
 }
 int32_t
 ucol_calcSortKeyNew(const    UCollatorNew    *coll,
        const    UChar        *source,
@ -453,7 +467,14 @@ ucol_calcSortKeyNew(const    UCollatorNew    *coll,
    UBool  compareQuad  = (strength >= UCOL_QUATERNARY);
    UBool  compareIdent = (strength == UCOL_IDENTICAL);
    UBool  doCase = (ucol_getAttributeNew(coll, UCOL_CASE_LEVEL, status) == UCOL_ON);
-    UBool  lowerFirst = (ucol_getAttributeNew(coll, UCOL_CASE_FIRST, status) == UCOL_LOWER_FIRST);
+    UBool  upperFirst = (ucol_getAttributeNew(coll, UCOL_CASE_FIRST, status) == UCOL_UPPER_FIRST);
    UBool  shifted = (ucol_getAttributeNew(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED);
    UBool  isFrenchSec = (ucol_getAttributeNew(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
    /* support for special features like caselevel and funky secondaries */
    uint8_t *frenchStartPtr = NULL;
    uint8_t *frenchEndPtr = NULL;
    uint32_t caseShift = 0;
    sortKeySize += ((compareSec?1:0) + (compareTer?1:0) + (doCase?1:0) + (compareQuad?1:0) + (compareIdent?1:0));
@ -490,8 +511,12 @@ ucol_calcSortKeyNew(const    UCollatorNew    *coll,
    uint8_t *quadStart = quads;
    uint32_t order = 0;
    uint32_t ce = 0;
    uint16_t primary = 0;
    uint8_t primary1 = 0;
    uint8_t primary2 = 0;
    uint8_t primary3 = 0;
    uint8_t secondary = 0;
    uint8_t tertiary = 0;
@ -510,10 +535,119 @@ ucol_calcSortKeyNew(const    UCollatorNew    *coll,
                break;
            }
-            primary = ((order & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT);
+            /* We're saving order in ce, since we will destroy order in order to get primary, secondary, tertiary in order ;)*/
-            secondary = ((order & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT);
+            ce = order;
            tertiary = (order & UCOL_TERTIARYORDERMASK);
            tertiary = (order & UCOL_TERTIARYORDERMASK);
            secondary = (order >>= 8) & 0xFF;
            primary3 = 0; /* the third primary */
            primary2 = (order >>= 8) & 0xFF;;
            primary1 = order >>= 8;
            if((tertiary & 0xF0) == 0xF0) { /* This indicates a long primary (11110000) */
              /* Note: long primary can appear both as a normal CE or as a continuation CE (not that it matters much) */
              primary3 = secondary;
              secondary = (tertiary & 0x0F) + MIN_VALUE;
              tertiary = UNMARKED;
            }
            if(upperFirst && !(isContinuation(ce))) { 
              /* Upper cases have this bit turned on, so that they always come after the lower cases */
              /* if we want to reverse this situation, we'll flip this bit */
              tertiary ^= 0x80;
            }
            /* In the code below, every increase in any of buffers is followed by the increase to  */
            /* sortKeySize - this might look tedious, but it is needed so that we can find out if  */
            /* we're using too much space and need to reallocate the primary buffer or easily bail */
            /* out to ucol_getSortKeySizeNew.                                                      */
            if(shifted && primary1 < UCOL_VARIABLE_MAX && primary1 > 0) { 
              /* We are dealing with a variable and we're treating them as shifted */
              /* This is a shifted ignorable */
              *quads++ = primary1;
              sortKeySize++;
              if(primary2 != 0) {
                *quads++ = primary2;
                sortKeySize++;
              }
            } else {
              /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
              /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
              /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
              if(primary1 != 0) {
                *primaries++ = primary1; /* scriptOrder[primary1]; */ /* This is the script ordering thingie */
                sortKeySize++;
              }               
              if(primary2 != 0) {
                *primaries++ = primary2; /* second part */
                sortKeySize++;
              }
              if(primary3 != 0) {
                *primaries++ = primary2; /* third part */
                sortKeySize++;
              }
              if(compareSec && secondary != 0) { /* I think that != 0 test should be != IGNORABLE */
                /* This thing should also contain the compression logic, as in: */
                /*
                  if (ws  == COMMON2 && COMMON2  <= secondary[-1] && secondary[-1] < COMMON_MAX2)
                      ++secondary[-1]; // simply increment!!
                  else *secondary++ = ws;
                */
                *secondaries++ = secondary;
                sortKeySize++;
                if(isFrenchSec) {
                  /* Do the special handling for French secondaries */
                  /* We need to get continuation elements and do intermediate restore */
                  /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                  if(isContinuation(ce)) {
                    if (frenchStartPtr == NULL) {
                      frenchStartPtr = secondaries - 2;
                    }
                    frenchEndPtr = secondaries-1;
                  } else if (frenchStartPtr != NULL) {
                      /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                    uprv_ucol_reverse_buffer(frenchStartPtr, frenchEndPtr);
                    frenchStartPtr = NULL;
                  }
                }
              }
              if(doCase) {
                if (caseShift  == 0) {
                  *cases++ = 0x80;
                  sortKeySize++;
                  caseShift = 7;
                }
                *(cases-1) |= (tertiary & 0x80) >> (8-caseShift--);
              }
              if(compareTer && tertiary != 0) { /* I think that != 0 test should be != IGNORABLE */
                /* This thing should also contain the compression logic, as in: */
                /*
                  if (ws  == COMMON2 && COMMON2  <= secondary[-1] && secondary[-1] < COMMON_MAX2)
                      ++secondary[-1]; // simply increment!!
                  else *secondary++ = ws;
                */
                *tertiaries++ = tertiary;
                sortKeySize++;
              }
              if(compareQuad && shifted && primary1 > 0) {
                *quads++ = 0xFF;
                sortKeySize++;
              }
            }
 /* This is an old peace of code... I'm leaving it here just for discussion regarding */
 /* ignorables and situations with primary ignorable vs. variable top and ignorables  */            
 #if 0
 /*
            if(primary != UCOL_PRIMIGNORABLE) {
                *(primaries++) = (primary>>8);
                *(primaries++) = (primary&0xFF);
@ -541,26 +675,27 @@ ucol_calcSortKeyNew(const    UCollatorNew    *coll,
                    sortKeySize++;
                }
            }
-            if(sortKeySize>resultLength) {
+*/
-                if(allocatePrimary == FALSE) {
+#endif
-                    resultOverflow = TRUE;
+            if(sortKeySize>resultLength) { /* We have stepped over the primary buffer */
-                    sortKeySize = ucol_getSortKeySizeNew(coll, &s, sortKeySize, strength, len);
+              if(allocatePrimary == FALSE) { /* need to save our butts if we cannot reallocate */
-                    *status = U_MEMORY_ALLOCATION_ERROR;
+                resultOverflow = TRUE;
-                    finished = TRUE;
+                sortKeySize = ucol_getSortKeySizeNew(coll, &s, sortKeySize, strength, len);
-                    break;
+                *status = U_MEMORY_ALLOCATION_ERROR;
-                    /*goto cleanup;*/
+                finished = TRUE;
-                } else {
+                break;
-                    uint8_t *newStart;
+              } else { /* It's much nicer if we can actually reallocate */
-                    newStart = (uint8_t *)uprv_realloc(primStart, 2*sortKeySize);
+                uint8_t *newStart;
-                    if(primStart == NULL) {
+                newStart = (uint8_t *)uprv_realloc(primStart, 2*sortKeySize);
-                        *status = U_MEMORY_ALLOCATION_ERROR;
+                if(primStart == NULL) {
-                        finished = TRUE;
+                  *status = U_MEMORY_ALLOCATION_ERROR;
-                        break;
+                  finished = TRUE;
-                    }
+                  break;
                    primaries=newStart+(primaries-primStart);
                    resultLength = 2*sortKeySize;
                    primStart = *result = newStart;
                }
                primaries=newStart+(primaries-primStart);
                resultLength = 2*sortKeySize;
                primStart = *result = newStart;
              }
            }
        }
        if(finished) {
@ -580,14 +715,18 @@ ucol_calcSortKeyNew(const    UCollatorNew    *coll,
      if(compareSec) {
        *(primaries++) = UCOL_LEVELTERMINATOR;
        uint32_t secsize = secondaries-secStart;
-        if(ucol_getAttributeNew(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON) { // do the reverse copy
+        if(isFrenchSec) { /* do the reverse copy */
-            for(i = 0; i<secsize; i++) {
+          /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
-                *(primaries++) = *(secondaries-i-1);
+          if(frenchStartPtr != NULL) { 
-            }
+            uprv_ucol_reverse_buffer(frenchStartPtr, frenchEndPtr);
-          } else { 
+          }           
-              uprv_memcpy(primaries, secStart, secsize); 
+          for(i = 0; i<secsize; i++) {
-              primaries += secsize;
+              *(primaries++) = *(secondaries-i-1);
          }
        } else { 
          uprv_memcpy(primaries, secStart, secsize); 
          primaries += secsize;
        }
      }
--- a/icu4c/source/i18n/ucolimp.h
+++ b/icu4c/source/i18n/ucolimp.h
@ -234,7 +234,9 @@ ucol_cloneRuleData(UCollator *coll, int32_t *length, UErrorCode *status);
 #define UCOL_THAI 0xF3000000
 #define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF)
 #define isContinuation(CE) isSpecial((CE))
 #define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT)
 #define isContraction(CE) (isSpecial((CE)) && (getCETag((CE)) == CONTRACTION_TAG))
 #define constructContractCE(CE) (UCOL_SPECIAL_FLAG | (CONTRACTION_TAG<<UCOL_TAG_SHIFT) | ((CE))&0xFFFFFF)
 #define getContractOffset(CE) ((CE)&0xFFFFFF)
 #define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4)
--- a/icu4c/source/i18n/unicode/ucol.h
+++ b/icu4c/source/i18n/unicode/ucol.h
@ -233,6 +233,10 @@ U_CAPI UCollator*
 ucol_open(    const    char         *loc,
        UErrorCode      *status);
 U_CAPI UCollatorNew*
 ucol_openNew(    const    char         *loc,
        UErrorCode      *status);
 /**
 * Open a UCollator for comparing strings.
 * The UCollator may be used in calls to \Ref{ucol_strcoll}.
@ -264,6 +268,9 @@ ucol_openRules(    const    UChar        *rules,
 U_CAPI void
 ucol_close(UCollator *coll);
 U_CAPI void
 ucol_closeNew(UCollatorNew *coll);
 /**
 * Compare two strings.
 * The strings will be compared using the normalization mode and options