ICU-861

Added normalization to contraction. Note this is not totally working yet. X-SVN-Rev: 4538
2001-04-24 03:18:54 +00:00 · 2001-04-24 03:18:54 +00:00 · 662d4ab558
commit 662d4ab558
parent 4c635dfeca
1 changed files with 234 additions and 34 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -603,9 +603,6 @@ void ucol_initUCA(UErrorCode *status) {
  }
 }
 /*    collIterNormalize     Incremental Normalization happens here.                       */
 /*                          pick up the range of chars identifed by FCD,                  */
 /*                          normalize it into the collIterate's writable buffer,          */
@ -620,6 +617,15 @@ void collIterNormalize(collIterate *collationSource)
    normLen = unorm_normalize(srcP, endP-srcP, UNORM_NFD, 0, collationSource->writableBuffer,
                              collationSource->writableBufSize, &status);
    if (normLen == collationSource->writableBufSize) {
        UChar *temp = (UChar *)uprv_malloc((normLen+1)*sizeof(UChar));
        uprv_memcpy(temp, collationSource->writableBuffer, normLen * sizeof(UChar));
        temp[normLen] = 0;
        if (collationSource->writableBuffer != collationSource->stackWritableBuffer) {
            uprv_free( collationSource->writableBuffer);
        }
        collationSource->writableBuffer = temp;
    }
    if (U_FAILURE(status)) { /* This would be buffer overflow */
        if (status == U_BUFFER_OVERFLOW_ERROR) {
            if (collationSource->writableBuffer != collationSource->stackWritableBuffer) {
@ -636,6 +642,7 @@ void collIterNormalize(collIterate *collationSource)
            return;
        }
    }
    collationSource->pos        = collationSource->writableBuffer;
    collationSource->origFlags  = collationSource->flags;
    collationSource->flags     |= UCOL_ITER_INNORMBUF;
@ -656,7 +663,7 @@ void collIterNormalize(collIterate *collationSource)
 /*          the trailing combining class of the previous char was zero.                   */
 /*          True because the previous call to this function will have always exited       */
 /*          that way, and we get called for every char where cc might be non-zero.        */
-inline void collIterFCD(collIterate *collationSource) {
+inline UBool collIterFCD(collIterate *collationSource) {
    UChar32     codepoint;
    UChar       *srcP;
    int         length;
@ -710,22 +717,16 @@ inline void collIterFCD(collIterate *collationSource) {
        }
    }
-    collationSource->fcdPosition = srcP + count;
+    collationSource->fcdPosition = srcP + (count - 1);
    if (codepoint == 0 && (collationSource->flags & UCOL_ITER_HASLEN)==0) {
        // We checked the string's trailing null, which would advance fcdPosition past the null.
        //   back it up to point to the null.
        collationSource->fcdPosition--;
    }
-    if (needNormalize) {
+    return needNormalize;
        collIterNormalize(collationSource);
    }
 }
 /****************************************************************************/
 /* Following are the CE retrieval functions                                 */
 /*                                                                          */
@ -827,7 +828,9 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
        }
        // Need a more complete FCD check and possible normalization.
-        collIterFCD(collationSource);
+        if (collIterFCD(collationSource)) {
            collIterNormalize(collationSource);
        }
        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
            //  No normalization was needed.  Go ahead and process the char we already had.
            break;
@ -875,7 +878,7 @@ U_CAPI uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSour
 void collPrevIterNormalize(collIterate *data)
 {
    UErrorCode status  = U_ZERO_ERROR;
-    UChar      *pEnd   = data->pos + 1;         /* End normalize + 1 */
+    UChar      *pEnd   = data->pos;         /* End normalize + 1 */
    UChar      *pStart;
    uint32_t    normLen;
    UChar      *pStartNorm;
@ -888,8 +891,9 @@ void collPrevIterNormalize(collIterate *data)
        pStart = data->fcdPosition + 1;
    }
-    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0,
+    normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
                              data->writableBuffer, 0, &status);
    if (data->writableBufSize <= normLen) {
            if (data->writableBuffer != data->stackWritableBuffer) {
                uprv_free( data->writableBuffer);
@ -906,8 +910,8 @@ void collPrevIterNormalize(collIterate *data)
    */
    pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
    *(pStartNorm - 1) = 0;
-    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, 
+    unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm, 
-                    &status);
+                    normLen, &status);
    data->pos        = data->writableBuffer + data->writableBufSize;
    data->origFlags  = data->flags;
@ -1326,13 +1330,14 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
        //*(collationSource->CEpos++) = 0x04000080 | (ch & 0x001F) << 27;
      }
-      // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
+      /* 
-      // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
+      we must skip all 00, 01, 02 bytes, so most bytes have 253 values
-      // we shift so that HAN all has the same first primary, for compression.
+      we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
-      // for the 4 byte case, we make the gap as large as we can fit.
+      we shift so that HAN all has the same first primary, for compression.
-      // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
+      for the 4 byte case, we make the gap as large as we can fit.
-      // Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
+      Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
-      
+      Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
      */
      int32_t last0 = cp - IMPLICIT_BOUNDARY_;
      uint32_t r = 0;
@ -1527,12 +1532,200 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
  return order; /* return the CE */
 }
 /**
 * Inserts the argument character into the end of the buffer pushing back the 
 * null terminator.
 * @param data collation element iterator data
 * @param ch character to be appended
 */
 inline void insertBufferEnd(collIterate *data, UChar ch) 
 {
          uint32_t  size    = data->writableBufSize;
          uint32_t  strlen  = u_strlen(data->writableBuffer);
          UChar    *newbuffer;
    const uint32_t  incsize = 5;
    if (size > strlen) {
        UChar *end = data->writableBuffer + strlen;
        *end = ch;
        *(end + 1) = 0;
        return;
    }
    /* 
    buffer will always be null terminated at the end.
    giving extra space since it is likely that more characters will be added.
    */
    size += incsize;
    newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
    uprv_memcpy(newbuffer, data->writableBuffer, 
                data->writableBufSize * sizeof(UChar));
    newbuffer        = newbuffer + data->writableBufSize;
    *newbuffer       = ch;
    *(newbuffer + 1) = 0;
    if (data->writableBuffer != data->stackWritableBuffer) {
        uprv_free(data->writableBuffer);
    }
    data->writableBufSize = size;
    data->writableBuffer  = newbuffer;
 }
 /**
 * Special normalization function for contraction in the forwards iterator.
 * This normalization sequence will place the current character at source->pos
 * and its following normalized sequence into the buffer.
 * The fcd position, pos will be changed. 
 * pos will now point to positions in the buffer.
 * Flags will be changed accordingly.
 * @param data collation iterator data
 */
 inline void normalizeNextContraction(collIterate *data)
 { 
    UChar      *buffer     = data->writableBuffer;
    uint32_t    buffersize = data->writableBufSize;
    uint32_t    strsize;
    UErrorCode  status     = U_ZERO_ERROR;
    /* data->pos - 1 is already in buffer */
    UChar      *pStart     = data->pos; 
    UChar      *pEnd;
    uint32_t    normLen;
    UChar      *pStartNorm;
    if (data->flags & UCOL_ITER_HASLEN) {
        *data->writableBuffer = *(pStart - 1);
        strsize               = 1;
    }
    else {
        strsize = u_strlen(data->writableBuffer);
    }
    pEnd = data->fcdPosition; 
    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, 
                              &status);
    if (buffersize <= normLen + strsize) {
        uint32_t  size = strsize + normLen + 1;
        UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
        uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
        if (data->writableBuffer != data->stackWritableBuffer) {
            uprv_free(buffer);
        }
        data->writableBuffer = temp;
        data->writableBufSize = size;
    }
    status            = U_ZERO_ERROR;
    pStartNorm        = buffer + strsize;
    /* null-termination will be added here */
    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, 
                    &status);
    data->pos        = data->writableBuffer + strsize;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
 }
 /**
 * Contraction character management function that returns the next character
 * for the forwards iterator.
 * Does nothing if the next character is in buffer and not the first character 
 * in it.
 * Else it checks next character in data string to see if it is normalizable.
 * If it is not, the character is simply copied into the buffer, else
 * the whole normalized substring is copied into the buffer, including the 
 * current character.
 * @param data collation element iterator data
 * @return next character
 */
 inline UChar getNextNormalizedChar(collIterate *data) 
 {
    UChar  nextch = 0;
    UChar  ch;
    if ((data->flags & UCOL_ITER_NORM) == 0 || 
        ((data->flags & UCOL_ITER_INNORMBUF) && *data->pos != 0)) {
        /* 
        if no normalization.
        if next character is in normalized buffer, no further normalization
        is required
        */
        return *(data->pos ++);
    }
    ch = *(data->pos);
    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        if (data->pos + 1 == data->endp) {
            data->pos ++;
            return ch;
        }
        nextch = *(data->pos + 1);
    }
    else {
        if (data->flags & UCOL_ITER_INNORMBUF) {
            /* 
            in writable buffer, at this point fcdPosition can not be 
            pointing to the end of the data string. see contracting tag.
            */
            if (data->fcdPosition == data->endp - 1) {
                /* at the end of the string, just dump it into the normalizer */
                insertBufferEnd(data, *(data->fcdPosition));
                data->pos = data->endp;
                return *(data->fcdPosition);
            }
            data->pos = data->fcdPosition;
        }
        else {
            if (*(data->pos + 1) == 0) {
                return *(data->pos ++);
            }
        }
        ch     = *(data->pos);
        nextch = *(data->pos + 1);
    }
    /* 
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition < data->pos && 
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
         ch >= NFC_ZERO_CC_BLOCK_LIMIT_) && collIterFCD(data)) {
            /* 
            Need a more complete FCD check and possible normalization. 
            normalize substring will be appended to buffer 
            */
            normalizeNextContraction(data);
            data->pos ++;
            return ch;
    }
    if (data->flags & UCOL_ITER_INNORMBUF) {
        /* 
        no normalization is to be done hence only one character will be 
        appended to the buffer.
        */
        insertBufferEnd(data, ch);
    }
    /* points back to the pos in string */
    data->pos ++;
    return ch;
 }
 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
 /* It is called by both getNextCE and getNextUCA                                         */
 uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status) {
  uint32_t i = 0; /* general counter */
  uint32_t firstCE = UCOL_NOT_FOUND;
  UChar   *firstUChar = source->pos;
  collIterateState state;
  backupState(source, &state);
  //uint32_t CE = *source->CEpos;
  for (;;) {
    const uint32_t *CEOffset = NULL;
@ -1581,16 +1774,19 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
        /* First we position ourselves at the begining of contraction sequence */
        const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
-        if ((source->flags & UCOL_ITER_HASLEN) && source->pos>=source->endp) {
+        if (((source->flags & UCOL_ITER_HASLEN) && source->pos>=source->endp) 
            || ((source->flags & UCOL_ITER_INNORMBUF) && *source->pos == 0 &&
                source->fcdPosition >= source->endp)) {
                                           /* this is the end of string.  (Null terminated handled later,
                                            when the null doesn't match the contraction sequence.)     */
          {
            CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); /* So we'll pick whatever we have at the point... */
            if (CE == UCOL_NOT_FOUND) {
-              source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
+              // source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
              if(firstCE != UCOL_NOT_FOUND) {
                CE = firstCE;
              }
              loadState(source, &state);
            }
          }
          break;
@ -1601,7 +1797,7 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
        UCharOffset++; /* skip the backward offset, see above */
-        schar = *source->pos++;
+        schar = getNextNormalizedChar(source);
        while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
          UCharOffset++;
        }
@ -1613,7 +1809,8 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
        if(CE == UCOL_NOT_FOUND) {
-          source->pos   = firstUChar; /* spit all the not found chars, which led us in this contraction */
+          // source->pos   = firstUChar; /* spit all the not found chars, which led us in this contraction */
          loadState(source, &state);
          if(firstCE != UCOL_NOT_FOUND) {
            CE = firstCE;
          }
@ -1624,7 +1821,8 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
          uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
          if(tempCE != UCOL_NOT_FOUND) {
            firstCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
-            firstUChar = source->pos-1;
+            /* firstUChar = source->pos-1; */
            backupState(source, &state);
          }
        } else {
          break;
@ -1664,8 +1862,6 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
 /**
 * Inserts the argument character into the front of the buffer replacing the 
 * front null terminator.
 * Repoints the pos pointer to the next character in the writablebuffer.
 * Changes the flags up to date.
 * @param data collation element iterator data
 * @param ch character to be appended
 */
@ -1680,7 +1876,9 @@ inline void insertBufferFront(collIterate *data, UChar ch)
        if (*end == 0) {
            *end       = ch;
            *(end - 1) = 0;
            return;
        }
        end --;
    }
    /* 
@ -1798,7 +1996,7 @@ inline UChar getPrevNormalizedChar(collIterate *data)
    UChar  ch;
    UChar *start;
    if ((data->flags & UCOL_ITER_NORM) == 0 || 
-        (data->flags & UCOL_ITER_INNORMBUF) || *(data->pos - 1) != 0) {
+        ((data->flags & UCOL_ITER_INNORMBUF) && *(data->pos - 1) != 0)) {
        /* 
        if no normalization.
        if previous character is in normalized buffer, no further normalization
@ -1959,8 +2157,10 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
              /* this is the start of string */
              CE = *(coll->contractionCEs +
                     (UCharOffset - coll->contractionIndex));
-              if (CE == UCOL_NOT_FOUND && firstCE != UCOL_NOT_FOUND) {
+              if (CE == UCOL_NOT_FOUND) {
-                CE            = firstCE;
+                  if (firstCE != UCOL_NOT_FOUND) {
                      CE            = firstCE;
                  }
                loadState(source, &state);
              }