ICU-861

Implemented new incremental normalization in backwards iteration. X-SVN-Rev: 4524
2001-04-20 22:29:53 +00:00 · 2001-04-20 22:29:53 +00:00 · 34372f991b
commit 34372f991b
parent 2b0da7cddd
3 changed files with 745 additions and 534 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -45,6 +45,8 @@
 #define LAST_BYTE_MASK_           0xFF
 #define SECOND_LAST_BYTE_SHIFT_   8

+#define ZERO_CC_LIMIT_            0xC0 /* code units below this value take the fast FCD-safe path (trailing combining class == 0) in the next/previous CE loops */
+
 static UCollator* UCA = NULL;

 extern "C" UBool checkFCD(const UChar*, int32_t, UErrorCode*);
@ -81,6 +83,22 @@ isAcceptableUCA(void *context,
    }
 }

+/* added for Han implicit CE 
+ * Constants for the implicit (algorithmically generated) CE computed at the 
+ * end of ucol_getPrevUCA when a code point has no mapping.
+ * Derived values, using integer arithmetic:
+ *   IMPLICIT_OTHER_COUNT_      = 256 - 3                  = 253
+ *   IMPLICIT_LAST_COUNT_       = 253 / 2                  = 126
+ *   IMPLICIT_LAST_COUNT2_      = 0xFFFFF / (253 * 253) + 1 = 17
+ *   IMPLICIT_HAN_SHIFT_        = 126 * 253 - 0x3400       = 18566
+ *   IMPLICIT_BOUNDARY_         = 2 * 253 * 126 + 0x3400   = 77068 (0x12D0C)
+ *   IMPLICIT_LAST2_MULTIPLIER_ = 253 / 17                 = 14
+ * Code points below IMPLICIT_BOUNDARY_ take the first (shifted) branch of 
+ * that computation, all others take the second branch.
+ * NOTE(review): IMPLICIT_HAN_LIMIT_ is not referenced in the code visible 
+ * here - confirm whether it is used elsewhere.
+ */
+static const uint32_t IMPLICIT_HAN_START_ = 0x3400;
+static const uint32_t IMPLICIT_HAN_LIMIT_ = 0xA000;
+static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
+static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3;
+static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_;
+static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2;
+static const uint32_t IMPLICIT_LAST_COUNT2_ = 
+                       (IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) / 
+                       (IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1;
+static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ * 
+                              IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_;
+static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ * 
+                                  IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_;
+static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ / 
+                                                        IMPLICIT_LAST_COUNT2_;

 inline  void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
                              int32_t sourceLen, collIterate *s) { 
@ -116,6 +134,8 @@ inline void backupState(const collIterate *data, collIterateState *backup)
    backup->flags         = data->flags;
    backup->origFlags     = data->origFlags;
    backup->pos           = data->pos;
+    backup->bufferaddress = (long)(data->writableBuffer);
+    backup->buffersize    = data->writableBufSize;
 }

 /** 
@ -129,6 +149,17 @@ inline void loadState(collIterate *data, const collIterateState *backup)
    data->flags       = backup->flags;
    data->origFlags   = backup->origFlags;
    data->pos         = backup->pos;
+    if ((data->flags & UCOL_ITER_INNORMBUF) && 
+        (long)(data->writableBuffer) != backup->bufferaddress) {
+        /* 
+        this is when a new buffer has been reallocated and we'll have to 
+        calculate the new position.
+        note the new buffer has to contain the contents of the old buffer.
+        */
+        uint32_t temp = backup->buffersize - 
+                                  ((long)(data->pos) - backup->bufferaddress);
+        data->pos = data->writableBuffer + (data->writableBufSize - temp);
+    }
 }

 /****************************************************************************/
@ -504,8 +535,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr

    result->zero = 0;
    result->rules = NULL;
-
-    /* get the version info from UCATableHeader and populate the Collator struct*/
+    /* get the version info from UCATableHeader and populate the Collator struct*/
    result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/

@ -652,7 +682,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
            break;
        }
        
-        if (ch < 0xC0 ) {
+        if (ch < ZERO_CC_LIMIT_ ) {
            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
            break;
        }
@ -723,6 +753,7 @@ void collPrevIterNormalize(collIterate *data)
    UChar      *pEnd   = data->pos + 1;         /* End normalize + 1 */
    UChar      *pStart;
    uint32_t    normLen;
+    UChar      *pStartNorm;

    /* Start normalize */
    if (data->fcdPosition == NULL) {
@ -733,11 +764,8 @@ void collPrevIterNormalize(collIterate *data)
    }

    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, 
-                              data->writableBuffer, data->writableBufSize,
-                              &status);
-
-    if (U_FAILURE(status)) {
-        if (status == U_BUFFER_OVERFLOW_ERROR) { /* This would be buffer overflow */
+                              data->writableBuffer, 0, &status);
+    if (data->writableBufSize <= normLen) {
        if (data->writableBuffer != data->stackWritableBuffer) {
            uprv_free( data->writableBuffer);
        }
@ -745,16 +773,18 @@ void collPrevIterNormalize(collIterate *data)
                                                               sizeof(UChar));
        /* to handle the zero termination */
        data->writableBufSize = normLen + 1;
+    }
    status = U_ZERO_ERROR;
-            unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0,
-                        data->writableBuffer, data->writableBufSize, &status);
-        }
-        else {
-            return;
-        }
-    }
+    /* 
+    this puts the null termination in front of the normalized string instead
+    of at the end
+    */
+    pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
+    *(pStartNorm - 1) = 0;
+    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, 
+                    &status);
    
-    data->pos        = data->writableBuffer + normLen;
+    data->pos        = data->writableBuffer + data->writableBufSize;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
@ -773,14 +803,16 @@ void collPrevIterNormalize(collIterate *data)
 *    True because the previous call to this function will have always exited     
 *    that way, and we get called for every char where cc might be non-zero.        
 * @param data collation iterate struct
+* @return normalization status, TRUE for normalization to be done, FALSE 
+*         otherwise
 */
-inline void collPrevIterFCD(collIterate *data)
+inline UBool collPrevIterFCD(collIterate *data) 
 {
    UChar32     codepoint;
    uint8_t     leadingCC;
    uint8_t     trailingCC = 0;
    uint16_t    fcd;
-    UBool       needNormalize = FALSE;
+    UBool       result = FALSE;
    int         length;
    
    length = (data->pos + 1) - data->string;
@ -822,7 +854,7 @@ inline void collPrevIterFCD(collIterate *data)
            }
            
            if (leadingCC < trailingCC) {
-                needNormalize = TRUE;
+                result = TRUE;
            }
            
            leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 
@ -836,9 +868,7 @@ inline void collPrevIterFCD(collIterate *data)
        data->fcdPosition = data->string + length;
    }

-    if (needNormalize) {
-        collPrevIterNormalize(data);
-    }
+    return result;
 }

 /**
@ -870,6 +900,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
        side buffer / original string, and we need to start again to get the 
        next character.
        */
+        
        while (TRUE) {                       
            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /* 
@ -883,7 +914,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
            }
            else {
                /* we are in the side buffer. */
-                if (data->pos <= data->writableBuffer) {
+                if (*(data->pos - 1) == 0) {
                    /* 
                    At the start of the normalize side buffer. 
                    Go back to string.
@ -912,7 +943,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
            */
            if ((data->flags & UCOL_ITER_NORM) == 0 || 
                data->fcdPosition <= data->pos ||
-                ch < 0xC0) {
+                ch < ZERO_CC_LIMIT_) {
                break;
            }
        
@ -930,7 +961,10 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
            }
        
            /* Need a more complete FCD check and possible normalization. */
-            collPrevIterFCD(data);
+            if (collPrevIterFCD(data)) {
+                collPrevIterNormalize(data);
+            }
+
            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
@ -1342,35 +1376,28 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
    order = ucmp32_get(UCA->mapping, ch);
  }
  
-  if (order >= UCOL_NOT_FOUND) {
+  if (order > UCOL_NOT_FOUND) {
    order = getSpecialPrevCE(UCA, order, collationSource, status); 
  }
  
  if (order == UCOL_NOT_FOUND) 
  { 
+    uint32_t cp = 0;
    /* 
    This is where we have to resort to algorithmical generation.
    We have to check if ch is possibly a first surrogate - then we need to 
    take the next code unit and make a bigger CE 
    */
-    UChar prevChar;
    uint32_t
      SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
      LCount = 19, VCount = 21, TCount = 28,
      NCount = VCount * TCount,   /* 588 */
      SCount = LCount * NCount;   /* 11172 */
-      /*
-      LLimit = LBase + LCount,    // 1113
-      VLimit = VBase + VCount,    // 1176
-      TLimit = TBase + TCount,    // 11C3
-      SLimit = SBase + SCount;    // D7A4
-      */
   
    /* 
    once we have failed to find a match for codepoint cp, and are in the 
    implicit code.
    */
-
    uint32_t L = ch - SBase;
    if (L < SCount) 
    { /* since it is unsigned, catchs zero case too */
@ -1424,61 +1451,35 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
        }
        collationSource->toReturn = collationSource->CEpos - 1;
        return *(collationSource->toReturn);
-
-        /*return *(collationSource->toReturn++);*/
-/*
-        ucol_getJamoCEs(collationSource->coll, L, &collationSource->CEpos);
-        ucol_getJamoCEs(collationSource->coll, V, &collationSource->CEpos);
-        if (T != TBase) {
-          ucol_getJamoCEs(collationSource->coll, T, &collationSource->CEpos);
-        }
-        collationSource->toReturn = collationSource->CEpos - 1;
-        return *(collationSource->toReturn);
-*/
-        /*
-        Jamo is Special
-        do recursive processing of L, V, and T with fetchCE (but T only if not
-        equal to TBase!!)
-        Since fetchCE returns a CE, and (potentially) stuffs items into the ce
-        buffer,
-        this is how it is done.
-        */
-        /*
-          int firstCE = fetchCE(L, ...);
-          // set pointer, leave gap!
-          int* lastExpansion = expansionBufferEnd++;
-          *lastExpansion = fetchCE(V,...);
-          if (T != TBase) {
-            lastExpansion = expansionBufferEnd++; // set pointer, leave gap!
-            *lastExpansion = fetchCE(T,...);
-          }
-        */
        }
    }

    if (UTF_IS_SECOND_SURROGATE(ch)) 
    {
-      /* This is where the s***t hits the fan */
-      /* it turns out, the first part of the if can be satisfied even if we're */
-      /* at the beggining of the string */
-      /* we have to make sure we know what is the situation we're in */
-      /* quick fix is by using isUsingWritable, as shown below */
-      if ((collationSource->string < collationSource->pos) &&
-          (UTF_IS_FIRST_SURROGATE(prevChar = *(collationSource->pos - 1))))
+        UChar  prevChar;
+        UChar *prev;
+        if ((collationSource->string == collationSource->pos) ||
+            (collationSource->pos == collationSource->writableBuffer &&
+            collationSource->fcdPosition == NULL)) {
+            /* we are at the start of the string, wrong place to be at */
+            return 0;
+        }
+        if (collationSource->pos != collationSource->writableBuffer) {
+            prev     = collationSource->pos - 1;
+        }
+        else {
+            prev     = collationSource->fcdPosition;
+        }
+        prevChar = *prev;
+
+        /* Handles Han and Supplementary characters here.*/
+        if (UTF_IS_FIRST_SURROGATE(prevChar)) 
        {
-        uint32_t cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
-        collationSource->pos --;
+            cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
+            collationSource->pos = prev;
            if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
              return 0;  /* illegal code value, use completely ignoreable! */
            }
-
-        /*
-        This is a code point minus 0x10000, that's what algorithm requires
-        */
-        *(collationSource->CEpos ++) = 0xE0010303 | (cp & 0xFFE00) << 8;
-        order = 0x80200080 | (cp & 0x001FF) << 22;
-        collationSource->toReturn = collationSource->CEpos;
-        *(collationSource->CEpos ++) = order;
        } 
        else {
            return 0; /* completely ignorable */
@ -1490,14 +1491,41 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
      if (UTF_IS_FIRST_SURROGATE(ch) || (ch & 0xFFFE) == 0xFFFE) {
        return 0; /* completely ignorable */
      }
-
-      /* Make up an artifical CE from code point as per UCA */
-      *(collationSource->CEpos ++) = 0xD0800303 | (ch & 0xF000) << 12 |
-                                     (ch & 0x0FE0) << 11;
-      collationSource->toReturn = collationSource->CEpos;
-      order = 0x04000080 | (ch & 0x001F) << 27;
-      *(collationSource->CEpos ++) = order;
+      cp = ch;
    }
+  
+      /* we must skip all 00, 01, 02 bytes, so most bytes have 253 values
+       we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
+       we shift so that HAN all has the same first primary, for compression.
+       for the 4 byte case, we make the gap as large as we can fit.
+       Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
+       Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
+      */   
+      int32_t last0 = cp - IMPLICIT_BOUNDARY_;
+      uint32_t r = 0;
+
+      if (last0 < 0) {
+          cp += IMPLICIT_HAN_SHIFT_; // shift so HAN shares single block
+          int32_t last1 = cp / IMPLICIT_LAST_COUNT_;
+          last0 = cp % IMPLICIT_LAST_COUNT_;
+          int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
+          last1 %= IMPLICIT_OTHER_COUNT_;
+          r = 0xEC030300 + (last2 << 24) + (last1 << 16) + (last0 << 9);
+      } else {
+          int32_t last1 = last0 / IMPLICIT_LAST_COUNT2_;
+          last0 %= IMPLICIT_LAST_COUNT2_;
+          int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
+          last1 %= IMPLICIT_OTHER_COUNT_;
+          r = 0xEF030303 + (last2 << 16) + (last1 << 8) + 
+              (last0 * IMPLICIT_LAST2_MULTIPLIER_);
+      }
+      /* 
+      order = (r & 0xFFFF0000) | 0x00000303;
+      *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x00000080;
+      */
+      *(collationSource->CEpos++) = (r & 0xFFFF0000) | 0x00000303;
+      collationSource->toReturn = collationSource->CEpos;
+      order = ((r & 0x0000FFFF)<<16) | 0x00000080;
  }
  return order; /* return the CE */
 }
@ -1636,6 +1664,204 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
  return CE;
 }

+/**
+* Inserts the argument character into the front of the buffer replacing the 
+* front null terminator, and writes a new null terminator in the slot 
+* immediately before the inserted character.
+* The buffer is scanned backwards from its last slot for the null terminator.
+* If the terminator is found in any slot other than the first one, the 
+* insertion is done in place and the function returns.
+* Otherwise a new buffer with incsize extra slots is allocated, the old 
+* buffer is copied into its tail, the character and the new terminator are 
+* written in front of the copied content, the old buffer is freed unless it 
+* is data->stackWritableBuffer, and data->writableBuffer and 
+* data->writableBufSize are updated.
+* NOTE(review): data->pos and data->flags are not modified here; if 
+* data->pos points into the old buffer when it is replaced, the caller has 
+* to recompute it - confirm against the callers.
+* NOTE(review): the result of uprv_malloc is not checked.
+* @param data collation element iterator data
+* @param ch character to be inserted
+*/
+inline void insertBufferFront(collIterate *data, UChar ch) 
+{
+          uint32_t  size    = data->writableBufSize;
+          UChar    *end     = data->writableBuffer + (size - 1);
+          UChar    *newbuffer;
+    const uint32_t  incsize = 5;
+
+    /* 
+    look for the front null terminator, walking from the last slot towards 
+    the first. 
+    the first slot is not examined: a terminator there leaves no room for a 
+    new terminator in front of it, so the buffer has to grow.
+    the pointer has to be moved on every pass and the function has to leave 
+    once the character is stored, otherwise this loop never ends.
+    */
+    while (end > data->writableBuffer) {
+        if (*end == 0) {
+            *end       = ch;
+            *(end - 1) = 0;
+            return;
+        }
+        end --;
+    }
+
+    /* 
+    buffer will always be null terminated infront.
+    giving extra space since it is likely that more characters will be added.
+    */
+    size += incsize;
+    newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
+    end = newbuffer + incsize;
+    uprv_memcpy(end, data->writableBuffer, 
+                data->writableBufSize * sizeof(UChar));
+    /* 
+    end now addresses the copy of the old first slot, i.e. the old 
+    terminator, which is replaced by the new character 
+    */
+    *end       = ch;
+    *(end - 1) = 0;
+
+    if (data->writableBuffer != data->stackWritableBuffer) {
+        uprv_free(data->writableBuffer);
+    }
+
+    data->writableBufSize = size;
+    data->writableBuffer  = newbuffer;
+}
+
+/**
+* Special normalization function for contraction in the previous iterator.
+* This normalization sequence will place the current character at source->pos
+* and its following normalized sequence into the buffer.
+* The fcd position, pos will be changed. 
+* pos will now point to positions in the buffer.
+* Flags will be changed accordingly.
+*
+* Outline of the code below:
+* 1. nulltermsize is worked out. With UCOL_ITER_HASLEN set, the code unit at 
+*    data->pos - 1 is copied into the last slot of the buffer and 
+*    nulltermsize is buffersize - 1. Otherwise the buffer is scanned 
+*    backwards from its last slot until a 0 is met, which leaves nulltermsize 
+*    equal to the index of that 0 plus one.
+* 2. The text to normalize starts at data->string when data->fcdPosition is 
+*    NULL, otherwise at data->fcdPosition + 1, and ends at data->pos + 1.
+* 3. unorm_normalize is called with a destination capacity of 0 and its 
+*    return value is kept in normLen.
+* 4. When nulltermsize <= normLen, a new buffer is allocated with uprv_malloc 
+*    and stored in data->writableBuffer / data->writableBufSize; the old 
+*    buffer is freed unless it is data->stackWritableBuffer.
+* 5. A 0 is written in front of the destination, the text is normalized to 
+*    NFD into the destination, data->pos is set to 
+*    data->writableBuffer + nulltermsize, data->flags is saved in 
+*    data->origFlags, UCOL_ITER_INNORMBUF is set and UCOL_ITER_NORM and 
+*    UCOL_ITER_HASLEN are cleared.
+*
+* NOTE(review): data->fcdPosition is only read in this function, it is not 
+* assigned - confirm the statement above that the fcd position changes.
+* NOTE(review): in step 4 the local variable buffer keeps the old address, 
+* which may just have been passed to uprv_free, yet the destination in step 5 
+* is computed from buffer and not from data->writableBuffer - looks like a 
+* use of released memory, confirm.
+* NOTE(review): in step 4 the uprv_memcpy source is the start of the old 
+* buffer and its length uses nulltermsize after it has been reassigned to 
+* normLen + 1 - confirm that this copies the intended old content.
+* NOTE(review): the result of uprv_malloc is not checked.
+* @param data collation iterator data
+*/
+inline void normalizePrevContraction(collIterate *data)
+{ 
+    UChar      *buffer     = data->writableBuffer;
+    uint32_t    buffersize = data->writableBufSize;
+    uint32_t    nulltermsize;
+    UErrorCode  status     = U_ZERO_ERROR;
+    UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
+    UChar      *pStart;
+    uint32_t    normLen;
+    UChar      *pStartNorm;
+
+    if (data->flags & UCOL_ITER_HASLEN) {
+        /* 
+        normalization buffer not used yet, we'll pull down the next 
+        character into the end of the buffer
+        NOTE(review): the code unit copied is the one at data->pos - 1 - 
+        confirm that this is the character meant by "next".
+        */
+        *(buffer + (buffersize - 1)) = *(data->pos - 1);
+        nulltermsize                  = buffersize - 1;
+    }
+    else {
+        nulltermsize = buffersize;
+        UChar *temp = buffer + (nulltermsize - 1);
+        while (*(temp --) != 0) {
+            nulltermsize --;
+        }
+    }
+
+    /* Start normalize: pick the first code unit of the text to normalize */
+    if (data->fcdPosition == NULL) {
+        pStart = data->string;
+    }
+    else {
+        pStart = data->fcdPosition + 1; 
+    }
+
+    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, 
+                              &status);
+
+    if (nulltermsize <= normLen) {
+        uint32_t  size = buffersize - nulltermsize + normLen + 1;
+        UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
+        nulltermsize   = normLen + 1;
+        uprv_memcpy(temp + normLen, buffer, 
+                    sizeof(UChar) * (buffersize - nulltermsize));
+        if (data->writableBuffer != data->stackWritableBuffer) {
+            uprv_free(buffer);
+        }
+        data->writableBuffer = temp;
+        data->writableBufSize = size;
+    }
+
+    status = U_ZERO_ERROR;
+    /* 
+    this puts the null termination infront of the normalized string instead
+    of the end
+    status was reset just above because the first call was made with a 
+    destination capacity of 0.
+    NOTE(review): the status of the second call is not examined.
+    */
+    pStartNorm   = buffer + (nulltermsize - normLen);
+    *(pStartNorm - 1) = 0;
+    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, 
+                    &status);
+    
+    data->pos        = data->writableBuffer + nulltermsize;
+    data->origFlags  = data->flags;
+    data->flags     |= UCOL_ITER_INNORMBUF;
+    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
+}
+
+/**
+* Contraction character management function that returns the previous character
+* for the backwards iterator.
+* Does nothing if the previous character is in buffer and not the first 
+* character in it.
+* Else it checks previous character in data string to see if it is 
+* normalizable.
+* If it is not, the character is simply copied into the buffer, else
+* the whole normalized substring is copied into the buffer, including the 
+* current character.
+* The function may change data->pos and, through insertBufferFront and 
+* normalizePrevContraction, the writable buffer and the flags.
+* NOTE(review): NFC_ZERO_CC_BLOCK_LIMIT_ is not defined in the code visible 
+* here (only ZERO_CC_LIMIT_ is) - confirm where it comes from.
+* @param data collation element iterator data
+* @return previous character
+*/
+inline UChar getPrevNormalizedChar(collIterate *data) 
+{
+    UChar  prevch;
+    UChar  ch;
+    UChar *start;
+    if ((data->flags & UCOL_ITER_NORM) == 0 || 
+        (data->flags & UCOL_ITER_INNORMBUF) || *(data->pos - 1) != 0) {
+        /* 
+        if no normalization.
+        if previous character is in normalized buffer, no further normalization
+        is required
+        NOTE(review): as written, execution continues past this point only 
+        when UCOL_ITER_NORM is set, UCOL_ITER_INNORMBUF is clear and the code 
+        unit in front of pos is 0 - confirm that this is the intended 
+        combination.
+        */
+        return *(data->pos - 1);
+    }
+
+    start = data->pos;
+    if (data->flags & UCOL_ITER_HASLEN) {
+        /* in data string; nothing to check when the previous code unit is the first one of the string */
+        if ((start - 1) == data->string) {
+            return *(start - 1);
+        }
+        data->pos = start - 1;
+    }
+    else {
+        /* 
+        in writable buffer, at this point fcdPosition can not be NULL.
+        see contracting tag.
+        */
+        if (data->fcdPosition == data->string) {
+            /* at the start of the string, just dump it into the normalizer */
+            insertBufferFront(data, *(data->fcdPosition));
+            return *(data->fcdPosition);
+        }
+        data->pos = data->fcdPosition;
+    }
+    ch     = *(data->pos);
+    prevch = *(data->pos - 1);
+    /* 
+    * if the current character is not fcd.
+    * Trailing combining class == 0.
+    * the full check in collPrevIterFCD is made only when fcdPosition lies 
+    * beyond pos and ch or prevch is at or above the limit.
+    */
+    if (data->fcdPosition > data->pos && 
+        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
+        prevch >= NFC_ZERO_CC_BLOCK_LIMIT_) && collPrevIterFCD(data)) {
+            /* 
+            Need a more complete FCD check and possible normalization. 
+            normalize substring will be appended to buffer 
+            */
+            normalizePrevContraction(data);
+            return ch;
+    }
+
+    if (data->flags & UCOL_ITER_INNORMBUF) {
+        /* 
+        no normalization is to be done hence only one character will be 
+        appended to the buffer.
+        NOTE(review): data->pos is not restored to start on this path - 
+        confirm.
+        */
+        insertBufferFront(data, ch);
+    }
+    else {
+        /* points back to the pos in string */
+        data->pos = start;
+    }
+
+    return ch;
+}
+
 /** 
 * This function handles the special CEs like contractions, expansions, 
 * surrogates, Thai.
@ -1659,8 +1885,8 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
        uint8_t  firstflags   = source->flags;
        */
        collIterateState state;
-
  backupState(source, &state);
+
  for(;;)
  {
    /* the only ces that loops are thai and contractions */
@ -1689,9 +1915,11 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
          Move the prevowel and the following base Consonant into the 
          normalization buffer with their order swapped
          */
-          source->writableBuffer[0] = *source->pos;
-          source->writableBuffer[1] = *(source->pos - 1);
-          source->writableBuffer[2] = 0;
+          UChar *tempbuffer = source->writableBuffer + 
+                              (source->writableBufSize - 1);
+          *(tempbuffer - 2) = 0;
+          *(tempbuffer - 1) = *source->pos;
+          *(tempbuffer)     = *(source->pos - 1);
            
          /* 
          Indicate where to continue in main input string after exhausting 
@ -1703,7 +1931,8 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
          else {
            source->fcdPosition       = source->pos - 2; 
          }
-          source->pos               = source->writableBuffer + 2;
+
+          source->pos               = tempbuffer;
          source->origFlags         = source->flags;
          source->flags            |= UCOL_ITER_INNORMBUF;
          source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
@ -1715,28 +1944,26 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
        /* This should handle contractions */
        for(;;)
        {
+            uint32_t tempfirstCE;
            /* First we position at the begining of contraction sequence */
            constart = UCharOffset = (UChar *)coll->image + 
                       getContractOffset(CE);
            strend = source->endp;

-            if (firstCE == UCOL_NOT_FOUND) {
-              firstCE = *(coll->contractionCEs +
+            tempfirstCE = *(coll->contractionCEs + 
                          (UCharOffset - coll->contractionIndex));
+            if (tempfirstCE != UCOL_NOT_FOUND) {
+              firstCE    = tempfirstCE;
+              backupState(source, &state);
            }

-            if ((source->pos == source->string) ||
-                (source->pos == source->writableBuffer &&
+            if ((source->pos == source->string) || (*(source->pos - 1) == 0 && 
                source->fcdPosition == NULL)) { 
              /* this is the start of string */
              CE = *(coll->contractionCEs + 
                     (UCharOffset - coll->contractionIndex)); 
              if (CE == UCOL_NOT_FOUND && firstCE != UCOL_NOT_FOUND) {
                CE            = firstCE;
-                /*
-                source->pos   = firstUChar;
-                source->flags = firstflags;
-                */
                loadState(source, &state);
              }

@ -1747,15 +1974,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
            Progressing to backwards block
            */
            UCharOffset += *UCharOffset; 
-
-            /* not at the border of the writable buffer */
-            if ((source->flags & UCOL_ITER_HASLEN) ||
-                (source->pos != source->writableBuffer)) {
-                schar = *(source->pos - 1);
-            }
-            else {
-                schar = *(source->fcdPosition);
-            }
+            schar = getPrevNormalizedChar(source);

            while (schar > (tchar = *UCharOffset)) {
              UCharOffset ++;
@ -1765,24 +1984,14 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
              UCharOffset = constart; 
            } 
            else {
-                if ((source->flags & UCOL_ITER_HASLEN) ||
-                    (source->pos != source->writableBuffer)) {
                source->pos --;
            }
-                else {
-                    source->pos = source->fcdPosition;
-                    source->flags = source->origFlags;
-                }
-            }

-            CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
+            CE = *(coll->contractionCEs + 
+                (UCharOffset - coll->contractionIndex));
            if (!isContraction(CE)) {
              if (CE == UCOL_NOT_FOUND) {
                CE            = firstCE;
-                /*
-                source->pos   = firstUChar;
-                source->flags = firstflags;
-                */
                loadState(source, &state);
              }
              firstCE = UCOL_NOT_FOUND;
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@ -118,6 +118,8 @@ data similar to collIterate.
 struct collIterateState {
    UChar    *pos; /* This is position in the string.  Can be to original or writable buf */
    UChar    *fcdPosition; /* Position in the original string to continue FCD check from. */
+    long      bufferaddress; /* address of the normalization buffer; backupState stores (long)writableBuffer here. NOTE(review): long can be narrower than a pointer on some platforms - confirm this is safe on all supported targets */
+    uint32_t  buffersize; /* writableBufSize at the time of the backup; loadState uses it to recompute pos when the buffer address has changed */
    uint8_t   flags; /* copy of collIterate flags taken by backupState */
    uint8_t   origFlags; /* copy of collIterate origFlags taken by backupState */
 };