ICU-4350 Upgrade ICU4C to UCA 4.1

X-SVN-Rev: 17622
2005-05-19 06:43:35 +00:00 · 2005-05-19 06:43:35 +00:00 · 32354b1c86
commit 32354b1c86
parent 5e1a113aba
5 changed files with 129 additions and 445 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -137,6 +137,7 @@ inline void  IInit_collIterate(const UCollator *collator, const UChar *sourceStr
    }
    (s)->iterator = NULL;
    //(s)->iteratorIndex = 0;
+    (s)->consumedChars = 0;
 }

 U_CAPI void  U_EXPORT2
@ -174,6 +175,7 @@ inline void backupState(const collIterate *data, collIterateState *backup)
        data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
      }
    }
+    backup->consumedChars = data->consumedChars;
 }

 /**
@ -231,6 +233,7 @@ inline void loadState(collIterate *data, const collIterateState *backup,
        */
        data->fcdPosition = backup->fcdPosition;
    }
+    data->consumedChars = backup->consumedChars;
 }


@ -528,24 +531,6 @@ void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo
    result->options = opts;
 }

-#if 0
-// doesn't look like anybody is using this
-void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
-  if(U_FAILURE(*status)) {
-    return;
-  }
-    opts->caseFirst = result->caseFirst;
-    opts->caseLevel = result->caseLevel;
-    opts->frenchCollation = result->frenchCollation;
-    opts->normalizationMode = result->normalizationMode;
-    opts->strength = result->strength;
-    opts->variableTopValue = result->variableTopValue;
-    opts->alternateHandling = result->alternateHandling;
-    opts->hiraganaQ = result->hiraganaQ;
-    opts->numericCollation = result->numericCollation;
-}
-#endif
-

 /**
 * Approximate determination if a character is at a contraction end.
@ -556,7 +541,7 @@ void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode
 */
 static
 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
-    if (UTF_IS_TRAIL(c)) {
+    if (U16_IS_TRAIL(c)) {
      return TRUE;
    }

@ -582,9 +567,9 @@ inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
 *        in contraction processing.
 */
 static
-inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) {
+inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    uint8_t sCC = 0;
-    if (c >= 0x300 && ucol_unsafeCP(c, coll)) {
+    if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
        sCC = u_getCombiningClass(c);
    }
    return sCC;
@ -1259,8 +1244,8 @@ inline UBool collIterFCD(collIterate *collationSource) {
    /* trie access */
    fcd = unorm_getFCD16(fcdTrieIndex, c);
    if (fcd != 0) {
-        if (UTF_IS_FIRST_SURROGATE(c)) {
-            if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
+        if (U16_IS_LEAD(c)) {
+            if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
                ++srcP;
                fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
            } else {
@ -1280,8 +1265,8 @@ inline UBool collIterFCD(collIterate *collationSource) {
                c = *srcP++;
                /* trie access */
                fcd = unorm_getFCD16(fcdTrieIndex, c);
-                if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) {
-                    if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
+                if (fcd != 0 && U16_IS_LEAD(c)) {
+                    if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
                        ++srcP;
                        fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
                    } else {
@ -1330,6 +1315,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
    }

    UChar ch = 0;
+    collationSource->consumedChars = 0;

    for (;;)                           /* Loop handles case when incremental normalize switches   */
    {                                  /*   to or from the side buffer / original string, and we  */
@ -1568,9 +1554,9 @@ inline UBool collPrevIterFCD(collIterate *data)

    /* Get the trailing combining class of the current character. */
    c = *--src;
-    if (!UTF_IS_SURROGATE(c)) {
+    if (!U16_IS_SURROGATE(c)) {
        fcd = unorm_getFCD16(fcdTrieIndex, c);
-    } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
+    } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
        --src;
        fcd = unorm_getFCD16(fcdTrieIndex, c2);
        if (fcd != 0) {
@ -1595,9 +1581,9 @@ inline UBool collPrevIterFCD(collIterate *data)
            }

            c = *--src;
-            if (!UTF_IS_SURROGATE(c)) {
+            if (!U16_IS_SURROGATE(c)) {
                fcd = unorm_getFCD16(fcdTrieIndex, c);
-            } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
+            } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
                --src;
                fcd = unorm_getFCD16(fcdTrieIndex, c2);
                if (fcd != 0) {
@ -1817,75 +1803,34 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
-            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
-        }
-        else {
-              // TODO: fix me for THAI - I reference *(data->pos-1)
-                if ((data->flags & UCOL_ITER_INNORMBUF) == 0 &&
-                    /*UCOL_ISTHAIBASECONSONANT(ch) &&*/   // This is from the old specs - we now rearrange unconditionally
-                    // makes sure that we're not at the beggining of the string
-                    //data->pos > data->string &&
-                    !collIter_bos(data) &&
-                    UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1)))
-                    //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
-                {
-                    collIterateState entryState;
-                    backupState(data, &entryState);
-                    // we have to check if the previous character is also Thai
-                    // if not, we can just set the result
-                    goBackOne(data);
-                    if(collIter_bos(data) || !UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) {
-                      loadState(data, &entryState, FALSE);
-                      result = UCOL_THAI;
-                    } else { // previous is also reordered
-                      // we need to go back as long as they are being reordered
-                      // count over the range of reorderable characters and see
-                      // if there is an even or odd number of them
-                      // if even, we should not reorder. If odd we should reorder.
-                      int32_t noReordered = 1; // the one we already detected
-                      while(!collIter_bos(data) && UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) {
-                        noReordered++;
-                        goBackOne(data);
-                      }
-                      if(noReordered & 1) { // odd number of reorderables
-                        result = UCOL_THAI;
-                      } else {
-                        result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
-                      }
-                      loadState(data, &entryState, FALSE);
-                    }
-                }
-            else if (ch <= 0xFF) {
-              result = coll->latinOneMapping[ch];
-              //if (result > UCOL_NOT_FOUND) {
-                //result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
-              //}
+          result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
+        } else {
+          if (ch <= 0xFF) {
+            result = coll->latinOneMapping[ch];
+          }
+          else {
+            result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
+          }
+          if (result > UCOL_NOT_FOUND) {
+            result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
+          }
+          if (result == UCOL_NOT_FOUND) {
+            if (!isAtStartPrevIterate(data) &&
+              ucol_contractionEndCP(ch, data->coll)) {
+                result = UCOL_CONTRACTION;
+              }
+            else {
+              if(coll->UCA) {
+                result = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
+              }
            }
-                else {
-                    /*result = ucmpe32_get(coll->mapping, ch);*/
-                    result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
-                }
-                    if (result > UCOL_NOT_FOUND) {
-                        result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
-                    }
-                if (result == UCOL_NOT_FOUND) {
-                  if (!isAtStartPrevIterate(data) &&
-                      ucol_contractionEndCP(ch, data->coll)) {
-                      result = UCOL_CONTRACTION;
-                  }
-                  else {
-                        /*result = ucmpe32_get(UCA->mapping, ch);*/
-                      if(coll->UCA) {
-                        result = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
-                      }
-                  }

-                  if (result > UCOL_NOT_FOUND && coll->UCA) {
-                    result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
-                  }
-                }
+            if (result > UCOL_NOT_FOUND && coll->UCA) {
+              result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
            }
+          }
        }
+    }
    return result;
 }

@ -2613,7 +2558,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
        UChar trail;
        collIterateState state;
        backupState(source, &state);
-        if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
+        if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
          // we chould have stepped one char forward and it might have turned that it
          // was not a trail surrogate. In that case, we have to backup.
          loadState(source, &state, TRUE);
@ -2631,93 +2576,6 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
        }
      }
      break;
-    case THAI_TAG:
-      /* Thai/Lao reordering */
-        if  (((source->flags) & UCOL_ITER_INNORMBUF)      /* Already Swapped     ||                 */
-          || collIter_eos(source))                        /* At end of string.  No swap possible    */
-        {
-            // Treat Thai as a length one expansion */
-            CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
-            CE = *CEOffset++;
-        }
-        else
-        {
-          // Move the prevowel and the following base Consonant into the normalization buffer
-          //   with their order swapped
-          // Note: this operation might activate the normalization buffer. We have to check for
-          // that and act accordingly.
-          UChar thCh = getNextNormalizedChar(source);
-          UChar32 cp = 0;
-          if(U16_IS_LEAD(thCh)) {
-            if(!collIter_eos(source)) {
-              collIterateState thaiState;
-              backupState(source, &thaiState);
-              UChar trailCh = getNextNormalizedChar(source);
-              if(U16_IS_TRAIL(trailCh)) {
-                cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
-              } else {
-                loadState(source, &thaiState, TRUE);
-                cp = (UChar32)thCh;
-              }
-            } else {
-              cp = (UChar32)thCh;
-            }
-          } else {
-              cp = (UChar32)thCh;
-          }
-          // Now we have the character that needs to be decomposed
-          // if the normalizing buffer was not used, we can just use our structure and be happy.
-          if((source->flags & UCOL_ITER_INNORMBUF) == 0) {
-            // decompose into writable buffer
-            int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
-            if(decompLen < 0) {
-              decompLen = -decompLen;
-            }
-            // reorder Thai and the character after it
-            if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) {
-              source->writableBuffer[0] = source->writableBuffer[1];
-              source->writableBuffer[1] = source->writableBuffer[2];
-              source->writableBuffer[2] = ch;
-            } else {
-              source->writableBuffer[0] = source->writableBuffer[1];
-              source->writableBuffer[1] = ch;
-            }
-            // zero terminate, since normalization buffer is always zero terminated
-            source->writableBuffer[decompLen+1] = 0; // we added the prevowel
-            if(source->pos) {
-              source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
-                                                           //   after exhausting the writableBuffer
-            }
-            source->pos   = source->writableBuffer;
-            source->origFlags         = source->flags;
-            source->flags            |= UCOL_ITER_INNORMBUF;
-            source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
-          }
-          else {
-              // stuff is already normalized... what to do here???
-
-              // if we are in the normalization buffer, thCh must be in it
-              // prove by contradiction
-              // if thCh is not in the normalization buffer,
-              // that means that trailCh is the normalization buffer
-              // that means that trailCh is a trail surrogate by the above
-              // bounding if block, this is a contradiction because there
-              // are no characters at the moment that decomposes to an
-              // unmatched surrogate. qed.
-              if (cp >= 0x10000) {
-                  source->writableBuffer[0] = source->writableBuffer[1];
-                  source->writableBuffer[1] = source->writableBuffer[2];
-                  source->writableBuffer[2] = ch;
-              }
-              else {
-                  source->writableBuffer[0] = source->writableBuffer[1];
-                  source->writableBuffer[1] = ch;
-              }
-              source->pos = source->writableBuffer;
-          }
-          CE = UCOL_IGNORABLE;
-      }
-      break;
    case SPEC_PROC_TAG:
      {
        // Special processing is getting a CE that is preceded by a certain prefix
@ -2759,42 +2617,6 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
          }
          else
          {
-              // if there is a completely ignorable code point in the middle of
-              // a prefix, we need to act as if it's not there
-              // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
-              // lone surrogates cannot be set to zero as it would break other processing
-              uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
-              // it's easy for BMP code points
-              if(isZeroCE == 0) {
-                continue;
-              } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
-                // for supplementary code points, we have to check the next one
-                // situations where we are going to ignore
-                // 1. beginning of the string: schar is a lone surrogate
-                // 2. schar is a lone surrogate
-                // 3. schar is a trail surrogate in a valid surrogate sequence
-                //    that is explicitly set to zero.
-                if (!collIter_bos(source)) {
-                  UChar lead;
-                  if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
-                    isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
-                    if(getCETag(isZeroCE) == SURROGATE_TAG) {
-                      uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
-                      if(finalCE == 0) {
-                        // this is a real, assigned completely ignorable code point
-                        goBackOne(source);
-                        continue;
-                      }
-                    }
-                  } else {
-                    // lone surrogate, completely ignorable
-                    continue;
-                  }
-                } else {
-                  // lone surrogate at the beggining, completely ignorable
-                  continue;
-                }
-              }
              // Source string char was not in the table.
              //   We have not found the prefix.
              CE = *(coll->contractionCEs +
@ -2864,45 +2686,23 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
            //  Pick up the corresponding CE from the table.
            CE = *(coll->contractionCEs +
                (UCharOffset - coll->contractionIndex));
+            source->consumedChars++;
        }
        else
        {
-            // if there is a completely ignorable code point in the middle of
-            // contraction, we need to act as if it's not there
-            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
-            // it's easy for BMP code points
-            if(isZeroCE == 0) {
-                continue;
-            } else if(UTF_IS_LEAD(schar)) {
-              if(!collIter_eos(source)) {
-                backupState(source, &state);
-                UChar trail = getNextNormalizedChar(source);
-                if(UTF_IS_TRAIL(trail)) { // do stuff with trail
-                  if(getCETag(isZeroCE) == SURROGATE_TAG) {
-                    uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
-                    if(finalCE == 0) {
-                      continue;
-                    }
-                  }
-                } else {
-                  // broken surrogate sequence, thus completely ignorable
-                  loadState(source, &state, TRUE);
-                  continue;
-                }
-                loadState(source, &state, TRUE);
-              } else { // no  more characters, so broken surrogate pair...
-                // this contraction will ultimately fail, but not because of us
-                continue;
-              }
-            } // else if(UTF_IS_LEAD(schar))
-
            // Source string char was not in contraction table.
            //   Unless we have a discontiguous contraction, we have finished
            //   with this contraction.
+            UChar32 miss = schar;
+            if(U16_IS_LEAD(schar)) { // in order to do the proper detection, we
+              // need to see if we're dealing with a supplementary
+              miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
+            }
+
            uint8_t sCC;
-            if (schar < 0x300 ||
+            if (miss < 0x300 ||
                maxCC == 0 ||
-                (sCC = i_getCombiningClass(schar, coll)) == 0 ||
+                (sCC = i_getCombiningClass(miss, coll)) == 0 ||
                sCC>maxCC ||
                (allSame != 0 && sCC == maxCC) ||
                collIter_eos(source)) {
@ -2910,6 +2710,9 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
                    goBackOne(source);  // back up the source string by one,
                                        //  because  the character we just looked at was
                                        //  not part of the contraction.   */
+                    if(U_IS_SUPPLEMENTARY(miss)) {
+                      goBackOne(source);
+                    }
                    CE = *(coll->contractionCEs +
                        (ContractionStart - coll->contractionIndex));
            } else {
@ -2921,9 +2724,13 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
                /* find the next character if schar is not a base character
                    and we are not yet at the end of the string */
                tempchar = getNextNormalizedChar(source);
+                // probably need another supplementary thingie here
                goBackOne(source);
                if (i_getCombiningClass(tempchar, coll) == 0) {
                    goBackOne(source);
+                    if(U_IS_SUPPLEMENTARY(miss)) {
+                      goBackOne(source);
+                    }
                    /* Spit out the last char of the string, wasn't tasty enough */
                    CE = *(coll->contractionCEs +
                        (ContractionStart - coll->contractionIndex));
@ -3217,20 +3024,6 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
          CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
          CE = *CEOffset++;
          break;
-#if 0
-          CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
-          size = getExpansionCount(CE);
-          CE = *CEOffset++;
-          if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
-            for(i = 1; i<size; i++) {
-              *(source->CEpos++) = *CEOffset++;
-            }
-          } else { /* else, we do */
-            while(*CEOffset != 0) {
-              *(source->CEpos++) = *CEOffset++;
-            }
-          }
-#endif
      }
      return CE;
      }
@ -3395,78 +3188,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
      /* if you have encountered it here, it means that a */
      /* broken sequence was encountered and this is an error */
      return 0;
-    case THAI_TAG:
-      if  ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
-            source->string == source->pos        || /* At start of string.|| */
-            /* previous char not Thai prevowel */
-            /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
-            UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE)
-            //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
-      {
-          /* Treat Thai as a length one expansion */
-          /* find the offset to expansion table */
-          CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE);
-          CE = *CEOffset ++;
-      }
-      else
-      {
-          /*
-          Move the prevowel and the following base Consonant into the
-          normalization buffer with their order swapped
-          */
-          UChar32 cp = (UChar32)peekCharacter(source, 0);
-          UBool reorder = TRUE;
-
-          int32_t decompLen = unorm_getDecomposition(cp, FALSE, source->writableBuffer, UCOL_WRITABLE_BUFFER_SIZE-1);
-          if(decompLen < 0) {
-            decompLen = -decompLen; // there was no decomposition
-          } else { // we need to check if we will hit a contraction trigger because of decomposition
-            int32_t i = decompLen;
-            for(i = 0; i < decompLen; i++) {
-              if(ucol_contractionEndCP(source->writableBuffer[i], coll)) {
-                reorder = FALSE;
-              }
-            }
-          }
-
-          UChar *tempbuffer = source->writableBuffer +
-                              (source->writableBufSize - 1);
-          uprv_memcpy(tempbuffer-decompLen + 1, source->writableBuffer, sizeof(UChar)*decompLen);
-          if(reorder) {
-            *(tempbuffer - decompLen) = *(tempbuffer - decompLen + 1);
-            *(tempbuffer - decompLen + 1)     = peekCharacter(source, -1);
-          } else {
-            *(tempbuffer - decompLen) = peekCharacter(source, -1);
-          }
-          *(tempbuffer - decompLen - 1) = 0;
-
-
-/*
-          UChar *tempbuffer = source->writableBuffer +
-                              (source->writableBufSize - 1);
-          *(tempbuffer - 2) = 0;
-          *(tempbuffer - 1) = peekCharacter(source, 0);
-          *(tempbuffer)     = peekCharacter(source, -1);
-*/
-          /*
-          Indicate where to continue in main input string after exhausting
-          the writableBuffer
-          */
-          if (source->pos - 1 == source->string) {
-              source->fcdPosition = NULL;
-          } else {
-            source->fcdPosition       = source->pos-2;
-          }
-
-          source->pos               = tempbuffer+1; // we're doing predecrement, right?
-          source->origFlags         = source->flags;
-          source->flags            |= UCOL_ITER_INNORMBUF;
-          source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
-
-          //CE = UCOL_IGNORABLE;
-          return(UCOL_IGNORABLE);
-      }
-      break;
    case SPEC_PROC_TAG:
      {
        // Special processing is getting a CE that is preceded by a certain prefix
@ -3513,7 +3234,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
              // it's easy for BMP code points
              if(isZeroCE == 0) {
                continue;
-              } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
+              } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
                // for supplementary code points, we have to check the next one
                // situations where we are going to ignore
                // 1. beginning of the string: schar is a lone surrogate
@ -3522,7 +3243,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                //    that is explicitly set to zero.
                if (!collIter_bos(source)) {
                  UChar lead;
-                  if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
+                  if(U16_IS_LEAD(lead = getPrevNormalizedChar(source))) {
                    isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
                    if(getCETag(isZeroCE) == SURROGATE_TAG) {
                      uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
@ -3581,9 +3302,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
        *(UCharOffset --) = 0;
        noChars = 0;
        // have to swap thai characters
-        while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) {
-          // we might have ended here after trying to reorder Thai, but seeing that there are unsafe points
-          // in the backward processing
+        while (ucol_unsafeCP(schar, coll)) {
            *(UCharOffset) = schar;
            noChars++;
            UCharOffset --;
@ -3911,33 +3630,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
          CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
          CE = *(CEOffset++);
          break;
-#if 0
-        /* find the offset to expansion table */
-          CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
-          size     = getExpansionCount(CE);
-          if (size != 0) {
-            /*
-            if there are less than 16 elements in expansion, we don't terminate
-            */
-            uint32_t count;
-            for (count = 0; count < size; count++) {
-              *(source->CEpos ++) = *CEOffset++;
-            }
-          }
-          else {
-            /* else, we do */
-            while (*CEOffset != 0) {
-              *(source->CEpos ++) = *CEOffset ++;
-            }
-          }
-          source->toReturn = source->CEpos - 1;
-          // in case of one element expansion, we
-          // want to immediately return CEpos
-          if(source->toReturn == source->CEs) {
-            source->CEpos = source->CEs;
-          }
-          return *(source->toReturn);
-#endif
      }
      }
    case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
@ -4044,7 +3736,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
      prevChar = *prev;

      /* Handles Han and Supplementary characters here.*/
-      if (UTF_IS_FIRST_SURROGATE(prevChar)) {
+      if (U16_IS_LEAD(prevChar)) {
        cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
        source->pos = prev;
      } else {
@ -6456,6 +6148,7 @@ saveState:
      } else {
        state[0] = iterState;
        iterSkips++;
+        iterSkips += s.consumedChars;
      }
    }
    // Store the number of elements processed. On CE levels, this is
@ -8325,9 +8018,6 @@ ucol_strcollIter( const UCollator    *coll,

  while((sChar = sColl.iterator->next(sColl.iterator)) ==
    (tChar = tColl.iterator->next(tColl.iterator))) {
-    if(UCOL_ISTHAIPREVOWEL(sChar)) {
-      break;
-    }
    if(sChar == U_SENTINEL) {
      result = UCOL_EQUAL;
      goto end_compare;
@ -8422,9 +8112,6 @@ ucol_strcoll( const UCollator    *coll,
            if ( *pSrc != *pTarg || *pSrc == 0) {
                break;
            }
-            if(UCOL_ISTHAIPREVOWEL(*pSrc)) {
-              break;
-            }
            pSrc++;
            pTarg++;
        }
@ -8458,9 +8145,6 @@ ucol_strcoll( const UCollator    *coll,
                if (*pSrc != *pTarg) {
                    break;
                }
-                if(UCOL_ISTHAIPREVOWEL(*pSrc)) { // they are the same here, so any will do
-                    break;
-                }
                pSrc++;
                pTarg++;
            }
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@ -67,29 +67,73 @@ isAcceptableInvUCA(void * /*context*/,
 }
 U_CDECL_END

+/*
+ * Compares two CEs, each given as a (lead, continuation) pair, level by level: primary
+ * (upper 16 bits of each word), then secondary (bits 8-15), then tertiary (bits 0-7).
+ * source1/target1 count only when isContinuation() accepts them; otherwise they are taken as 0.
+ * Returns 0 if both pairs are identical, -1 if source sorts before target, 1 otherwise.
+ */
+static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
+  uint32_t s1 = source0, s2, t1 = target0, t2;
+  if(isContinuation(source1)) {
+    s2 = source1;
+  } else {
+    s2 = 0; // second source word is not a continuation: leave it out of the comparison
+  }
+  if(isContinuation(target1)) {
+    t2 = target1;
+  } else {
+    t2 = 0; // second target word is not a continuation: leave it out of the comparison
+  }
+  
+  uint32_t s = 0, t = 0;
+  if(s1 == t1 && s2 == t2) { // identical pairs: no level can differ
+    return 0;
+  }
+  s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); // primary: lead word's upper half above the continuation's upper half
+  t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 
+  if(s < t) {
+    return -1;
+  } else if(s > t) {
+    return 1;
+  } else {
+    s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; // secondary: lead byte (bits 8-15) above continuation byte (bits 0-7)
+    t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
+    if(s < t) {
+      return -1;
+    } else if(s > t) {
+      return 1;
+    } else {
+      s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); // tertiary: lead low byte above continuation low byte
+      t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
+      if(s < t) {
+        return -1;
+      } else { // the three masks cover all 32 bits, so s == t here would mean identical pairs (handled above)
+        return 1;
+      }
+    }
+  }
+}
+
 static
 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
  uint32_t bottom = 0, top = src->invUCA->tableSize;
  uint32_t i = 0;
  uint32_t first = 0, second = 0;
  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
+  int32_t res = 0;

  while(bottom < top-1) {
    i = (top+bottom)/2;
    first = *(CETable+3*i);
    second = *(CETable+3*i+1);
-    if(first > CE) {
+    res = compareCEs(first, second, CE, SecondCE);
+    if(res > 0) {
      top = i;
-    } else if(first < CE) {
+    } else if(res < 0) {
      bottom = i;
    } else {
-        if(second > SecondCE) {
-          top = i;
-        } else if(second < SecondCE) {
-          bottom = i;
-        } else {
-          break;
-        }
+      break;
    }
  }

@ -862,13 +906,6 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
      el.cSize = (tok->source >> 24); 
      uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    }
-
-    if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
-      el.isThai = TRUE;
-    } else {
-      el.isThai = FALSE;
-    }
-
    if(src->UCA != NULL) {
      for(i = 0; i<el.cSize; i++) {
        if(UCOL_ISJAMO(el.cPoints[i])) {
@ -877,44 +914,12 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
      }
    }

-#if 0
-    // we do case bits in doCE now, since we will mess up expansions otherwise.
-    // Case bits handling 
-    el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
-    if(el.cSize > 1) {
-      // Do it manually
-      el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status);
-    } else {
-      // Copy it from the UCA
-      uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
-      el.CEs[0] |= (caseCE & 0xC0);
-    }
-#endif
-
    /* and then, add it */
 #if UCOL_DEBUG==2
    fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 #endif
    uprv_uca_addAnElement(t, &el, status);

-#if 0
-    if(el.cSize > 1) { // this is a contraction, we should check whether a composed form should also be included
-      UChar composed[256];
-      uint32_t compLen = unorm_normalize(el.cPoints, el.cSize, UNORM_NFC, 0, composed, 256, status);;
-
-      if(compLen != el.cSize || uprv_memcmp(composed, el.cPoints, el.cSize*sizeof(UChar))) {
-        // composed form of a contraction is different than the decomposed form!
-        // do it!
-#ifdef UCOL_DEBUG
-        fprintf(stderr, "Adding composed for %04X->%04X\n", *element->cPoints, *composed);
-#endif
-        el.cSize = compLen;
-        uprv_memcpy(el.cPoints, composed, el.cSize*sizeof(UChar));
-        uprv_uca_addAnElement(t, &el, status);
-      }
-    }
-#endif
-
 #if UCOL_DEBUG_DUPLICATES
    if(*status != U_ZERO_ERROR) {
      fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
--- a/icu4c/source/i18n/ucol_bld.h
+++ b/icu4c/source/i18n/ucol_bld.h
@ -28,6 +28,7 @@
 #include "unicode/utypes.h"

 #if !UCONFIG_NO_COLLATION
+#if !UCONFIG_NO_COLLATION_BUILDER

 #include "ucol_imp.h"
 #include "ucol_tok.h"
@ -55,6 +56,7 @@ typedef struct {
  uint32_t fHigh; /*forbidden High */
 } ucolCEGenerator;

+#endif /* #if !UCONFIG_NO_COLLATION_BUILDER */
 #endif /* #if !UCONFIG_NO_COLLATION */

 #endif
--- a/icu4c/source/i18n/ucol_elm.cpp
+++ b/icu4c/source/i18n/ucol_elm.cpp
@ -281,6 +281,7 @@ uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
    r->maxExpansions->position = t->maxExpansions->position;
    if(t->maxExpansions->endExpansionCE != NULL) {
      r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size);
+      uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size);
      /* test for NULL */
      if (r->maxExpansions->endExpansionCE == NULL) {
          *status = U_MEMORY_ALLOCATION_ERROR;
@ -292,6 +293,7 @@ uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
    }
    if(t->maxExpansions->expansionCESize != NULL) {
      r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size);
+      uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size);
      /* test for NULL */
      if (r->maxExpansions->expansionCESize == NULL) {
          *status = U_MEMORY_ALLOCATION_ERROR;
@ -1016,14 +1018,7 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
  element->mapCE = 0; // clear mapCE so that we can catch expansions

  if(element->noOfCEs == 1) {
-    if(element->isThai == FALSE) {
-          element->mapCE = element->CEs[0];      
-    } else { /* add thai - totally bad here */
-      expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (THAI_TAG<<UCOL_TAG_SHIFT) 
-        | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 
-        | 0x1);
-      element->mapCE = expansion;
-    }
+    element->mapCE = element->CEs[0];      
  } else {     
    /* ICU 2.1 long primaries */
    /* unfortunately, it looks like we have to look for a long primary here */
@ -1425,15 +1420,15 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {

    /* copy max expansion table */
    myData->endExpansionCE      = tableOffset;
-    myData->endExpansionCECount = maxexpansion->position;
+    myData->endExpansionCECount = maxexpansion->position - 1;
    /* not copying the first element which is a dummy */
    uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1, 
-                maxexpansion->position * sizeof(uint32_t));
-    tableOffset += (uint32_t)(paddedsize(maxexpansion->position * sizeof(uint32_t)));
+                (maxexpansion->position - 1) * sizeof(uint32_t));
+    tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t)));
    myData->expansionCESize = tableOffset;
    uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1, 
-                maxexpansion->position * sizeof(uint8_t));
-    tableOffset += (uint32_t)(paddedsize(maxexpansion->position * sizeof(uint8_t)));
+                (maxexpansion->position - 1) * sizeof(uint8_t));
+    tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t)));

    /* Unsafe chars table.  Finish it off, then copy it. */
    uprv_uca_unsafeCPAddCCNZ(t, status);
@ -1546,12 +1541,6 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
            // Since unsafeCPSet is static in ucol_elm, we are going
            // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
          }
-          if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
-            el.isThai = TRUE;
-          } else {
-            el.isThai = FALSE;
-          }
-
          uprv_uca_addAnElement(t, &el, status);
        }
      }
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@ -279,6 +279,9 @@ typedef struct collIterate {
  uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */
  UChar stackWritableBuffer[UCOL_WRITABLE_BUFFER_SIZE]; /* A writable buffer. */
  UCharIterator *iterator;
+  uint32_t consumedChars; /* number of extra consumed chars in a contraction */
+                          /* used in conjunction with iterator state for partial */
+                          /* sortkeys */
  /*int32_t iteratorIndex;*/
 } collIterate;

@ -295,6 +298,7 @@ struct collIterateState {
    uint8_t   origFlags;
    uint32_t   iteratorIndex;
    int32_t    iteratorMove;
+    uint32_t consumedChars;
 };

 U_CAPI void U_EXPORT2 
@ -558,7 +562,7 @@ enum {
    UCOL_BYTE_FIRST_TAILORED = 0x04,
    UCOL_BYTE_COMMON = 0x05,
    UCOL_BYTE_FIRST_UCA = UCOL_BYTE_COMMON,
-    UCOL_CODAN_PLACEHOLDER = 0x24,
+    UCOL_CODAN_PLACEHOLDER = 0x26,
    UCOL_BYTE_LAST_LATIN_PRIMARY = 0x4C,
    UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x4D,
    UCOL_BYTE_UNSHIFTED_MAX = 0xFF