ICU-159 Thai collation

X-SVN-Rev: 426
1999-12-16 01:41:19 +00:00 · 1999-12-16 01:41:19 +00:00 · d5831470e3
commit d5831470e3
parent f3be55589b
9 changed files with 18811 additions and 272 deletions
--- a/icu4c/source/i18n/coleitr.cpp
+++ b/icu4c/source/i18n/coleitr.cpp
@ -18,6 +18,7 @@
 //
 //  6/23/97     helena      Adding comments to make code more readable.
 // 08/03/98     erm         Synched with 1.2 version of CollationElementIterator.java
+// 12/10/99      aliu          Ported Thai collation support from Java.
 //=============================================================================

 #include "sortkey.h"
@ -40,8 +41,9 @@ int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;
 CollationElementIterator::CollationElementIterator()
 : expIndex(0),
  text(0),
-  swapOrder(0),
  bufferAlias(0),
+  ownBuffer(new VectorOfInt(2)),  // owned two-element scratch vector; deleted in the destructor
+  reorderBuffer(0),               // created on demand by makeReorderedBuffer()
  orderAlias(0)
 {
 }
@ -52,7 +54,8 @@ CollationElementIterator::CollationElementIterator(const RuleBasedCollator* orde
 : expIndex(0),
  text(0),
  bufferAlias(0),
-  swapOrder(0),
+  ownBuffer(new VectorOfInt(2)),
+  reorderBuffer(0),
  orderAlias(order)
 {
 }
@ -63,9 +66,10 @@ CollationElementIterator::CollationElementIterator( const UnicodeString& sourceT
                                                    const RuleBasedCollator* order,
                                                    UErrorCode& status) 
 : expIndex(0), 
-  swapOrder(0),
  text(NULL),
  bufferAlias(NULL),
+  ownBuffer(new VectorOfInt(2)),
+  reorderBuffer(0),
  orderAlias(order)
 {
    if (U_FAILURE(status)) {
@ -99,9 +103,10 @@ CollationElementIterator::CollationElementIterator( const CharacterIterator& sou
                                                    const RuleBasedCollator* order,
                                                    UErrorCode& status) 
 : expIndex(0), 
-  swapOrder(0),
  text(NULL),
  bufferAlias(NULL),
+  ownBuffer(new VectorOfInt(2)),
+  reorderBuffer(0),
  orderAlias(order)
 {
    if (U_FAILURE(status)) {
@ -131,11 +136,11 @@ CollationElementIterator::CollationElementIterator( const CharacterIterator& sou
 }

 CollationElementIterator::CollationElementIterator(const    CollationElementIterator& other)
-    : expIndex(other.expIndex), text(0), swapOrder(other.swapOrder)
+    : expIndex(other.expIndex), text(0),
+      ownBuffer(new VectorOfInt(2)),  // must exist before operator= copies other's contents into it
+      reorderBuffer(0)                // operator= allocates it only when other is using its own
 {
-    text = (Normalizer*) other.text->clone();
-    bufferAlias = other.bufferAlias;
-    orderAlias = other.orderAlias;
+    *this = other;  // operator= clones text and sets bufferAlias and orderAlias
 }

 const   CollationElementIterator&
@ -144,12 +149,24 @@ CollationElementIterator::operator=(const   CollationElementIterator& other)
    if (this != &other)
    {
        expIndex = other.expIndex;
-        swapOrder = other.swapOrder;

        delete text;
        text = (Normalizer*)other.text->clone();

-        bufferAlias = other.bufferAlias;
+        if (other.bufferAlias == other.ownBuffer) {
+            *ownBuffer = *other.ownBuffer;
+            bufferAlias = ownBuffer;
+        } else if (other.bufferAlias != NULL &&
+                   other.bufferAlias == other.reorderBuffer) {
+            if (reorderBuffer == NULL) {
+                reorderBuffer = new VectorOfInt(*other.reorderBuffer);
+            } else {
+                *reorderBuffer = *other.reorderBuffer;
+            }
+            bufferAlias = reorderBuffer;
+        } else {
+            bufferAlias = other.bufferAlias;
+        }
        orderAlias = other.orderAlias;
    }

@ -162,6 +179,8 @@ CollationElementIterator::~CollationElementIterator()
    text = NULL;
    bufferAlias = NULL;
    orderAlias = NULL;
+    delete ownBuffer;
+    delete reorderBuffer;
 }

 bool_t
@ -177,12 +196,8 @@ CollationElementIterator::operator==(const CollationElementIterator& that) const
        return FALSE;
    }

-    if (swapOrder != that.swapOrder)
-    {
-        return FALSE;
-    }
-
-    if (*bufferAlias != *(that.bufferAlias))
+    if (((bufferAlias == NULL) != (that.bufferAlias == NULL)) ||
+        (bufferAlias != NULL && *bufferAlias != *(that.bufferAlias)))
    {
        return FALSE;
    }
@ -220,7 +235,6 @@ CollationElementIterator::reset()

  bufferAlias = NULL;
  expIndex = 0;
-  swapOrder = 0;
 }

 // Sets the source to the new source string.
@ -234,8 +248,6 @@ CollationElementIterator::setText(const UnicodeString&  source,
    }

    bufferAlias = 0;
-    swapOrder = 0;
-    expIndex = 0;

    if (text == NULL)
    {
@ -258,8 +270,6 @@ CollationElementIterator::setText(CharacterIterator&  source,
    }

    bufferAlias = 0;
-    swapOrder = 0;
-    expIndex = 0;

    if (text == NULL) {
        text = new Normalizer(source, orderAlias->getDecomposition());
@ -304,22 +314,8 @@ CollationElementIterator::next(UErrorCode& status)
        else
        {
            bufferAlias = NULL;
-            expIndex = 0;
        }
    }
-    else if (swapOrder != 0)
-    {
-        // If we find a character with no order, we return the marking
-        // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character 
-        // itself shifted left 16 bits as orders.  At this point, the
-        // UNMAPPEDCHARVALUE flag has already been returned by the code
-        // below, so just return the shifted character here.
-        int32_t order = swapOrder << 16;
-
-        swapOrder = 0;
-
-        return order;
-    }

    // Gets the next character from the string using decomposition iterator.
    UChar ch = text->current();
@ -345,20 +341,36 @@ CollationElementIterator::next(UErrorCode& status)
        // Returned an "unmapped" flag and save the character so it can be 
        // returned next time this method is called.
        if (ch == 0x0000) return ch;
-        swapOrder = ch;  // \u0000 is not valid in C++'s UnicodeString
-        return UNMAPPEDCHARVALUE;
+        // \u0000 is not valid in C++'s UnicodeString
+        ownBuffer->at(0) = UNMAPPEDCHARVALUE;
+        ownBuffer->at(1) = ch << 16;
+        bufferAlias = ownBuffer;
    }
-    
-    if (value >= RuleBasedCollator::CONTRACTCHARINDEX)
-    {
-        value = nextContractChar(ch, status);
+    else {  // mapped character: resolve contraction/expansion, then apply Thai reordering
+        if (value >= RuleBasedCollator::CONTRACTCHARINDEX) {
+            value = nextContractChar(ch, status);
+        }
+        if (value >= RuleBasedCollator::EXPANDCHARINDEX) {
+            bufferAlias = orderAlias->getExpandValueList(value);
+        }
+        // A Thai pre-vowel followed by a base consonant is returned consonant-first.
+        if (isThaiPreVowel(ch)) {
+            UChar consonant = text->next();
+            if (isThaiBaseConsonant(consonant)) {
+                // Merge both characters' collation elements into a single buffer.
+                bufferAlias = makeReorderedBuffer(consonant, value, bufferAlias,
+                                                  TRUE, status);
+                // NOTE(review): RuleBasedCollator::getStrengthOrder peeks with current() before advancing; confirm text->next() above yields the char right after ch.
+            }
+            else {
+                text->previous();  // not a base consonant: undo the look-ahead
+            }
+        }
+    }

-    if (value >= RuleBasedCollator::EXPANDCHARINDEX)
-    {
-        bufferAlias = orderAlias->getExpandValueList(value);
-        expIndex = 0;
-        value = bufferAlias->at(expIndex++);
+    if (bufferAlias != NULL) {
+        expIndex = 1;
+        value = bufferAlias->at(0);
    }

    return strengthOrder(value);
@ -388,14 +400,6 @@ CollationElementIterator::previous(UErrorCode& status)
        }

        bufferAlias = NULL;
-        expIndex = 0;
-    }
-    else if (swapOrder != 0)
-    {
-        int32_t order = swapOrder << 16;
-
-        swapOrder = 0;
-        return order;
    }

    UChar ch = text->previous();
@ -411,20 +415,34 @@ CollationElementIterator::previous(UErrorCode& status)
    if (value == RuleBasedCollator::UNMAPPED)
    {
        if (ch == 0x0000) return ch;
-        swapOrder = UNMAPPEDCHARVALUE;
-        return ch;
+        ownBuffer->at(0) = UNMAPPEDCHARVALUE;
+        ownBuffer->at(1) = ch << 16;
+        bufferAlias = ownBuffer;
    }
-    
-    if (value >= RuleBasedCollator::CONTRACTCHARINDEX)
-    {
-        value = prevContractChar(ch, status);
+    else {
+        if (value >= RuleBasedCollator::CONTRACTCHARINDEX) {
+            value = prevContractChar(ch, status);
+        }
+        if (value >= RuleBasedCollator::EXPANDCHARINDEX) {
+            bufferAlias = orderAlias->getExpandValueList(value);
+        }
+
+        if (isThaiBaseConsonant(ch)) {
+
+            UChar vowel = text->previous();
+            if (isThaiPreVowel(vowel)) {
+                bufferAlias = makeReorderedBuffer(vowel, value, bufferAlias,
+                                                  FALSE, status);
+            }
+            else {
+                text->next();
+            }
+        }
    }

-    if (value >= RuleBasedCollator::EXPANDCHARINDEX)
-    {
-        bufferAlias = orderAlias->getExpandValueList(value);
-        expIndex = bufferAlias->size();
-        value = bufferAlias->at(--expIndex);
+    if (bufferAlias != NULL) {
+        expIndex = bufferAlias->size()-1;
+        value = bufferAlias->at(expIndex);
    }

    return strengthOrder(value);
@ -468,15 +486,12 @@ CollationElementIterator::setOffset(UTextOffset newOffset,
    }

    bufferAlias = NULL;
-    expIndex = 0;
-    swapOrder = 0;
 }

 //============================================================
 // privates
 //============================================================

-
 /**
 * Get the ordering priority of the next contracting character in the
 * string.
@ -566,3 +581,92 @@ int32_t CollationElementIterator::prevContractChar(UChar ch,

    return order;
 }
+
+/**
+ * Build a buffer holding the collation elements of two characters, with
+ * colFirst's elements placed before the other character's when iterating
+ * forward (the two are swapped when forward is FALSE).  Presumably the
+ * other character precedes colFirst in logical order (otherwise you
+ * wouldn't need this method, would you?).
+ * The other character's value(s) must already have been computed.  If it
+ * has a single element, that element is passed as lastValue and
+ * lastExpansion is NULL.  If it has an expansion, the expansion is passed
+ * in lastExpansion and lastValue is ignored.
+ * This method may return ownBuffer or reorderBuffer as its value, so
+ * neither had better be in use anywhere else.
+ */
+VectorOfInt* CollationElementIterator::makeReorderedBuffer(UChar colFirst,
+                                                           int32_t lastValue,
+                                                           VectorOfInt* lastExpansion,
+                                                           bool_t forward,
+                                                           UErrorCode& status) {
+
+    VectorOfInt* result;
+
+    int32_t firstValue = ucmp32_get(orderAlias->data->mapping, colFirst);
+    if (firstValue >= RuleBasedCollator::CONTRACTCHARINDEX) {
+        firstValue = forward ? nextContractChar(colFirst, status)
+                             : prevContractChar(colFirst, status);
+    }
+
+    VectorOfInt* firstExpansion = NULL;
+    if (firstValue >= RuleBasedCollator::EXPANDCHARINDEX) {
+        firstExpansion = orderAlias->getExpandValueList(firstValue);
+    }
+
+    if (!forward) {  // iterating backwards: exchange the roles of the two characters
+        int32_t temp1 = firstValue;
+        firstValue = lastValue;
+        lastValue = temp1;
+        VectorOfInt* temp2 = firstExpansion;
+        firstExpansion = lastExpansion;
+        lastExpansion = temp2;
+    }
+
+    if (firstExpansion == NULL && lastExpansion == NULL) {  // two single elements fit in ownBuffer
+        ownBuffer->at(0) = firstValue;
+        ownBuffer->at(1) = lastValue;
+        result = ownBuffer;
+    }
+    else {
+        int32_t firstLength = firstExpansion==NULL? 1 : firstExpansion->size();
+        int32_t lastLength = lastExpansion==NULL? 1 : lastExpansion->size();
+        if (reorderBuffer == NULL) {
+            reorderBuffer = new VectorOfInt(firstLength+lastLength);
+        }
+        // reorderBuffer gets reused for the life of this object.
+        // Since its internal buffer only grows, there is a danger
+        // that it will get really, really big, and never shrink.  If
+        // this is actually happening, insert code here to check for
+        // the condition.  Something along the lines of:
+        //! else if (reorderBuffer->size() >= 256 &&
+        //!          (firstLength+lastLength) < 16) {
+        //!     delete reorderBuffer;
+        //!     reorderBuffer = new VectorOfInt(firstLength+lastLength);
+        //! }
+        // The specific numeric values need to be determined
+        // empirically. [aliu]
+        result = reorderBuffer;
+
+        if (firstExpansion == NULL) {
+            result->atPut(0, firstValue);
+        }
+        else {
+            // Java original: System.arraycopy(firstExpansion, 0, result, 0, firstLength);
+            *result = *firstExpansion;
+        }
+
+        if (lastExpansion == NULL) {
+            result->atPut(firstLength, lastValue);
+        }
+        else {
+            // Java original: System.arraycopy(lastExpansion, 0, result, firstLength, lastLength);
+            for (int32_t i=0; i<lastLength; ++i) {
+                result->atPut(firstLength + i, lastExpansion->at(i));
+            }
+        }
+        result->setSize(firstLength+lastLength);
+    }
+
+    return result;
+}
--- a/icu4c/source/i18n/coleitr.h
+++ b/icu4c/source/i18n/coleitr.h
@ -18,6 +18,7 @@
 //
 //  8/18/97     helena      Added internal API documentation.
 // 08/03/98        erm            Synched with 1.2 version CollationElementIterator.java
+// 12/10/99      aliu          Ported Thai collation support from Java.
 //===============================================================================

 #ifndef COLEITR_H
@ -28,9 +29,7 @@
 #include "tblcoll.h"
 #include "chariter.h"

-
 class Normalizer;
-class VectorOfInt;
 class VectorOfPToContractElement;

 /**
@ -266,14 +265,42 @@ private:
     */
            int32_t             prevContractChar(   UChar     ch,
                                                    UErrorCode&  status);
+    
+    inline static bool_t isThaiPreVowel(UChar ch);
+                 
+    inline static bool_t isThaiBaseConsonant(UChar ch);
+                 
+    VectorOfInt* makeReorderedBuffer(UChar colFirst,
+                                     int32_t lastValue,
+                                     VectorOfInt* lastExpansion,
+                                     bool_t forward, UErrorCode& status);

    friend  class   RuleBasedCollator;
    static  const   int32_t         UNMAPPEDCHARVALUE;

            Normalizer*            text;       // owning 

-            VectorOfInt*        bufferAlias;
-            int32_t             swapOrder;  // for unmapped characters
+            VectorOfInt*        bufferAlias; // not owned
+
+    /**
+     * ownBuffer wants to be a subobject, not a pointer, but that
+     * means exposing the internal class VectorOfInt by #including the
+     * internal header "tables.h" -- not allowed!  ownBuffer is a
+     * fixed-size 2-element vector that is used to handle Thai
+     * collation; bufferAlias points to ownBuffer in some situations.
+     * [j159 - aliu]
+     */
+            VectorOfInt*        ownBuffer;
+
+    /**
+     * reorderBuffer is created on demand, so it doesn't want to be
+     * a subobject -- pointer is fine.  It is created and bufferAlias
+     * is set to it under certain conditions.  Once created, it is
+     * reused for the life of this object.  Because of the implementation
+     * of VectorOfInt, it grows monotonically.  [j159 - aliu]
+     */
+            VectorOfInt*        reorderBuffer;
+
            int32_t             expIndex;
            UnicodeString       key;
    const   RuleBasedCollator*  orderAlias;
@ -325,4 +352,19 @@ CollationElementIterator::isIgnorable(int32_t order)
    return (primaryOrder(order) == 0);
 }

+/**
+ * Determine if a character is a Thai pre-vowel, U+0E40..U+0E44 (a vowel
+ * whose collation elements are returned after its base consonant's).
+ */
+inline bool_t CollationElementIterator::isThaiPreVowel(UChar ch) {
+    return (ch >= (UChar)0x0E40) && (ch <= (UChar)0X0E44);
+}
+
+/**
+ * Determine if a character is a Thai base consonant, U+0E01..U+0E2E.
+ */
+inline bool_t CollationElementIterator::isThaiBaseConsonant(UChar ch) {
+    return (ch >= (UChar)0x0E01) && (ch <= (UChar)0x0E2E);
+}
+
 #endif
--- a/icu4c/source/i18n/tblcoll.cpp
+++ b/icu4c/source/i18n/tblcoll.cpp
@ -47,6 +47,8 @@
 * 11/02/99     helena      Collator performance enhancements.  Special case
 *                          for NO_OP situations. 
 * 11/17/99     srl         More performance enhancements. Inlined some internal functions.
+* 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
+*                          to implementation file.
 *******************************************************************************
 */

@ -134,6 +136,186 @@ const int16_t RuleBasedCollator::FILEID = 0x5443;                    // unique f
 const char* RuleBasedCollator::kFilenameSuffix = ".col";             // binary collation file extension
 char  RuleBasedCollator::fgClassID = 0; // Value is irrelevant       // class id

+////////////////////////////////////////////////////////////////////////
+// NormalizerIterator
+//
+// This class is essentially a duplicate of CollationElementIterator,
+// stripped down for speed.  It is declared here so we can incorporate
+// internal classes as subobjects, as well as just to hide it from the
+// public interface.
+////////////////////////////////////////////////////////////////////////
+
+/* Internal class for quick iteration over the text, either directly over a
+   raw UChar buffer (NO_OP mode) or through a Normalizer.  100% pure inline code
+*/
+class NormalizerIterator { 
+public:
+    Normalizer *cursor;         // owned; 0 while iterating the raw text buffer
+    VectorOfInt *bufferAlias;   // alias only: collation elements still pending, or 0
+    VectorOfInt *reorderBuffer; // owned; created on demand by RuleBasedCollator::makeReorderedBuffer
+    VectorOfInt ownBuffer;      // two-element scratch vector (constructed with size 2)
+    UChar*      text;           // aliased caller buffer used in NO_OP mode; never deleted here
+    int32_t     expIndex;       // index of the next element to read from bufferAlias
+    int32_t     textLen;        // number of UChars in text
+    UTextOffset  currentOffset; // current position within text
+    
+    NormalizerIterator(void);
+    NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode);
+    ~NormalizerIterator(void);
+    void setText(const UChar* source, int32_t length, UErrorCode& status);
+    void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status);
+    
+    UChar current(void) const;
+    UChar next(void);
+    void reset(void);
+};
+
+inline
+NormalizerIterator::NormalizerIterator() :  // empty iterator: no text and no Normalizer yet
+    cursor(0),
+    bufferAlias(0),
+    reorderBuffer(0),
+    ownBuffer(2),      // two slots, written by getStrengthOrder / makeReorderedBuffer
+    text(0),
+    expIndex(0),       // initializers are listed in member declaration order
+    textLen(0),
+    currentOffset(0)
+{
+}
+
+inline
+NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) :
+    cursor(0),
+    bufferAlias(0),
+    reorderBuffer(0),
+    ownBuffer(2),      // two slots, written by getStrengthOrder / makeReorderedBuffer
+    text(0),
+    expIndex(0),       // initializers are listed in member declaration order
+    textLen(0),
+    currentOffset(0)
+{
+    if (mode == Normalizer::NO_OP) {  // no normalization: iterate the caller's buffer in place
+        text = (UChar*)source;        // aliased, not copied; source must outlive this iterator
+        textLen = length;
+        currentOffset = 0;
+    } else {
+        cursor = new Normalizer(source, length, mode);  // owned; released in the destructor
+
+    }
+}
+
+inline
+NormalizerIterator::~NormalizerIterator() 
+{
+    if (cursor != 0) {  // the Normalizer, when present, was allocated by this class
+        delete cursor;
+        cursor = 0;
+    }
+    if (reorderBuffer != 0) {  // allocated on demand by RuleBasedCollator::makeReorderedBuffer
+        delete reorderBuffer;
+    }
+}
+
+inline
+void
+NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status)
+{
+    if (cursor == 0) {  // no Normalizer: alias the caller's buffer (not copied, not owned)
+        text = (UChar*)source;
+        textLen = length;
+        currentOffset = 0;
+
+    } else {  // normalizing mode: hand the new text to the existing Normalizer
+        text = 0;
+        cursor->setText(source, length, status);
+    }
+    bufferAlias = 0;  // drop any collation elements still pending from the previous text
+    currentOffset = 0;
+}
+
+/* You can only set mode after the comparison of two strings is completed.
+   Setting the mode in the middle of a comparison is not allowed.
+   */
+inline
+void
+
+NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status)
+{
+    if(mode != Normalizer::NO_OP)
+    {
+        /* A normalization mode is requested - a Normalizer object is needed */
+        if(cursor != NULL)
+        {
+            /* Just modify the existing cursor */
+            cursor->setMode(mode);
+            cursor->setText(source, length, status);
+        }
+        else
+        {
+            cursor = new Normalizer(source, length, mode);
+        }
+
+        /* Clear the raw-text state so current()/next() go through the cursor */
+        text = 0;
+        textLen = 0;
+    }
+    else 
+    {
+        /* NO_OP mode: iterate the caller's buffer directly */
+        if(cursor != NULL)
+        { /* get rid of the old cursor */
+            delete cursor; 
+            cursor = 0;
+        }
+
+        text = (UChar*)source;
+        textLen = length;
+    }
+    currentOffset = 0; /* always */
+    /* Discard any collation elements buffered for the previous text. */
+    bufferAlias = 0;
+}
+
+inline
+UChar
+NormalizerIterator::current(void) const
+{
+    if (text != 0) {  // raw-text mode: read straight from the aliased buffer
+        if(currentOffset >= textLen)
+        {
+            return Normalizer::DONE;  // at or past the end of the text
+        }
+        else
+        {
+            return text[currentOffset];
+        }
+    }
+
+    return cursor->current();  // normalizing mode: delegate to the Normalizer
+}
+
+
+inline
+UChar
+NormalizerIterator::next(void)
+{
+    if (text != 0) {  // raw-text mode: advance the offset, returning DONE at the end like current()
+        return ((currentOffset < textLen && ++currentOffset < textLen) ? text[currentOffset] : Normalizer::DONE);  // never reads text[textLen]
+    }
+    return cursor->next();  // normalizing mode: delegate to the Normalizer
+}
+
+inline
+void
+NormalizerIterator::reset(void)
+{
+    currentOffset = 0;  // rewind the raw-text position
+    if(cursor)
+    {
+        cursor->reset();  // and the Normalizer, when one is in use
+    }
+}
+
 //================ Some inline definitions of implementation functions........ ========

 inline int32_t
@ -171,29 +353,14 @@ RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor,
        // all we have to do here is return the next ordering in the buffer.  
        if (cursor->expIndex < cursor->bufferAlias->size())
        {
-	  //_L((stderr, "next from [%08X] from bufferAlias\n", this));
+            //_L((stderr, "next from [%08X] from bufferAlias\n", this));
            return strengthOrder(cursor->bufferAlias->at(cursor->expIndex++));
        }
        else
        {
            cursor->bufferAlias = NULL;
-            cursor->expIndex = 0;
        }
    }
-    else if (cursor->swapOrder != 0)
-    {
-        // If we find a character with no order, we return the marking
-        // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character 
-        // itself shifted left 16 bits as orders.  At this point, the
-        // UNMAPPEDCHARVALUE flag has already been returned by the code
-        // below, so just return the shifted character here.
-        int32_t order = cursor->swapOrder << 16;
-
-	  //_L((stderr, "next from [%08X] swaporder..\n", this));
-        cursor->swapOrder = 0;
-
-        return order;
-    }

    UChar ch = cursor->current();
    cursor->next();
@ -210,28 +377,110 @@ RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor,
    {
        // Returned an "unmapped" flag and save the character so it can be 
        // returned next time this method is called.
-        if (ch == 0x0000) return ch;
-        cursor->swapOrder = ch;  // \u0000 is not valid in C++'s UnicodeString
-        return CollationElementIterator::UNMAPPEDCHARVALUE;
-    }
-    
-    if (value >= CONTRACTCHARINDEX)
-    {
-        value = nextContractChar(cursor, ch, status);
+        if (ch == 0x0000) return ch; // \u0000 is not valid in C++'s UnicodeString
+        cursor->ownBuffer.at(0) = CollationElementIterator::UNMAPPEDCHARVALUE;
+        cursor->ownBuffer.at(1) = ch << 16;
+        cursor->bufferAlias = &cursor->ownBuffer;
+
+    } else {
+        
+        if (value >= CONTRACTCHARINDEX)
+        {
+            value = nextContractChar(cursor, ch, status);
+        }
+        
+        if (value >= EXPANDCHARINDEX) {
+            cursor->bufferAlias = getExpandValueList(value);
+        }
+        
+        if (CollationElementIterator::isThaiPreVowel(ch)) {
+            UChar consonant = cursor->current();
+            if (CollationElementIterator::isThaiBaseConsonant(consonant)) {
+                cursor->next();
+                cursor->bufferAlias = makeReorderedBuffer(cursor, consonant, value,
+                                                          cursor->bufferAlias);                
+            }
+        }
    }

-    if (value >= EXPANDCHARINDEX)
-    {
-        cursor->bufferAlias = getExpandValueList(value);
-        cursor->expIndex = 0;
-        value = cursor->bufferAlias->at(cursor->expIndex++);
+    if (cursor->bufferAlias != NULL) {
+        cursor->expIndex = 1;
+        value = cursor->bufferAlias->at(0);
    }

-    int32_t str = strengthOrder(value);   
-    
    return strengthOrder(value);
 }

+/**
+ * A clone of CollationElementIterator::makeReorderedBuffer, trimmed down to
+ * the forward case; the result is cursor's ownBuffer or its reorderBuffer.
+ */
+inline VectorOfInt*
+RuleBasedCollator::makeReorderedBuffer(NormalizerIterator* cursor,
+                                       UChar colFirst,
+                                       int32_t lastValue,
+                                       VectorOfInt* lastExpansion) const {
+    VectorOfInt* result;
+
+    int32_t firstValue = ucmp32_get(data->mapping, colFirst);
+    if (firstValue >= CONTRACTCHARINDEX) {
+        UErrorCode status = U_ZERO_ERROR;  // NOTE(review): local status; a failure here is not reported to the caller
+        firstValue = nextContractChar(cursor, colFirst, status);
+    }
+
+    VectorOfInt* firstExpansion = NULL;
+    if (firstValue >= EXPANDCHARINDEX) {
+        firstExpansion = getExpandValueList(firstValue);
+    }
+
+    if (firstExpansion == NULL && lastExpansion == NULL) {  // two single elements fit in ownBuffer
+        cursor->ownBuffer.at(0) = firstValue;
+        cursor->ownBuffer.at(1) = lastValue;
+        result = &cursor->ownBuffer;
+    }
+    else {
+        int32_t firstLength = firstExpansion==NULL? 1 : firstExpansion->size();
+        int32_t lastLength = lastExpansion==NULL? 1 : lastExpansion->size();
+        if (cursor->reorderBuffer == NULL) {
+            cursor->reorderBuffer = new VectorOfInt(firstLength+lastLength);
+        }
+        // reorderBuffer gets reused for the life of the cursor object.
+        // Since its internal buffer only grows, there is a danger
+        // that it will get really, really big, and never shrink.  If
+        // this is actually happening, insert code here to check for
+        // the condition.  Something along the lines of:
+        //! else if (cursor->reorderBuffer->size() >= 256 &&
+        //!          (firstLength+lastLength) < 16) {
+        //!     delete cursor->reorderBuffer;
+        //!     cursor->reorderBuffer = new VectorOfInt(firstLength+lastLength);
+        //! }
+        // The specific numeric values need to be determined
+        // empirically. [aliu]
+        result = cursor->reorderBuffer;
+
+        if (firstExpansion == NULL) {
+            result->atPut(0, firstValue);
+        }
+        else {
+            // Java original: System.arraycopy(firstExpansion, 0, result, 0, firstLength);
+            *result = *firstExpansion;
+        }
+
+        if (lastExpansion == NULL) {
+            result->atPut(firstLength, lastValue);
+        }
+        else {
+            // Java original: System.arraycopy(lastExpansion, 0, result, firstLength, lastLength);
+            for (int32_t i=0; i<lastLength; ++i) {
+                result->atPut(firstLength + i, lastExpansion->at(i));
+            }
+        }
+        result->setSize(firstLength+lastLength);
+    }
+
+    return result;
+}
+
 // ==================== End inlines ============================================


--- a/icu4c/source/i18n/tblcoll.h
+++ b/icu4c/source/i18n/tblcoll.h
@ -41,6 +41,8 @@
 *                          UnicodeString construction and special case for NO_OP.
 * 11/23/99     srl         More performance enhancements. Updates to NormalizerIterator
 *                          internal state management.
+* 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
+*                          to implementation file.
 *******************************************************************************
 */

@ -61,6 +63,7 @@ class VectorOfPToExpandTable;
 class MergeCollation;
 class CollationElementIterator;
 class RuleBasedCollatorStreamer;
+class NormalizerIterator; // see tblcoll.cpp

 /**
 * The RuleBasedCollator class provides the simple implementation of Collator,
@ -742,32 +745,12 @@ private:
                          const UnicodeString&    name,
                          const UnicodeString&    suffix);

-  /* Internal class for quick iteration over the text.
-     100% pure inline code
-   */
-  class NormalizerIterator { 
-  public:
-      Normalizer *cursor;
-      VectorOfInt *bufferAlias;
-      int32_t     swapOrder;
-      UChar*      text;
-      int32_t     expIndex;
-      int32_t     textLen;
-      UTextOffset  currentOffset;
-
-      NormalizerIterator(void);
-      NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode);
-      ~NormalizerIterator(void);
-      void setText(const UChar* source, int32_t length, UErrorCode& status);
-      void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status);
-
-      UChar current(void) const;
-      UChar next(void);
-      void reset(void);
-  };
-
  int32_t getStrengthOrder(NormalizerIterator* cursor, 
                                    UErrorCode status) const;
+  VectorOfInt* makeReorderedBuffer(NormalizerIterator* cursor,
+                                   UChar colFirst,
+                                   int32_t lastValue,
+                                   VectorOfInt* lastExpansion) const;
  int32_t strengthOrder(int32_t value) const ;
  int32_t nextContractChar(NormalizerIterator *cursor, 
                           UChar ch,
@ -824,151 +807,6 @@ private:
  TableCollationData* data;
 };

-inline
-RuleBasedCollator::NormalizerIterator::NormalizerIterator() :
-    cursor(0),
-    bufferAlias(0),
-    swapOrder(0),
-    text(0),
-    textLen(0),
-    currentOffset(0),
-    expIndex(0)
-{
-}
-
-inline
-RuleBasedCollator::NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) :
-    cursor(0),
-    bufferAlias(0),
-    swapOrder(0),
-    text(0),
-    textLen(0),
-    currentOffset(0),
-    expIndex(0)
-{
-    if (mode == Normalizer::NO_OP) {
-        text = (UChar*)source;
-        textLen = length;
-        currentOffset = 0;
-    } else {
-        cursor = new Normalizer(source, length, mode);
-
-    }
-}
-
-inline
-RuleBasedCollator::NormalizerIterator::~NormalizerIterator() 
-{
-    if (cursor != 0) {
-        delete cursor;
-        cursor = 0;
-    }
-}
-
-inline
-void
-RuleBasedCollator::NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status)
-{
-    if (cursor == 0) {
-        text = (UChar*)source;
-        textLen = length;
-        currentOffset = 0;
-
-    } else {
-        text = 0;
-        cursor->setText(source, length, status);
-    }
-    bufferAlias = 0;
-    swapOrder = 0;
-    expIndex = 0;
-    currentOffset = 0;
-}
-
-/* You can only set mode after the comparision of two strings is completed.
-   Setting the mode in the middle of a comparison is not allowed.
-   */
-inline
-void
-
-RuleBasedCollator::NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status)
-{
-    if(mode != Normalizer::NO_OP)
-    {
-        /* DO have a mode -  will need a normalizer object */
-        if(cursor != NULL)
-        {
-            /* Just modify the existing cursor */
-            cursor->setMode(mode);
-	    cursor->setText(source, length, status);
-        }
-        else
-	{
-	  cursor = new Normalizer(source, length, mode);
-	}
-
-        /* RESET the old data */
-        text = 0;
-        textLen = 0;
-    }
-    else 
-    {
-        /* NO_OP mode.. */
-        if(cursor != NULL)
-        { /* get rid of the old cursor */
-            delete cursor; 
-            cursor = 0;
-        }
-
-        text = (UChar*)source;
-        textLen = length;
-    }
-    currentOffset = 0; /* always */
-   
-    bufferAlias = 0;
-    swapOrder = 0;
-    expIndex = 0;
-}
-
-inline
-UChar
-RuleBasedCollator::NormalizerIterator::current(void) const
-{
-    if (text != 0) {
-      if(currentOffset >= textLen)
-	{
-	  return Normalizer::DONE;
-	}
-      else
-	{
-	  return text[currentOffset];
-	}
-    }
-
-    return cursor->current();
-}
-
-
-inline
-UChar
-RuleBasedCollator::NormalizerIterator::next(void)
-{
-    if (text != 0) {
-      return ((currentOffset < textLen) ? text[++currentOffset] : Normalizer::DONE);
-    }
-    return cursor->next();
-}
-
-inline
-void
-RuleBasedCollator::NormalizerIterator::reset(void)
-{
-  currentOffset = 0;
-  if(cursor)
-    {
-      cursor->reset();
-    }
-}
-
 inline bool_t
 RuleBasedCollator::operator!=(const Collator& other) const
 {
--- a/icu4c/source/test/intltest/intltest.dsp
+++ b/icu4c/source/test/intltest/intltest.dsp
@ -271,6 +271,10 @@ SOURCE=.\tfsmalls.cpp
 # End Source File
 # Begin Source File

+SOURCE=.\thcoll.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\tmsgfmt.cpp
 # End Source File
 # Begin Source File
@ -531,6 +535,10 @@ SOURCE=.\tfsmalls.h
 # End Source File
 # Begin Source File

+SOURCE=.\thcoll.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\tmsgfmt.h
 # End Source File
 # Begin Source File
--- a/icu4c/source/test/intltest/thcoll.cpp
+++ b/icu4c/source/test/intltest/thcoll.cpp
@ -0,0 +1,320 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   12/09/99    aliu        Ported from Java.
+**********************************************************************
+*/
+
+#include "thcoll.h"
+#include "coll.h"
+#include "sortkey.h"
+#include "cstring.h"
+#include "filestrm.h"
+
+/**
+ * The TestDictionary test expects a file of this name, with this
+ * encoding, to be present in the directory $ICU/source/test/testdata.
+ */
+#define TEST_FILE           "th18057.txt"
+#define TEST_FILE_ENCODING  "UTF8"
+
+/**
+ * This is the most failures we show in TestDictionary.  If this number
+ * is < 0, we show all failures.
+ */
+#define MAX_FAILURES_TO_SHOW 8
+
+// Expands to one case of the switch in runIndexedTest(): stores the
+// stringified test name in `name` and, when `exec` is true, logs a
+// header line and a blank line and then calls test().  The expansion
+// ends in its own `break;`, so uses are not followed by a semicolon.
+#define CASE(id,test)                 \
+    case id:                          \
+        name = #test;                 \
+        if (exec) {                   \
+            logln(#test "---");       \
+            logln((UnicodeString)""); \
+            test();                   \
+        }                             \
+        break;
+
+// Create a tertiary-strength collator for the th_TH locale.  If creation
+// fails, coll is left as 0 and the individual tests report the error.
+CollationThaiTest::CollationThaiTest() {
+    UErrorCode ec = U_ZERO_ERROR;
+    coll = Collator::createInstance(Locale("th", "TH", ""), ec);
+    if (coll == 0 || U_FAILURE(ec)) {
+        // Discard whatever was returned so the tests see a null collator.
+        delete coll;
+        coll = 0;
+        return;
+    }
+    coll->setStrength(Collator::TERTIARY);
+}
+
+// Frees the collator allocated by the constructor.  coll is 0 when
+// construction failed, which delete accepts.
+CollationThaiTest::~CollationThaiTest() {
+    delete coll;
+}
+
+/**
+ * Test dispatcher.  Sets name to the name of the test registered under
+ * index (or to "" when index matches no test) and, when exec is true,
+ * runs that test.  par is not used by this implementation.
+ */
+void CollationThaiTest::runIndexedTest(int32_t index, bool_t exec, char* &name,
+                                       char* par) {
+    switch (index) {
+        CASE(0,TestDictionary)
+        CASE(1,TestCornerCases)
+        default: name = ""; break;
+    }
+}
+
+/**
+ * Read the external dictionary file, which is already in proper
+ * sorted order, and confirm that the collator compares each line as
+ * preceding the following line.
+ *
+ * Lines beginning with '#' and blank lines are skipped.  At most
+ * MAX_FAILURES_TO_SHOW failures are reported in detail (all of them if
+ * that limit is negative); a summary follows if any comparison failed.
+ */
+void CollationThaiTest::TestDictionary() {
+    if (coll == 0) {
+        errln("Error: could not construct Thai collator");
+        return;
+    }
+
+    // Build the path of the dictionary file, making sure it fits in the
+    // buffer (which is reused below as the line buffer).
+    char buffer[1024];
+    const char* dir = IntlTest::getTestDirectory();
+    if ((int32_t)(icu_strlen(dir) + icu_strlen(TEST_FILE)) >= (int32_t)sizeof(buffer)) {
+        errln("Error: path of test file is too long");
+        return;
+    }
+    icu_strcpy(buffer, dir);
+    icu_strcat(buffer, TEST_FILE);
+
+    FileStream *in = T_FileStream_open(buffer, "r");
+    if (in == 0) {
+        errln((UnicodeString)"Error: could not open test file " + buffer);
+        return;        
+    }
+
+    //
+    // Loop through each word in the dictionary and compare it to the previous
+    // word.  They should be in sorted order.
+    //
+    UnicodeString lastWord;
+    int32_t line = 0;       // physical line number, used in messages
+    int32_t compared = 0;   // number of comparisons actually performed
+    int32_t failed = 0;
+    while (T_FileStream_readLine(in, buffer, sizeof(buffer)) != 0) {
+        UnicodeString word(buffer, TEST_FILE_ENCODING);
+        line++;
+
+        if (word.charAt(0) == '#') {
+            // Skip comments
+            continue;
+        }
+
+        // Trim line termination characters from the end
+        int32_t i = word.length()-1;
+        while (i>=0 &&
+               (word.charAt(i) == (UChar)13 ||
+                word.charAt(i) == (UChar)10)) {
+            --i;
+        }
+        word.truncate(i+1);
+
+        // Skip blank lines
+        if (word.length() == 0) {
+            continue;
+        }
+
+        if (lastWord.length() > 0) {
+            int32_t result = coll->compare(lastWord, word);
+            compared++;
+
+            if (result >= 0) {
+                failed++;
+                if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
+                    UnicodeString str;
+                    UnicodeString msg =
+                        UnicodeString("--------------------------------------------\n")
+                        + line
+                        + " compare(" + prettify(lastWord, str);
+                    msg += UnicodeString(", ")
+                        + prettify(word, str) + ") returned " + result
+                        + ", expected -1\n";
+                    UErrorCode status = U_ZERO_ERROR;
+                    CollationKey k1, k2;
+                    coll->getCollationKey(lastWord, k1, status);
+                    coll->getCollationKey(word, k2, status);
+                    if (U_FAILURE(status)) {
+                        errln((UnicodeString)"Fail: getCollationKey returned " + status);
+                        // Do not leak the stream on this early exit.
+                        T_FileStream_close(in);
+                        return;
+                    }
+                    msg.append("key1: ").append(prettify(k1, str)).append("\n");
+                    msg.append("key2: ").append(prettify(k2, str));
+                    errln(msg);
+                }
+            }
+        }
+        lastWord = word;
+    }
+
+    T_FileStream_close(in);
+
+    if (failed != 0) {
+        // A negative limit means every failure was shown, so the
+        // "too many" notice applies only to a non-negative limit.
+        if (MAX_FAILURES_TO_SHOW >= 0 && failed > MAX_FAILURES_TO_SHOW) {
+            errln((UnicodeString)"Too many failures; only the first " +
+                  MAX_FAILURES_TO_SHOW + " failures were shown");
+        }
+        errln((UnicodeString)"Summary: " + failed + " of " + compared +
+              " comparisons failed");
+    }
+}
+
+/**
+ * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
+ * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
+ *
+ * The table below is a flat list of triples (source, operator, target)
+ * that is handed to compareArray().
+ */
+void CollationThaiTest::TestCornerCases() {
+    const char* TESTS[] = {
+        // Shorter words precede longer
+        "\\u0e01",                               "<",    "\\u0e01\\u0e01",
+
+        // Tone marks are considered after letters (i.e. are primary ignorable)
+        "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e49\\u0e32",
+
+        // ditto for other over-marks
+        "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e4c",
+
+        // commonly used mark-in-context order.
+        // In effect, marks are sorted after each syllable.
+        "\\u0e01\\u0e32\\u0e01\\u0e49\\u0e32",   "<",    "\\u0e01\\u0e48\\u0e32\\u0e01\\u0e49\\u0e32",
+
+        // Hyphens and other punctuation follow whitespace but come before letters
+        "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32-",
+        "\\u0e01\\u0e32-",                       "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
+
+        // Doubler follows an identical word without the doubler
+        "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e46",
+        "\\u0e01\\u0e32\\u0e46",                 "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
+
+
+        // \\u0e45 after either \\u0e24 or \\u0e26 is treated as a single
+        // combining character, similar to "c < ch" in traditional Spanish.
+        // TODO: beef up this case
+        "\\u0e24\\u0e29\\u0e35",                 "<",    "\\u0e24\\u0e45\\u0e29\\u0e35",
+        "\\u0e26\\u0e29\\u0e35",                 "<",    "\\u0e26\\u0e45\\u0e29\\u0e35",
+
+        // Vowels reorder, should compare \\u0e2d and \\u0e34
+        "\\u0e40\\u0e01\\u0e2d",                 "<",    "\\u0e40\\u0e01\\u0e34",
+
+        // Tones are compared after the rest of the word (e.g. primary ignorable)
+        "\\u0e01\\u0e32\\u0e01\\u0e48\\u0e32",   "<",    "\\u0e01\\u0e49\\u0e32\\u0e01\\u0e32",
+
+        // Periods are ignored entirely
+        "\\u0e01.\\u0e01.",                      "<",    "\\u0e01\\u0e32",
+    };
+    // Number of array elements (three per test), not the number of tests.
+    const int32_t TESTS_length = sizeof(TESTS)/sizeof(TESTS[0]);
+
+    if (coll == 0) {
+        errln("Error: could not construct Thai collator");
+        return;
+    }
+    compareArray(*coll, TESTS, TESTS_length);
+}
+
+//------------------------------------------------------------------------
+// Internal utilities
+//------------------------------------------------------------------------
+
+/**
+ * Run a table of comparison tests against the given collator.  Each
+ * test first checks Collator::compare(); if that agrees with the
+ * expectation, the collation keys of the two strings are compared too.
+ *
+ * @param c            collator under test
+ * @param tests        flat array of triples: source string, operator
+ *                     ("<", ">" or "="), target string; the strings are
+ *                     passed through parseChars() before being compared
+ * @param testsLength  number of array elements (not triples) in tests
+ */
+void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
+                                     int32_t testsLength) {
+    UErrorCode status = U_ZERO_ERROR;
+    // Require a complete triple on every pass so that a malformed table
+    // can never cause a read past the end of the array.
+    for (int32_t i = 0; i + 2 < testsLength; i += 3) {
+
+        int32_t expect = 0;
+        switch (tests[i+1][0]) {
+        case '<': expect = -1; break;
+        case '>': expect = 1;  break;
+        case '=': expect = 0;  break;
+        default:
+            // expect = Integer.decode(tests[i+1]).intValue();
+            errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
+            return;
+        }
+
+        UnicodeString s1, s2;
+        parseChars(s1, tests[i]);
+        parseChars(s2, tests[i+2]);
+
+        int32_t result = c.compare(s1, s2);
+        if (sign(result) != sign(expect))
+        {
+            UnicodeString t1, t2;
+            errln(UnicodeString("") +
+                  i/3 + ": compare(" + prettify(s1, t1)
+                  + " , " + prettify(s2, t2)
+                  + ") got " + result + "; expected " + expect);
+
+            // Show the keys as a debugging aid.
+            CollationKey k1, k2;
+            c.getCollationKey(s1, k1, status);
+            c.getCollationKey(s2, k2, status);
+            if (U_FAILURE(status)) {
+                errln((UnicodeString)"Fail: getCollationKey returned " + status);
+                return;
+            }
+            errln((UnicodeString)"  key1: " + prettify(k1, t1) );
+            errln((UnicodeString)"  key2: " + prettify(k2, t2) );
+        }
+        else
+        {
+            // Collator.compare worked OK; now try the collation keys
+            CollationKey k1, k2;
+            c.getCollationKey(s1, k1, status);
+            c.getCollationKey(s2, k2, status);
+            if (U_FAILURE(status)) {
+                errln((UnicodeString)"Fail: getCollationKey returned " + status);
+                return;
+            }
+
+            result = k1.compareTo(k2);
+            if (sign(result) != sign(expect)) {
+                UnicodeString t1, t2;
+                errln(UnicodeString("") +
+                      i/3 + ": key(" + prettify(s1, t1)
+                      + ").compareTo(key(" + prettify(s2, t2)
+                      + ")) got " + result + "; expected " + expect);
+                
+                errln((UnicodeString)"  " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
+            }
+        }
+    }
+
+    // Report a truncated table instead of silently dropping its tail.
+    if (testsLength % 3 != 0) {
+        errln((UnicodeString)"Error: test array length " + testsLength +
+              " is not a multiple of 3");
+    }
+}
+
+// Map any integer to -1, 0 or 1 according to its sign.
+int8_t CollationThaiTest::sign(int32_t i) {
+    return (i < 0) ? -1 : ((i > 0) ? 1 : 0);
+}
+
+/**
+ * Replace the contents of result with the text of chars.  A backslash
+ * followed by 'u' or 'U' and four more characters is read as a
+ * four-digit hexadecimal escape and appended as one UChar (a character
+ * that is not a hex digit counts as 0); every other character is
+ * appended through a one-character C string.
+ */
+UnicodeString& CollationThaiTest::parseChars(UnicodeString& result,
+                                             const char* chars) {
+    result.remove();
+    int32_t n = icu_strlen(chars);
+    int32_t pos = 0;
+    while (pos < n) {
+        // An escape needs all six of its characters to be present.
+        bool_t escape = (pos+5)<n && chars[pos] == '\\' &&
+                        (chars[pos+1] == 'u' || chars[pos+1] == 'U');
+        if (!escape) {
+            char one[2] = { chars[pos], 0 };
+            result += one;
+            ++pos;
+            continue;
+        }
+
+        pos += 2; // step over the backslash and the 'u'
+        UChar value = 0;
+        for (int32_t k=0; k<4; ++k) {
+            int8_t nibble = chars[pos++];
+            if (nibble >= '0' && nibble <= '9') {
+                nibble -= '0';
+            } else if (nibble >= 'A' && nibble <= 'F') {
+                nibble -= 'A' - 10;
+            } else if (nibble >= 'a' && nibble <= 'f') {
+                nibble -= 'a' - 10;
+            } else {
+                nibble = 0; // illegal hex digit
+            }
+            value = (value << 4) | nibble;
+        }
+        result += value;
+    }
+    return result;
+}
--- a/icu4c/source/test/intltest/thcoll.h
+++ b/icu4c/source/test/intltest/thcoll.h
@ -0,0 +1,56 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   12/09/99    aliu        Ported from Java.
+**********************************************************************
+*/
+
+#ifndef THCOLL_H
+#define THCOLL_H
+
+#include "intltest.h"
+
+class Collator;
+class UnicodeString;
+
+/**
+ * Tests of the Thai collator: a dictionary-order test driven by an
+ * external word list, and a table of corner cases.
+ */
+class CollationThaiTest : public IntlTest {
+    Collator* coll; // Thai collator; owned, 0 if it could not be created
+
+public:
+
+    CollationThaiTest();
+
+    ~CollationThaiTest();
+
+    void runIndexedTest( int32_t index, bool_t exec, char* &name, char* par = NULL );
+    
+private:
+
+    // Declared but not implemented: a copy would leave two objects
+    // deleting the same collator.
+    CollationThaiTest(const CollationThaiTest& other);
+    CollationThaiTest& operator=(const CollationThaiTest& other);
+
+    /**
+     * Read the external dictionary file, which is already in proper
+     * sorted order, and confirm that the collator compares each line as
+     * preceding the following line.
+     */
+    void TestDictionary();
+    
+    /**
+     * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
+     * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
+     */
+    void TestCornerCases();
+    
+private:
+
+    /**
+     * Run a flat table of (source, operator, target) triples against the
+     * collator c; testsLength is the number of array elements.
+     */
+    void compareArray(const Collator& c, const char* tests[],
+                      int32_t testsLength);
+
+    /** Return -1, 0 or 1 according to the sign of i. */
+    int8_t sign(int32_t i);
+    
+    /**
+     * Set a UnicodeString corresponding to the given string.  Use
+     * UnicodeString and the default converter, unless we see the sequence
+     * "\\u", in which case we interpret the subsequent escape.
+     */
+    UnicodeString& parseChars(UnicodeString& result,
+                              const char* chars);
+};
+
+#endif
--- a/icu4c/source/test/intltest/tscoll.cpp
+++ b/icu4c/source/test/intltest/tscoll.cpp
@ -35,6 +35,7 @@
 #include "itercoll.h"
 //#include "capicoll.h"   // CollationCAPITest
 #include "tstnorm.h"
+#include "thcoll.h"

 void IntlTestCollator::runIndexedTest( int32_t index, bool_t exec, char* &name, char* par )
 {
@ -258,6 +259,15 @@ void IntlTestCollator::runIndexedTest( int32_t index, bool_t exec, char* &name,
        }
        break;

+    case 16:
+        name = "CollationThaiTest"; 
+        if (exec) {
+            logln("CollationThaiTest---"); logln("");
+            CollationThaiTest test;
+            callTest( test, par );
+        }
+        break;
+
    default: name = ""; break;
    }
 }
--- a/icu4c/source/test/testdata/th18057.txt
+++ b/icu4c/source/test/testdata/th18057.txt