From 974bfd35196f7e7bd7fe1928dddd258c9487525a Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Wed, 29 Aug 2001 23:57:15 +0000
Subject: [PATCH] ICU-1007 completely replace the old with the new
 implementation

X-SVN-Rev: 5614
---
 icu4c/source/common/normlzr.cpp       | 1063 +++++--------------------
 icu4c/source/common/unicode/normlzr.h |   96 +--
 2 files changed, 222 insertions(+), 937 deletions(-)

diff --git a/icu4c/source/common/normlzr.cpp b/icu4c/source/common/normlzr.cpp
index 7ec326b823..b0ded10182 100644
--- a/icu4c/source/common/normlzr.cpp
+++ b/icu4c/source/common/normlzr.cpp
@@ -15,99 +15,65 @@
 *                     useful in tbcoll and unorm.
 *                     Added quickcheck method and incorporated it into 
 *                     normalize()
+* 06/20/01+ Markus Scherer
+*                     total rewrite, implement all normalization in unorm.cpp
+*                     and turn Normalizer into a wrapper;
+*                     fix the very broken iteration API
 */
 
-#include "ucmp16.h"
-#include "dcmpdata.h"
-#include "compdata.h"
-
 #include "unicode/normlzr.h"
 #include "unicode/utypes.h"
 #include "unicode/unistr.h"
 #include "unicode/chariter.h"
 #include "unicode/schriter.h"
-#include "unicode/unicode.h"
-#include "mutex.h"
-
-/* ### TODO: new implementation */
+#include "unicode/uchriter.h"
 #include "unormimp.h"
 
-
-
-
-#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array))
-/**
-* Maximum initial buffer size.
-* Used in quickCheck to declare initial array.
-*/
-const uint32_t StackBufferLen = 1024;
-
-inline static void insert(UnicodeString& dest, 
-                          UTextOffset pos, 
-                          UChar ch)
-{
-    dest.replace(pos, 0, &ch, 1);
-}
-
 //-------------------------------------------------------------------------
 // Constructors and other boilerplate
 //-------------------------------------------------------------------------
 
 Normalizer::Normalizer(const UnicodeString& str, 
-                       EMode mode)
-{
-    init(new StringCharacterIterator(str), mode, 0);
-}
+                       EMode mode) :
+    fMode(mode), fOptions(0),
+    text(new StringCharacterIterator(str)), nextIndex(-1),
+    buffer(), bufferPos(0)
+{}
 
 Normalizer::Normalizer(const UnicodeString& str, 
                        EMode mode, 
-                       int32_t opt)
-{
-    init(new StringCharacterIterator(str), mode, opt);
-}
+                       int32_t options) :
+    fMode(mode), fOptions(options),
+    text(new StringCharacterIterator(str)), nextIndex(-1),
+    buffer(), bufferPos(0)
+{}
 
-Normalizer::Normalizer(const UChar* str, int32_t length, EMode mode) 
-{
-    init(new StringCharacterIterator(UnicodeString(str, length)), mode, 0);
-}
+Normalizer::Normalizer(const UChar *str, int32_t length, EMode mode) :
+    fMode(mode), fOptions(0),
+    text(new UCharCharacterIterator(str, length)), nextIndex(-1), // NOTE(review): presumably aliases str (the replaced code copied it into a UnicodeString) - confirm callers keep str alive
+    buffer(), bufferPos(0)
+{}
 
 Normalizer::Normalizer(const CharacterIterator& iter, 
-                       EMode mode)
-{
-    init(iter.clone(), mode, 0);
-}
+                       EMode mode) :
+    fMode(mode), fOptions(0),
+    text(iter.clone()), nextIndex(-1),
+    buffer(), bufferPos(0)
+{}
 
 Normalizer::Normalizer(const CharacterIterator& iter, 
                        EMode mode, 
-                       int32_t opt)
-{
-    init(iter.clone(), mode, opt);
-}
+                       int32_t options) :
+    fMode(mode), fOptions(options),
+    text(iter.clone()), nextIndex(-1),
+    buffer(), bufferPos(0)
+{}
 
-void Normalizer::init(CharacterIterator* adoptIter, 
-                      EMode mode, 
-                      int32_t options)
-{
-    bufferPos = 0;
-    bufferLimit = 0;
-    fOptions = options;
-    currentChar = DONE;
-    fMode = mode;
-    text = adoptIter;
-  
-    minDecomp = (int16_t)((fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT);
-}
-
-Normalizer::Normalizer(const Normalizer& copy)
-{
-    init(copy.text->clone(), copy.fMode, copy.fOptions);
-  
-    buffer      = copy.buffer;
-    bufferPos   = copy.bufferPos;
-    bufferLimit = copy.bufferLimit;
-    explodeBuf  = copy.explodeBuf;
-    currentChar = copy.currentChar;
-}
+Normalizer::Normalizer(const Normalizer &copy) :
+    fMode(copy.fMode), fOptions(copy.fOptions),
+    text(copy.text->clone()), nextIndex(copy.nextIndex),
+    buffer(copy.buffer), bufferPos(copy.bufferPos)
+{}
 
 Normalizer::~Normalizer()
 {
@@ -129,17 +95,19 @@ Normalizer::clone() const
  */
 int32_t Normalizer::hashCode() const
 {
-    return text->hashCode() + fMode + fOptions + bufferPos + bufferLimit;
+    return text->hashCode() + fMode + fOptions + buffer.hashCode() + bufferPos + nextIndex;
 }
     
 UBool Normalizer::operator==(const Normalizer& that) const
 {
-    return *text == *(that.text)
-        && currentChar == that.currentChar
-        && buffer == that.buffer
-        && explodeBuf == that.explodeBuf
-        && bufferPos == that.bufferPos
-        && bufferLimit == that.bufferLimit;
+    return
+        this==&that ||                 // same object: trivially equal
+        (fMode==that.fMode &&          // otherwise every piece of iteration state must match
+         fOptions==that.fOptions &&
+         *text==*(that.text) &&
+         buffer==that.buffer &&
+         bufferPos==that.bufferPos &&
+         nextIndex==that.nextIndex);
 }
 
 //-------------------------------------------------------------------------
@@ -152,7 +120,7 @@ Normalizer::normalize(const UnicodeString& source,
                       int32_t options,
                       UnicodeString& result, 
                       UErrorCode &status) {
-    if(source.isBogus()) {
+    if(source.isBogus() || U_FAILURE(status)) {
         result.setToBogus();
     } else {
         /* make sure that we do not operate on the same buffer in source and result */
@@ -180,34 +148,6 @@ Normalizer::quickCheck(const UnicodeString& source,
                             getUNormalizationMode(mode, status), &status);
 }
 
-//-------------------------------------------------------------------------
-// Inline functions for 64-bit bitmasks (array of 2 uint32_t)
-//-------------------------------------------------------------------------
-
-// Clear all bits of the mask
-inline void emptyBitmask64(uint32_t* mask) {
-    mask[0] = mask[1] = 0;
-}
-
-// Return TRUE if all bits are clear in the mask
-inline UBool isEmptyBitmask64(uint32_t* mask) {
-    return (mask[0] == 0) && (mask[1] == 0);
-}
-
-// Set a single bit (0..63) of the mask
-inline void setBitmask64(uint32_t* mask, int32_t bit) {
-    mask[bit >> 5] |= ((uint32_t)1L << (bit & 31));
-}
-
-// Return TRUE if a single bit (0..63) is set in the mask
-inline UBool isSetBitmask64(uint32_t* mask, int32_t bit) {
-    return (mask[bit >> 5] & (1L << (bit & 31))) != 0;
-}
-
-//-------------------------------------------------------------------------
-// Compose methods
-//-------------------------------------------------------------------------
-
 void
 Normalizer::compose(const UnicodeString& source, 
                     UBool compat,
@@ -230,306 +170,6 @@ Normalizer::compose(const UnicodeString& source,
     }
 }
 
-/**
- * Compose starting with current input character and continuing
- * until just before the next base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- *  <li>underlying char iter points to first character to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- *  <li>returns first char of decomposition or DONE if at end
- *  <li>Underlying char iter is pointing at next base char or past end
- * </ul>
- */
-UChar Normalizer::nextCompose() 
-{
-    UTextOffset explodePos = EMPTY;  // Position in input buffer
-    UTextOffset basePos = 0;         // Position of last base in output string
-    uint16_t    baseIndex = 0;       // Index of last base in "actions" array
-    uint32_t    classesSeen[2];      // Combining classes seen since last base
-    uint16_t    action;
-    UChar       lastBase = 0;
-    UBool       chFromText = TRUE;
-    
-    // Compatibility explosions have lower indices; skip them if necessary
-    uint16_t minExplode = (uint16_t)((fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT);
-    uint16_t minDecompLocal = (uint16_t)((fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT);
-    
-    emptyBitmask64(classesSeen);
-    initBuffer();
-    explodeBuf.truncate(0);
-    
-    UChar ch = curForward();
-
-    while (ch != DONE) {
-        // Get the basic info for the character
-        uint16_t charInfo = composeLookup(ch);
-        uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
-        uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT);
-        
-        if (type == ComposeData::BASE || (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)) {
-            if (buffer.length() > 0 && chFromText && explodePos == EMPTY) {
-                // When we hit a base char in the source text, we can return the text
-                // that's been composed so far.  We'll re-process this char next time hrough.
-                break;
-            }
-            emptyBitmask64(classesSeen);
-            baseIndex = index;
-            basePos = buffer.length();
-            buffer += ch;
-            lastBase = ch;
-        }
-        else if (type == ComposeData::COMBINING)
-        {
-            uint32_t cclass = ComposeData::typeBit[index]; // 0..63
-            
-            // We can only combine a character with the base if we haven't
-            // already seen a combining character with the same canonical class.
-            if (index < ComposeData::COMBINING_COUNT
-                && !isSetBitmask64(classesSeen, cclass)
-                && (action = composeAction(baseIndex, index)) > 0)
-            {
-                if (action > ComposeData::MAX_COMPOSED) {
-                    // Pairwise explosion.  Actions above this value are really
-                    // indices into an array that in turn contains indices
-                    // into the exploding string table
-                    // TODO: What if there are unprocessed chars in the explode buffer?
-                    UChar newBase = pairExplode(explodeBuf, action);
-                    explodePos = 0;
-                    buffer[basePos] = newBase;
-
-                    baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT);
-                    lastBase = newBase;
-                } else {
-                    // Normal pairwise combination.  Replace the base char
-                    UChar newBase = (UChar) action;
-                    buffer[basePos] = newBase;
-                                            
-                    baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT);
-                    lastBase = newBase;
-                }
-                //
-                // Since there are Unicode characters that cannot be combined in arbitrary
-                // order, we have to re-process any combining marks that go with this
-                // base character.  There are only four characters in Unicode that have
-                // this problem.  If they are fixed in Unicode 3.0, this code can go away.
-                //
-                UTextOffset len = buffer.length();
-                if (len - basePos > 1) {
-                    for (UTextOffset j = basePos+1; j < len; j++) {
-                        explodeBuf += buffer[j];
-                    }
-                    buffer.truncate(basePos+1);
-                    emptyBitmask64(classesSeen);
-                    if (explodePos == EMPTY) explodePos = 0;
-                }
-            } else {
-                // No combination with this character
-                bubbleAppend(buffer, ch, cclass);
-                setBitmask64(classesSeen, cclass); //[cclass >> 5] |= (1L << (cclass & 31));
-            }
-        }
-        else if (index > minExplode) {
-            // Single exploding character
-            explode(explodeBuf, index);
-            explodePos = 0;
-        }
-        else if (type == ComposeData::HANGUL && minExplode == 0) {
-            // If we're in compatibility mode we need to decompose Hangul to Jamo,
-            // because some of the Jamo might have compatibility decompositions.
-            hangulToJamo(ch, explodeBuf, minDecompLocal);
-            explodePos = 0;
-        }
-        else if (type == ComposeData::INITIAL_JAMO) {
-            if (buffer.length() > 0 && chFromText && explodePos == EMPTY) {
-                // When we hit a base char in the source text, we can return the text
-                // that's been composed so far.  We'll re-process this char next time through.
-                break;
-            }
-            emptyBitmask64(classesSeen);
-            baseIndex = ComposeData::INITIAL_JAMO_INDEX;
-            basePos = buffer.length();
-            buffer += ch;
-        }
-        else if (type == ComposeData::MEDIAL_JAMO
-                 && isEmptyBitmask64(classesSeen)
-                 && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
-            // If the last character was an initial jamo, we can combine it with this
-            // one to create a Hangul character.
-            uint16_t l = (uint16_t)(buffer[basePos] - (UChar)JAMO_LBASE);
-            uint16_t v = (uint16_t)(ch - JAMO_VBASE);
-            buffer[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
-            
-            baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
-        }
-        else if (type == ComposeData::FINAL_JAMO
-                 && isEmptyBitmask64(classesSeen)
-                 && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
-            // If the last character was a medial jamo that we turned into Hangul,
-            // we can add this character too.
-            buffer[basePos] = (UChar)(buffer[basePos] + (ch - JAMO_TBASE));
-
-            baseIndex = 0;
-            basePos = -1;
-            emptyBitmask64(classesSeen);
-        } else {
-            // TODO: deal with JAMO character types
-            baseIndex = 0;
-            basePos = -1;
-            emptyBitmask64(classesSeen);
-            buffer += ch;
-        }
-        
-        if (explodePos == EMPTY) {
-            ch = text->next();
-            chFromText = TRUE;
-        } else {
-            ch = explodeBuf[explodePos++];
-            if (explodePos >= explodeBuf.length()) {
-                explodePos = EMPTY;
-                explodeBuf.truncate(0);
-            }
-            chFromText = FALSE;
-        }
-    }
-    if (buffer.length() > 0) {
-        bufferLimit = buffer.length() - 1;
-        ch = buffer[0];
-    } else {
-        ch = DONE;
-        bufferLimit = 0;
-    }
-    return ch;
-}
-
-/**
- * Compose starting with the input UChar just before the current position
- * and continuing backward until (and including) the previous base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- *  <li>underlying char iter points just after last char to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- *  <li>returns last char of resulting decomposition sequence
- *  <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
- * </ul>
- */
-UChar Normalizer::prevCompose()
-{
-    UErrorCode status = U_ZERO_ERROR;
-
-    // Compatibility explosions have lower indices; skip them if necessary
-    uint16_t minExplode = (uint16_t)((fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT);
-
-    initBuffer();
-    
-    // Slurp up characters until we hit a base char or an initial Jamo
-    UChar ch;
-    while ((ch = curBackward()) != DONE) {
-        insert(buffer, 0, ch);
-        
-        // Get the basic info for the character
-        uint16_t charInfo = composeLookup(ch);
-        uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
-        uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT);
-        
-        if (type == ComposeData::BASE
-            || (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)
-            || type == ComposeData::HANGUL 
-            || type == ComposeData::INITIAL_JAMO)
-        {
-            break;
-        }
-    }
-    // If there's more than one character in the buffer, compose it all at once....
-    if (buffer.length() > 0) {
-        // TODO: The performance of this is awful; add a way to compose
-        // a UnicodeString& in place.
-        UnicodeString composed;
-        compose(buffer, (fMode & COMPAT_BIT) != 0, fOptions, composed, status);
-        buffer.truncate(0);
-        buffer += composed;
-        
-        if (buffer.length() > 1) {
-            bufferLimit = bufferPos = buffer.length() - 1;
-            ch = buffer[bufferPos];
-        } else {
-            ch = buffer[0];
-        }
-    }
-    else {
-        ch = DONE;
-    }
-    
-    return ch;
-}
-
-void Normalizer::bubbleAppend(UnicodeString& target, UChar ch, uint32_t cclass) {
-    UTextOffset i;
-    for (i = target.length() - 1; i >= 0; --i) {
-        uint32_t iClass = getComposeClass(target[i]);
-
-        if (iClass == 1 || iClass <= cclass) {      // 1 means combining class 0
-            // We've hit something we can't bubble this character past, so insert here
-            break;
-        }
-    }
-    // We need to insert just after character "i"
-    insert(target, i+1, ch);
-}
-    
-/**
- * Return the composing class of a character, as stored in the ComposeData
- * table.  This is not the composing class as listed in the raw Unicode
- * database, but an equivalent remapped value.  Values are remapped so they
- * fit in a sequential range from 0..n, where n < 64, and relative order
- * is preserved.
- * @return the composing class of ch, from 0..63
- */
-uint32_t Normalizer::getComposeClass(UChar ch) {
-    uint32_t cclass = 0;
-    uint16_t charInfo = composeLookup(ch);
-    uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
-    if (type == ComposeData::COMBINING) {
-        cclass = ComposeData::typeBit[charInfo >> ComposeData::INDEX_SHIFT];
-    }
-    return cclass;
-}
-
-uint16_t Normalizer::composeLookup(UChar ch) {
-    return ucmp16_getu(ComposeData::lookup, ch);
-}
-
-uint16_t Normalizer::composeAction(uint16_t baseIndex, uint16_t comIndex) 
-{
-    return ucmp16_getu(ComposeData::actions,
-                       ((UChar)(baseIndex + ComposeData::MAX_BASES*comIndex)));
-}
-
-void Normalizer::explode(UnicodeString& target, uint16_t index) {
-    UChar ch;
-    while ((ch = ComposeData::replace[index++]) != 0) {
-        target += ch;
-    }
-}
-
-UChar Normalizer::pairExplode(UnicodeString& target, uint16_t action) {
-    uint16_t index = ComposeData::actionIndex[action - ComposeData::MAX_COMPOSED];
-    explode(target, (uint16_t)(index + 1));
-    return ComposeData::replace[index];   // New base char
-}
-
-//-------------------------------------------------------------------------
-// Decompose methods
-//-------------------------------------------------------------------------
-
 void
 Normalizer::decompose(const UnicodeString& source, 
                       UBool compat,
@@ -552,265 +192,36 @@ Normalizer::decompose(const UnicodeString& source,
     }
 }
 
-/**
- * Decompose starting with current input character and continuing
- * until just before the next base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- *  <li>underlying char iter points to first character to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- *  <li>returns first char of decomposition or DONE if at end
- *  <li>Underlying char iter is pointing at next base char or past end
- * </ul>
- */
-UChar Normalizer::nextDecomp()
-{
-    UBool hangul = ((fOptions & IGNORE_HANGUL) == 0);
-    UChar ch = curForward();
-    int32_t i;
-    uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
-    int16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
-  
-    if (index > minDecomp ||
-        ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
-    {
-        initBuffer();
-      
-        if (index > minDecomp) {
-            doAppend((const UChar*)(DecompData::contents), index, buffer);
-
-            if ((offset & DecompData::DECOMP_RECURSE) != 0) {
-                // Need to decompose the output of this decomposition recursively.
-                for (i = 0; i < buffer.length(); i++) {
-                    ch = buffer.charAt(i);
-                    int16_t index = (int16_t)(ucmp16_getu(DecompData::offsets, ch)
-                        & DecompData::DECOMP_MASK);
-                    if (index > minDecomp) {
-                        i += doReplace((const UChar*)(DecompData::contents), index, buffer, i);
-                    }
-                }
-            }
-        } else {
-            buffer += ch;
-        }
-        UBool needToReorder = FALSE;
-      
-        // Any other combining chacters that immediately follow the decomposed
-        // character must be included in the buffer too, because they're
-        // conceptually part of the same logical character.
-        while ((ch = text->next()) != DONE
-               && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
-        {
-            needToReorder = TRUE;
-            // Decompose any of these characters that need it - Liu
-            index = (int16_t)(ucmp16_getu(DecompData::offsets, ch)
-                    & DecompData::DECOMP_MASK);
-            if (index > minDecomp) {
-                doAppend((const UChar*)DecompData::contents, index, buffer);
-            } else {
-                buffer += ch;
-            }
-        }
-        
-        if (buffer.length() > 1 && needToReorder) {
-            // If there is more than one combining character in the buffer,
-            // put them into the canonical order.
-            // But we don't need to sort if only characters are the ones that
-            // resulted from decomosing the base character.
-            fixCanonical(buffer);
-        }
-        bufferLimit = buffer.length() - 1;
-        ch = buffer[0];
-    } else {
-        // Just use this character, but first advance to the next one
-        text->next();
-      
-        // Do Hangul -> Jamo decomposition if necessary
-        if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
-            initBuffer();
-            hangulToJamo(ch, buffer, minDecomp);
-            bufferLimit = buffer.length() - 1;
-            ch = buffer[0];
-        }
-    }
-    return ch;
-}
-
-
-/**
- * Decompose starting with the input char just before the current position
- * and continuing backward until (and including) the previous base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- *  <li>underlying char iter points just after last char to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- *  <li>returns last char of resulting decomposition sequence
- *  <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
- * </ul>
- */
-UChar Normalizer::prevDecomp() {
-    UBool hangul = (fOptions & IGNORE_HANGUL) == 0;
-
-    UChar ch = curBackward();
-
-    uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
-
-    if (offset > minDecomp ||
-        ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
-    {
-        initBuffer();
-
-        // This method rewritten to pass conformance tests. - Liu
-        // Collect all characters up to the previous base char
-        while (ch != DONE) {
-            buffer.insert(0, ch);
-            if (ucmp8_get(DecompData::canonClass, ch) == DecompData::BASE) break;
-            ch = text->previous();
-        }
-        
-        // Decompose the buffer
-        int32_t i;
-        for (i = 0; i < buffer.length(); i++) {
-            ch = buffer.charAt(i);
-            offset = ucmp16_getu(DecompData::offsets, ch);
-            int16_t index = (int16_t)(offset & DecompData::DECOMP_MASK);
-            
-            if (index > minDecomp) {
-                int j = doReplace((const UChar*)(DecompData::contents), index, buffer, i);
-                if ((offset & DecompData::DECOMP_RECURSE) != 0) {
-                    // Need to decompose this recursively
-                    for (; i < j; ++i) {
-                        ch = buffer.charAt(i);
-                        index = (int16_t)(ucmp16_getu(DecompData::offsets, ch)
-                            & DecompData::DECOMP_MASK);
-                        if (index > minDecomp) {
-                            i += doReplace((const UChar*)(DecompData::contents), index, buffer, i);
-                        }
-                    }
-                }
-                i = j;
-            }
-        }
-
-
-        if (buffer.length() > 1) {
-            // If there is more than one combining character in the buffer,
-            // put them into the canonical order.
-            fixCanonical(buffer);
-        }
-        bufferLimit = bufferPos = buffer.length() - 1;
-        ch = buffer[bufferPos];
-    }
-    else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
-        initBuffer();
-        hangulToJamo(ch, buffer, minDecomp);
-        bufferLimit = bufferPos = buffer.length() - 1;
-        ch = buffer[bufferPos];
-    }
-    return ch;
-}
-
-uint8_t Normalizer::getClass(UChar ch) {
-    return  ucmp8_get(DecompData::canonClass, ch);
-}
- 
-/**
- * Fixes the sorting sequence of non-spacing characters according to
- * their combining class.  The algorithm is listed on p.3-11 in the
- * Unicode Standard 2.0.  The table of combining classes is on p.4-2
- * in the Unicode Standard 2.0.
- * @param result the string to fix.
- */
-void Normalizer::fixCanonical(UnicodeString& result) {
-    UTextOffset i = result.length() - 1;
-    uint8_t currentType = getClass(result[i]);
-    uint8_t lastType;
-    
-    for (--i; i >= 0; --i) {
-        lastType = currentType;
-        currentType = getClass(result[i]);
-        
-        //
-        // a swap is presumed to be rare (and a double-swap very rare),
-        // so don't worry about efficiency here.
-        //
-        if (currentType > lastType && lastType != DecompData::BASE) {
-            // swap characters
-            UChar temp = result[i];
-            result[i] = result[i+1];
-            result[i+1] = temp;
-
-            // if not at end, backup (one further, to compensate for for-loop)
-            if (i < result.length() - 2) {
-                i += 2;
-            }
-            // reset type, since we swapped.
-            currentType = getClass(result[i]);
-        }
-    }
-}
-
-    
 //-------------------------------------------------------------------------
-// CharacterIterator overrides
+// Iteration API
 //-------------------------------------------------------------------------
 
 /**
  * Return the current character in the normalized text.
  */
-UChar32 Normalizer:: current() const
-{
-    // TODO: make this method const and guarantee that currentChar is always set?
-    Normalizer *nonConst = (Normalizer*)this;
-  
-    if (currentChar == DONE) {
-        switch (fMode) {
-        case NO_OP:
-            nonConst->currentChar = text->current();            
-            break;
-        case COMPOSE:
-        case COMPOSE_COMPAT:
-            nonConst->currentChar = nonConst->nextCompose();    
-            break;
-        case DECOMP:    
-        case DECOMP_COMPAT:
-            nonConst->currentChar = nonConst->nextDecomp();        
-            break;
-        case FCD:
-            /* ### TODO */
-            break;
+UChar32 Normalizer::current() {
+    if(bufferPos<buffer.length()) {
+        return buffer.char32At(bufferPos);
+    } else {
+        /*
+         * Normalize from the current index,
+         * return the first character from there, and
+         * reset the character iterator to the original index.
+         * Set nextIndex to where the iterator stopped so
+         * that next() can later continue from there.
+         */
+        UTextOffset currentIndex=text->getIndex();
+        UChar32 c;
+
+        if(nextNormalize()) {
+            c=buffer.char32At(bufferPos);
+            nextIndex=text->getIndex();
+        } else {
+            c=DONE;
         }
+        text->setIndex(currentIndex);
+        return c;
     }
-    return currentChar;
-}
-
-/**
- * Return the first character in the normalized text.  This resets
- * the <tt>Normalizer's</tt> position to the beginning of the text.
- */
-UChar32 Normalizer::first() {
-    return setIndex(text->startIndex());
-}
-
-/**
- * Return the last character in the normalized text.  This resets
- * the <tt>Normalizer's</tt> position to be just before the
- * the input text corresponding to that normalized character.
- */
-UChar32 Normalizer::last() {
-    text->setIndex(text->endIndex());
-  
-    currentChar = DONE;                     // The current char hasn't been processed
-    clearBuffer();                          // The buffer is empty too
-    return previous();
 }
 
 /**
@@ -819,30 +230,31 @@ UChar32 Normalizer::last() {
  * of the text has already been reached, {@link #DONE} is returned.
  */
 UChar32 Normalizer::next() {
-    if (bufferPos < bufferLimit) {
-        // There are output characters left in the buffer
-        currentChar = buffer[++bufferPos];
-    }
-    else {
-        bufferLimit = bufferPos = 0;    // Buffer is now out of date
-        switch (fMode) {
-        case NO_OP:
-            currentChar = text->next();        
-            break;
-        case COMPOSE:        
-        case COMPOSE_COMPAT:
-            currentChar = nextCompose();    
-            break;
-        case DECOMP:    
-        case DECOMP_COMPAT:
-            currentChar = nextDecomp();        
-            break;
-        case FCD:
-            /* ### TODO */
-            break;
+    UChar32 c;
+
+    if(bufferPos<buffer.length()) {
+        c=buffer.char32At(bufferPos);
+        bufferPos+=UTF_CHAR_LENGTH(c);
+        return c;
+    } else {
+        /*
+         * If the buffer (which is now exhausted) was normalized
+         * during current() or setIndex() then the character iterator
+         * must be set to behind what was normalized then
+         * in order to continue with the following text.
+         * That "position behind what was normalized" is nextIndex.
+         */
+        if(nextIndex>=0) {
+            text->setIndex(nextIndex);
+        }
+        if(nextNormalize()) {
+            c=buffer.char32At(bufferPos);
+            bufferPos+=UTF_CHAR_LENGTH(c);
+            return c;
+        } else {
+            return DONE;
         }
     }
-    return currentChar;
 }
 
 /**
@@ -850,39 +262,27 @@ UChar32 Normalizer::next() {
  * the iteration position by one.  If the beginning
  * of the text has already been reached, {@link #DONE} is returned.
  */
-UChar32 Normalizer::previous()
-{
-    if (bufferPos > 0) {
-        // There are output characters left in the buffer
-        currentChar = buffer[--bufferPos];
+UChar32 Normalizer::previous() {
+    UChar32 c;
+
+    if(bufferPos>0 || previousNormalize()) {
+        c=buffer.char32At(bufferPos-1);
+        bufferPos-=UTF_CHAR_LENGTH(c);
+        return c;
+    } else {
+        return DONE;
     }
-    else {
-        bufferLimit = bufferPos = 0;    // Buffer is now out of date
-        switch (fMode) {
-        case NO_OP:        
-            currentChar = text->previous();    
-            break;
-        case COMPOSE:        
-        case COMPOSE_COMPAT:
-            currentChar = prevCompose();    
-            break;
-        case DECOMP:    
-        case DECOMP_COMPAT:
-            currentChar = prevDecomp();        
-            break;
-        case FCD:
-            /* ### TODO */
-            break;
-        }
-    }
-    return currentChar;
 }
 
-void Normalizer::reset() 
-{
-    text->setIndex(text->startIndex());
-    currentChar = DONE;     // The current char hasn't been processed
-    clearBuffer();          // The buffer is empty too
+void Normalizer::reset() {
+    text->setToStart();
+    clearBuffer();
+}
+
+void
+Normalizer::setIndexOnly(UTextOffset index) {
+    text->setIndex(index);
+    clearBuffer();
 }
 
 /**
@@ -904,15 +304,32 @@ void Normalizer::reset()
  * @throws IllegalArgumentException if the given index is less than
  *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
  */
-UChar32 Normalizer::setIndex(UTextOffset index)
-{
-    text->setIndex(index);   // Checks range
-    currentChar = DONE;     // The current char hasn't been processed
-    clearBuffer();          // The buffer is empty too
-
+UChar32 Normalizer::setIndex(UTextOffset index) {
+    setIndexOnly(index); // move the input position and clear the buffer
     return current();
 }
 
+/**
+ * Return the first character in the normalized text.  This resets
+ * the <tt>Normalizer's</tt> position to the beginning of the text.
+ */
+UChar32 Normalizer::first() {
+    text->setToStart(); // these two statements are the same ones reset() performs
+    clearBuffer();
+    return next(); // iterate forward from the start of the input
+}
+
+/**
+ * Return the last character in the normalized text.  This resets
+ * the <tt>Normalizer's</tt> position to be just before the
+ * input text corresponding to that normalized character.
+ */
+UChar32 Normalizer::last() {
+    text->setToEnd(); // start from the end of the input
+    clearBuffer();
+    return previous(); // iterate backward from the end of the input
+}
+
 /**
  * Retrieve the current iteration position in the input text that is
  * being normalized.  This method is useful in applications such as
@@ -957,7 +374,6 @@ void
 Normalizer::setMode(EMode newMode) 
 {
     fMode = newMode;
-    minDecomp = (int16_t)(((fMode & COMPAT_BIT) != 0) ? 0 : DecompData::MAX_COMPAT);
 }
 
 Normalizer::EMode 
@@ -1030,7 +446,17 @@ Normalizer::setText(const UChar* newText,
                     int32_t length,
                     UErrorCode &status)
 {
-    setText(UnicodeString(newText, length), status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
+    if (newIter == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    delete text;
+    text = newIter;
+    reset();
 }
 
 /**
@@ -1043,154 +469,83 @@ Normalizer::getText(UnicodeString&  result)
     text->getText(result);
 }
 
-
 //-------------------------------------------------------------------------
 // Private utility methods
 //-------------------------------------------------------------------------
 
-
-UChar Normalizer::curForward() {
-    UChar ch = text->current();
-    return ch;
-}
-
-UChar Normalizer::curBackward() {
-    UChar ch = text->previous();
-    return ch;
-}
-
-void Normalizer::doAppend(const UChar source[], uint16_t offset, UnicodeString& dest) {
-    uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT);
-    uint16_t length = (int16_t)(offset & STR_LENGTH_MASK);
-
-    if (length == 0) {
-        UChar ch;
-        while ((ch = source[index++]) != 0x0000) {
-            dest += ch;
-        }
-    } else {
-        while (length-- > 0) {
-            dest += source[index++];
-        }
-    }
-}
-
-void Normalizer::doInsert(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos)
-{
-    uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT);
-    uint16_t length = (int16_t)(offset & STR_LENGTH_MASK);
-
-    if (length == 0) {
-        UChar ch;
-        while ((ch = source[index++]) != 0x0000) {
-            insert(dest, pos++, ch);
-        }
-    } else {
-        while (length-- > 0) {
-            insert(dest, pos++, source[index++]);
-        }
-    }
-}
-
-uint16_t Normalizer::doReplace(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos) {
-
-    uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT);
-    uint16_t length = (int16_t)(offset & STR_LENGTH_MASK);
-    uint16_t i;
-    
-    dest.setCharAt(pos++, source[index++]);
-    if (length == 0) {
-        UChar ch;
-        while ((ch = source[index++]) != 0x0000) {
-            insert(dest, pos++, ch);
-            length++;
-        }
-    } else {
-        for (i = 1; i < length; i++) {
-            dest.insert(pos++, source[index++]);
-        }
-    }
-    return length;
-}
-
-void Normalizer::initBuffer() {
-    buffer.truncate(0);
-    clearBuffer();
-}
-
 void Normalizer::clearBuffer() {
-    bufferLimit = bufferPos = 0;
+    nextIndex=-1; // negative: next() only applies nextIndex to text when it is >=0
+    buffer.remove(); // empty the output buffer
+    bufferPos=0;
 }
 
-//-----------------------------------------------------------------------------
-// Hangul / Jamo conversion utilities for internal use
-// See section 3.10 of The Unicode Standard, v 2.0.
-//
-/**
- * Convert a single Hangul syllable into one or more Jamo characters.
- * 
- * @param conjoin If TRUE, decompose Jamo into conjoining Jamo.
- */
-void Normalizer::hangulToJamo(UChar ch, UnicodeString& result, uint16_t decompLimit)
-{
-    UChar sIndex  = (UChar)(ch - HANGUL_BASE);
-    UChar leading = (UChar)(JAMO_LBASE + sIndex / JAMO_NCOUNT);
-    UChar vowel   = (UChar)(JAMO_VBASE +
-                            (sIndex % JAMO_NCOUNT) / JAMO_TCOUNT);
-    UChar trailing= (UChar)(JAMO_TBASE + (sIndex % JAMO_TCOUNT));
+UBool
+Normalizer::nextNormalize() { // refill buffer going forward from text's position; TRUE if it succeeded with non-empty output
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t length=0; // stays 0 (result FALSE) if fMode matches none of the cases below
 
-    jamoAppend(leading, decompLimit, result);
-    jamoAppend(vowel, decompLimit, result);
-    if (trailing != JAMO_TBASE) {
-        jamoAppend(trailing, decompLimit, result);
+    clearBuffer();
+    switch(fMode) {
+    case NO_OP: // no normalization: copy one code point from the input
+        buffer.setTo(text->next32PostInc());
+        length=buffer.length();
+        break;
+    case COMPOSE:
+    case COMPOSE_COMPAT:
+        length=unorm_nextCompose(buffer.fArray, buffer.fCapacity, *text,
+                                 fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0,
+                                 UnicodeString::growBuffer, &buffer,
+                                 &errorCode);
+        break;
+    case DECOMP:
+    case DECOMP_COMPAT:
+        length=unorm_nextDecompose(buffer.fArray, buffer.fCapacity, *text,
+                                   fMode==DECOMP_COMPAT, (fOptions&IGNORE_HANGUL)!=0, // compat flag must test the decomposition mode, not COMPOSE_COMPAT
+                                   UnicodeString::growBuffer, &buffer,
+                                   &errorCode);
+        break;
+    case FCD:
+        length=unorm_nextFCD(buffer.fArray, buffer.fCapacity, *text,
+                             UnicodeString::growBuffer, &buffer,
+                             &errorCode);
+        break;
     }
+
+    return U_SUCCESS(errorCode) && length>0;
 }
 
-void Normalizer::jamoAppend(UChar ch, uint16_t decompLimit, UnicodeString& dest) {
-    uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
-    if (offset > decompLimit) {
-        /* HSYS: Be sure to check this for later.  UChar may not always be
-           uint16_t*/
-        doAppend((const UChar*)(DecompData::contents), offset, dest);
-    } else {
-        dest += ch;
+UBool
+Normalizer::previousNormalize() { // refill buffer going backward from text's position; TRUE if it succeeded with non-empty output
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t length=0; // stays 0 (result FALSE) if fMode matches none of the cases below
+
+    clearBuffer();
+    switch(fMode) {
+    case NO_OP: // no normalization: copy one code point from the input
+        buffer.setTo(text->previous32());
+        length=buffer.length();
+        break;
+    case COMPOSE:
+    case COMPOSE_COMPAT:
+        length=unorm_prevCompose(buffer.fArray, buffer.fCapacity, *text,
+                                 fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0,
+                                 UnicodeString::growBuffer, &buffer,
+                                 &errorCode);
+        break;
+    case DECOMP:
+    case DECOMP_COMPAT:
+        length=unorm_prevDecompose(buffer.fArray, buffer.fCapacity, *text,
+                                   fMode==DECOMP_COMPAT, (fOptions&IGNORE_HANGUL)!=0, // compat flag must test the decomposition mode, not COMPOSE_COMPAT
+                                   UnicodeString::growBuffer, &buffer,
+                                   &errorCode);
+        break;
+    case FCD:
+        length=unorm_prevFCD(buffer.fArray, buffer.fCapacity, *text,
+                             UnicodeString::growBuffer, &buffer,
+                             &errorCode);
+        break;
     }
-}
-
-void Normalizer::jamoToHangul(UnicodeString& buffer, UTextOffset start) {
-    UTextOffset out = start;
-    UTextOffset limit = buffer.length() - 1;
-
-    UTextOffset in;
-    int16_t l, v = 0, t;
-
-    for (in = start; in < limit; in++) {
-        UChar ch = buffer[in];
-
-        if ((l = (int16_t)(ch - JAMO_LBASE)) >= 0 && l < JAMO_LCOUNT
-            && (v = (int16_t)(buffer[in+1] - (UChar)JAMO_VBASE)) >= 0 && v < JAMO_VCOUNT) {
-            //
-            // We've found a pair of Jamo characters to compose.
-            // Snarf the Jamo vowel and see if there's also a trailing char
-            //
-            in++;   // Snarf the Jamo vowel too.
-
-            t = (int16_t)((in < limit) ? buffer.charAt(in+1) : 0);
-            t -= JAMO_TBASE;
-
-            if (t >= 0 && t < JAMO_TCOUNT) {
-                in++;   // Snarf the trailing consonant too
-            } else {
-                t = 0;  // No trailing consonant
-            }
-            buffer[out++] = (UChar)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT + t + HANGUL_BASE);
-        } else {
-            buffer[out++] = ch;
-        }
-    }
-    while (in < buffer.length()) {
-        buffer[out++] = buffer[in++];
-    }
-
-    buffer.truncate(out);
+
+    bufferPos=length; // backward iteration starts reading at the end of the new buffer contents
+    return U_SUCCESS(errorCode) && length>0;
 }
diff --git a/icu4c/source/common/unicode/normlzr.h b/icu4c/source/common/unicode/normlzr.h
index 7b5ff9688e..d935397014 100644
--- a/icu4c/source/common/unicode/normlzr.h
+++ b/icu4c/source/common/unicode/normlzr.h
@@ -27,9 +27,6 @@
 #include "unicode/chariter.h"
 #include "unicode/unorm.h"
 
-/* forward declaration */
-class ComposedCharIter;
-
 /**
  * <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or
  * decomposed form, allowing for easier sorting and searching of text.
@@ -246,6 +243,7 @@ class U_COMMON_API Normalizer
      * Unicode Normalization Forms</a>.
      * <p>
      * @see #setOption
+     * @deprecated To be removed (or moved to private for documentation) after 2002-aug-31. Obsolete option.
      */
     IGNORE_HANGUL     = 0x001
   };
@@ -500,14 +498,14 @@ class U_COMMON_API Normalizer
              UErrorCode&          status);
 
   //-------------------------------------------------------------------------
-  // CharacterIterator overrides
+  // Iteration API
   //-------------------------------------------------------------------------
   
   /**
    * Return the current character in the normalized text.
    * @draft
    */
-  UChar32              current(void) const;
+  UChar32              current(void);
 
   /**
    * Return the first character in the normalized text.  This resets
@@ -555,10 +553,12 @@ class U_COMMON_API Normalizer
    *
    * @return      the first normalized character that is the result of iterating
    *              forward starting at the given index.
-   * @draft
+   * @deprecated To be removed after 2002-aug-31. Use setIndexOnly().
    */
   UChar32              setIndex(UTextOffset index);
 
+  void                 setIndexOnly(UTextOffset index); // sets the input index and clears the buffer; returns no character
+
   /**
    * Reset the iterator so that it is in the same state that it was just after
    * it was constructed.  A subsequent call to <tt>next</tt> will return the first
@@ -740,98 +740,28 @@ class U_COMMON_API Normalizer
 private:
   // Private utility methods for iteration
   // For documentation, see the source code
-  UChar nextCompose(void);
-  UChar prevCompose(void);
-  UChar nextDecomp(void);
-  UChar prevDecomp(void);
+  UBool nextNormalize(); // refills buffer iterating forward; TRUE if non-empty output was produced without error
+  UBool previousNormalize(); // refills buffer iterating backward and sets bufferPos to the output length; TRUE as above
 
-  UChar curForward(void);
-  UChar curBackward(void);
-
-  void    init(CharacterIterator* iter, 
-         EMode mode, 
-         int32_t option);
-  void    initBuffer(void);
+  void    init(CharacterIterator* iter, EMode mode, int32_t option);
   void    clearBuffer(void);
 
-  // Utilities used by Compose
-  static void        bubbleAppend(UnicodeString& target, 
-                     UChar ch, 
-                     uint32_t cclass);
-  static uint32_t     getComposeClass(UChar ch);
-  static uint16_t    composeLookup(UChar ch);
-  static uint16_t    composeAction(uint16_t baseIndex, 
-                      uint16_t comIndex);
-  static void        explode(UnicodeString& target, 
-                uint16_t index);
-  static UChar    pairExplode(UnicodeString& target, 
-                    uint16_t action);
-
-  // Utilities used by Decompose
-  static void        fixCanonical(UnicodeString& result);    // Reorders combining marks
-  static uint8_t    getClass(UChar ch);                    // Gets char's combining class
-
-  // Other static utility methods
-  static void doAppend(const UChar source[], 
-               uint16_t offset, 
-               UnicodeString& dest);
-  static void doInsert(const UChar source[], 
-               uint16_t offset, 
-               UnicodeString& dest, 
-               UTextOffset pos);
-  static uint16_t doReplace(const UChar source[], 
-               uint16_t offset, 
-               UnicodeString& dest, 
-               UTextOffset pos);
-
-  static void hangulToJamo(UChar ch, 
-               UnicodeString& result, 
-               uint16_t decompLimit);
-  static void jamoAppend(UChar ch, 
-             uint16_t decompLimit, 
-             UnicodeString& dest);
-  static void jamoToHangul(UnicodeString& buffer, 
-               UTextOffset start);
-
   //-------------------------------------------------------------------------
   // Private data
   //-------------------------------------------------------------------------
 
   EMode         fMode;
   int32_t       fOptions;
-  int16_t    minDecomp;
 
   // The input text and our position in it
   CharacterIterator*  text;
+  // The next index (if >= 0) to set in text for next(), which is
+  // necessary to make current() and setIndex() work reasonably.
+  UTextOffset         nextIndex;
 
   // A buffer for holding intermediate results
   UnicodeString       buffer;
-  UTextOffset          bufferPos;
-  UTextOffset          bufferLimit;
-  UChar             currentChar;
-
-  // Another buffer for use during iterative composition
-  UnicodeString       explodeBuf;
-
-  enum {
-    EMPTY = -1,
-    STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder
-    STR_LENGTH_MASK = 0x0003
-  };
-
-  enum {
-    HANGUL_BASE = 0xac00,
-    HANGUL_LIMIT = 0xd7a4,
-    JAMO_LBASE = 0x1100,
-    JAMO_VBASE = 0x1161,
-    JAMO_TBASE = 0x11a7,
-    JAMO_LCOUNT = 19,
-    JAMO_VCOUNT = 21,
-    JAMO_TCOUNT = 28,
-    JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
-  };
-
-  friend class ComposedCharIter;
+  UTextOffset         bufferPos;
 };
 
 inline UBool