scuffed-code/icu4c/source/common/normlzr.cpp

/*
*******************************************************************************
*                                                                             *
* COPYRIGHT:                                                                  *
*   IBM Open Class Library                                                    *
*   (C) Copyright Taligent, Inc.,  1996                                       *
*   (C) Copyright International Business Machines Corporation,  1996-1998     *
*   Licensed Material - Program-Property of IBM - All Rights Reserved.        *
*   US Government Users Restricted Rights - Use, duplication, or disclosure   *
*   restricted by GSA ADP Schedule Contract with IBM Corp.                    *
*                                                                             *
*******************************************************************************
*/


#include "ucmp16.h"
#include "dcmpdata.h"
#include "compdata.h"

#include "normlzr.h"
#include "utypes.h"
#include "unistr.h"
#include "chariter.h"
#include "schriter.h"
#include "unicode.h"
#include "mutex.h"


// Number of elements in a true array (not a pointer): total size / element size.
#define ARRAY_LENGTH(array) (sizeof(array) / sizeof((array)[0]))

/**
 * Inserts the single code unit <tt>ch</tt> into <tt>dest</tt> at offset
 * <tt>pos</tt>, by replacing a zero-length range with a one-element array.
 */
inline static void insert(UnicodeString& dest,
              UTextOffset pos,
              UChar ch)
{
  UChar single[1] = { ch };
  dest.replace(pos, 0, single, 1);
}

// Sentinel character: returned by the iteration methods when there is no
// (more) text, and stored in currentChar to mean "not yet computed".
const UChar     Normalizer::DONE     = 0xFFFF;
// Half-open range [HANGUL_BASE, HANGUL_LIMIT) tested before a character is
// handed to hangulToJamo().
const UChar    Normalizer::HANGUL_BASE = 0xac00;
const UChar    Normalizer::HANGUL_LIMIT= 0xd7a4;
// Bases subtracted from initial (L), medial (V) and final (T) jamo to obtain
// the indices used in the Hangul syllable arithmetic in compose().
// NOTE(review): JAMO_TBASE is presumably one below the first final jamo so
// that index 0 can mean "no final" -- confirm against the Unicode Hangul rules.
const UChar    Normalizer::JAMO_LBASE  = 0x1100;
const UChar    Normalizer::JAMO_VBASE  = 0x1161;
const UChar    Normalizer::JAMO_TBASE  = 0x11a7;
// Sizes of the L, V and T index ranges; the V and T counts are the
// multipliers in the syllable formula (l*JAMO_VCOUNT + v) * JAMO_TCOUNT.
const int16_t    Normalizer::JAMO_LCOUNT = 19;
const int16_t    Normalizer::JAMO_VCOUNT = 21;
const int16_t    Normalizer::JAMO_TCOUNT = 28;
// Product of the V and T counts (21 * 28).
const int16_t    Normalizer::JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT;


//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------

/**
 * Creates a Normalizer over <tt>str</tt> with no options set.
 * A new StringCharacterIterator is allocated for the string and adopted
 * by this object (it is deleted in the destructor).
 */
Normalizer::Normalizer(const UnicodeString& str,
               EMode mode)
{
  CharacterIterator* source = new StringCharacterIterator(str);
  init(source, mode, 0);
}

/**
 * Creates a Normalizer over <tt>str</tt> with the given mode and option bits.
 * A new StringCharacterIterator is allocated for the string and adopted
 * by this object (it is deleted in the destructor).
 */
Normalizer::Normalizer(const UnicodeString& str,
               EMode mode,
               int32_t opt)
{
  CharacterIterator* source = new StringCharacterIterator(str);
  init(source, mode, opt);
}

/**
 * Creates a Normalizer over a clone of <tt>iter</tt> with no options set.
 * The clone is adopted by this object; the caller keeps <tt>iter</tt>.
 */
Normalizer::Normalizer(const CharacterIterator& iter,
               EMode mode)
{
  CharacterIterator* source = iter.clone();
  init(source, mode, 0);
}

/**
 * Creates a Normalizer over a clone of <tt>iter</tt> with the given mode
 * and option bits.  The clone is adopted by this object; the caller keeps
 * <tt>iter</tt>.
 */
Normalizer::Normalizer(const CharacterIterator& iter,
               EMode mode,
               int32_t opt)
{
  CharacterIterator* source = iter.clone();
  init(source, mode, opt);
}

/**
 * Shared constructor body: adopts <tt>adoptIter</tt> as the input text,
 * records the mode and options, and puts the output buffer bookkeeping
 * into its "nothing computed yet" state.
 */
void Normalizer::init(CharacterIterator* adoptIter,
              EMode mode,
              int32_t options)
{
  // Input and configuration
  text = adoptIter;
  fMode = mode;
  fOptions = options;

  // Output state: no buffered characters, current character not yet known
  currentChar = DONE;
  bufferLimit = 0;
  bufferPos = 0;

  // In compatibility mode every decomposition offset is accepted; otherwise
  // only offsets above MAX_COMPAT are.
  if (fMode & COMPAT_BIT) {
    minDecomp = 0;
  } else {
    minDecomp = DecompData::MAX_COMPAT;
  }
}

/**
 * Copy constructor.  The input iterator is cloned so the two objects do not
 * share it; the buffered output state is then copied over the defaults that
 * init() established.
 */
Normalizer::Normalizer(const Normalizer& copy)
{
  CharacterIterator* clonedText = copy.text->clone();
  init(clonedText, copy.fMode, copy.fOptions);

  // Overwrite the freshly initialized iteration state with the source's.
  currentChar = copy.currentChar;
  explodeBuf  = copy.explodeBuf;
  buffer      = copy.buffer;
  bufferLimit = copy.bufferLimit;
  bufferPos   = copy.bufferPos;
}

/**
 * Destructor.  Releases the input iterator adopted in init() or setText().
 */
Normalizer::~Normalizer()
{
  // The iterator is always owned by this object.
  delete text;
}

/**
 * Returns a heap-allocated copy of this object, made with the copy
 * constructor.  The caller owns the returned pointer.
 */
Normalizer*
Normalizer::clone() const
{
  Normalizer* duplicate = new Normalizer(*this);
  return duplicate;
}

/**
 * Generates a hash code for this iterator by summing the hash of the input
 * iterator with the mode, the options, and the buffer position and limit.
 */
int32_t Normalizer::hashCode() const
{
  const int32_t textHash = text->hashCode();
  return textHash + fMode + fOptions + bufferPos + bufferLimit;
}

/**
 * Equality test.  Two Normalizers are equal when they use the same mode and
 * options, iterate over equal input iterators, and have identical buffered
 * output state.
 * <p>
 * The mode and options are part of the comparison because hashCode() mixes
 * them in: without them two objects could compare equal while hashing
 * differently (and while producing different normalized output).
 */
bool_t Normalizer::operator==(const Normalizer& that) const
{
  if (this == &that) {
    return TRUE;    // An object is always equal to itself
  }
  return fMode == that.fMode
  && fOptions == that.fOptions
  && *text == *(that.text)
  && currentChar == that.currentChar
  && buffer == that.buffer
  && explodeBuf == that.explodeBuf
  && bufferPos == that.bufferPos
  && bufferLimit == that.bufferLimit;
}

//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------

void
Normalizer::normalize(const UnicodeString& source,
              EMode mode,
              int32_t options,
              UnicodeString& result,
              UErrorCode &status)
{
  switch (mode) {
  case NO_OP:
    result = source;
    break;
  case COMPOSE:
  case COMPOSE_COMPAT:
    compose(source, mode & COMPAT_BIT, options, result, status);
    break;
  case DECOMP:
  case DECOMP_COMPAT:
    decompose(source, mode & COMPAT_BIT, options, result, status);
    break;
  }
}

//-------------------------------------------------------------------------
// Compose methods
//-------------------------------------------------------------------------

/**
 * Composes <tt>source</tt> into <tt>result</tt> in one pass.
 * <p>
 * Characters are read from <tt>source</tt>, or from a local "explode buffer"
 * whenever an earlier step queued characters there for (re)processing.
 * Each character is classified through composeLookup() and then either
 * appended, merged into the most recent base character, expanded into the
 * explode buffer, or combined arithmetically into a Hangul syllable.
 *
 * @param source  the text to compose.
 * @param compat  if nonzero, the explosion/decomposition thresholds are 0
 *                instead of MAX_COMPAT, and Hangul syllables are first broken
 *                into jamo.
 * @param options not read by this function.
 * @param result  truncated to empty, then filled with the composed text.
 * @param status  if it already holds a failure code the function returns
 *                immediately; it is never written here.
 */
void
Normalizer::compose(const UnicodeString& source,
            bool_t compat,
            int32_t options,
            UnicodeString& result,
            UErrorCode &status)
{
  if (U_FAILURE(status)) {
    return;
  }
  result.truncate(0);
  UnicodeString explodeBuf;

  UTextOffset  explodePos = EMPTY;         // Position in input buffer
  UTextOffset  basePos = 0;                // Position of last base in output string
  uint16_t    baseIndex = 0;              // Index of last base in "actions" array
  uint32_t    classesSeen = 0;            // Combining classes seen since last base
  uint16_t    action;

  // Compatibility explosions have lower indices; skip them if necessary
  uint16_t minExplode = compat ? 0 : ComposeData::MAX_COMPAT;
  uint16_t minDecomp = compat ? 0 : DecompData::MAX_COMPAT;

    UTextOffset i = 0;
    // Keep going while there is unread source text or queued exploded text.
    while (i < source.size() || explodePos != EMPTY) {
        // Get the next char from either the buffer or the source
      UChar ch;
      if (explodePos == EMPTY) {
    ch = source[i++];
      } else {
    ch = explodeBuf[explodePos++];
    // Once the last queued character has been taken, empty the buffer
    // and go back to reading from the source.
    if (explodePos >= explodeBuf.size()) {
      explodePos = EMPTY;
      explodeBuf.truncate(0);
    }
      }

      // Get the basic info for the character
      uint16_t charInfo = composeLookup(ch);
      uint16_t type = charInfo & ComposeData::TYPE_MASK;
      uint16_t index = charInfo >> ComposeData::INDEX_SHIFT;

      if (type == ComposeData::BASE) {
    // A new base starts a new combining sequence: remember where it is.
    classesSeen = 0;
    baseIndex = index;
    basePos = result.size();
    result += ch;
      }
      else if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING)
        {
      // cclass is used as a bit mask (it is AND-ed / OR-ed with classesSeen).
      uint32_t cclass = ComposeData::typeMask[index];

      // We can only combine a character with the base if we haven't
      // already seen a combining character with the same canonical class.
      // NOTE(review): after a branch below has set basePos to -1, this path
      // presumably relies on composeAction(0, index) returning 0 so that
      // result[basePos] is never written -- confirm against the data tables.
      if (type == ComposeData::COMBINING && (classesSeen & cclass) == 0
          && (action = composeAction(baseIndex, index)) > 0)
            {
          if (action > ComposeData::MAX_COMPOSED) {
        // Pairwise explosion.  Actions above this value are really
        // indices into an array that in turn contains indices
        // into the exploding string table
        // TODO: What if there are unprocessed chars in the explode buffer?
        UChar newBase = pairExplode(explodeBuf, action);
        explodePos = 0;
        result[basePos] = newBase;

        baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
          } else {
        // Normal pairwise combination.  Replace the base char
        UChar newBase = (UChar) action;
        result[basePos] = newBase;

        baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
          }
          //
          // Since there are Unicode characters that cannot be combined in arbitrary
          // order, we have to re-process any combining marks that go with this
          // base character.  There are only four characters in Unicode that have
          // this problem.  If they are fixed in Unicode 3.0, this code can go away.
          //
          UTextOffset len = result.size();
          if (len - basePos > 1) {
        // Move everything after the base out of the result and queue it
        // so those marks are run through the loop again.
        for (UTextOffset j = basePos+1; j < len; j++) {
          explodeBuf += result[j];
        }
        result.truncate(basePos+1);
        classesSeen = 0;
        if (explodePos == EMPTY) explodePos = 0;
          }
            } else {
          // No combination with this character
          bubbleAppend(result, ch, cclass);
          classesSeen |= cclass;
            }
        }
      else if (index > minExplode) {
    // Single exploding character
    // NOTE(review): explodePos is reset to 0 even if queued characters were
    // still unread; see the TODO in the pairwise-explosion branch above.
    explode(explodeBuf, index);
    explodePos = 0;
      }
      else if (type == ComposeData::HANGUL && minExplode == 0) {
    // If we're in compatibility mode we need to decompose Hangul to Jamo,
    // because some of the Jamo might have compatibility decompositions.
    hangulToJamo(ch, explodeBuf, minDecomp);
    explodePos = 0;
      }
      else if (type == ComposeData::INITIAL_JAMO) {
    // An initial jamo is treated like a base, tagged with a reserved index.
    classesSeen = 0;
    baseIndex = ComposeData::INITIAL_JAMO_INDEX;
    basePos = result.size();
    result += ch;
      }
      else if (type == ComposeData::MEDIAL_JAMO && classesSeen == 0
           && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
    // If the last character was an initial jamo, we can combine it with this
    // one to create a Hangul character.
    uint16_t l = result[basePos] - JAMO_LBASE;
    uint16_t v = ch - JAMO_VBASE;
    result[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);

    baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
      }
      else if (type == ComposeData::FINAL_JAMO && classesSeen == 0
           && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
    // If the last character was a medial jamo that we turned into Hangul,
    // we can add this character too.
    result[basePos] = (UChar)(result[basePos] + (ch - JAMO_TBASE));

    // The syllable is complete; there is no current base any more.
    baseIndex = 0;
    basePos = -1;
    classesSeen = 0;
      } else {
    // Anything else is copied through and ends the current sequence.
    baseIndex = 0;
    basePos = -1;
    classesSeen = 0;
    result += ch;
      }
    }
}

/**
 * Compose starting with current input character and continuing
 * until just before the next base char.
 * <p>
 * <b>Input</b>:
 * <ul>
 *  <li>underlying char iter points to first character to compose
 * </ul>
 * <p>
 * <b>Output:</b>
 * <ul>
 *  <li>returns first char of the composed sequence or DONE if at end
 *  <li>Underlying char iter is pointing at next base char or past end
 *  <li>the composed sequence is left in <tt>buffer</tt>, with
 *      <tt>bufferLimit</tt> set to the index of its last character
 * </ul>
 * <p>
 * The per-character logic mirrors the static compose() method, with two
 * differences: output goes to the member <tt>buffer</tt>, and the loop stops
 * when a base (or initial jamo) is read from the text while the buffer
 * already holds output.
 */
UChar Normalizer::nextCompose()
{
    UTextOffset  explodePos = EMPTY;         // Position in input buffer
    UTextOffset  basePos = 0;                // Position of last base in output string
    uint16_t    baseIndex = 0;              // Index of last base in "actions" array
    uint32_t    classesSeen = 0;            // Combining classes seen since last base
    uint16_t    action;
    // lastBase is assigned below but never read in this function.
    UChar        lastBase = 0;
    // TRUE when ch came from the text iterator rather than from explodeBuf.
    bool_t        chFromText = TRUE;

    // Compatibility explosions have lower indices; skip them if necessary
    uint16_t minExplode = (fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT;
    // This local hides the member of the same name; it holds the same value
    // that init()/setMode() compute.
    uint16_t minDecomp = (fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT;

    // NOTE(review): initBuffer() is defined outside this view; it is presumed
    // to empty `buffer` and reset bufferPos -- confirm.
    initBuffer();
    explodeBuf.truncate(0);

    UChar ch = curForward();

    while (ch != DONE) {
        // Get the basic info for the character
        uint16_t charInfo = composeLookup(ch);
        uint16_t type = charInfo & ComposeData::TYPE_MASK;
        uint16_t index = charInfo >> ComposeData::INDEX_SHIFT;

        if (type == ComposeData::BASE) {
            if (buffer.size() > 0 && chFromText && explodePos == EMPTY) {
                // When we hit a base char in the source text, we can return the text
                // that's been composed so far.  We'll re-process this char next time through.
                break;
            }
            classesSeen = 0;
            baseIndex = index;
            basePos = buffer.size();
            buffer += ch;
            lastBase = ch;
        }
        else if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING)
        {
            // cclass is used as a bit mask (it is AND-ed / OR-ed with classesSeen).
            uint32_t cclass = ComposeData::typeMask[index];

            // We can only combine a character with the base if we haven't
            // already seen a combining character with the same canonical class.
            if (type == ComposeData::COMBINING && (classesSeen & cclass) == 0
                && (action = composeAction(baseIndex, index)) > 0)
            {
                if (action > ComposeData::MAX_COMPOSED) {
                    // Pairwise explosion.  Actions above this value are really
                    // indices into an array that in turn contains indices
                    // into the exploding string table
                    // TODO: What if there are unprocessed chars in the explode buffer?
                    UChar newBase = pairExplode(explodeBuf, action);
                    explodePos = 0;
                    buffer[basePos] = newBase;

                    baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
                    lastBase = newBase;
                } else {
                    // Normal pairwise combination.  Replace the base char
                    UChar newBase = (UChar) action;
                    buffer[basePos] = newBase;

                    baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
                    lastBase = newBase;
                }
                //
                // Since there are Unicode characters that cannot be combined in arbitrary
                // order, we have to re-process any combining marks that go with this
                // base character.  There are only four characters in Unicode that have
                // this problem.  If they are fixed in Unicode 3.0, this code can go away.
                //
                UTextOffset len = buffer.size();
                if (len - basePos > 1) {
                    // Queue everything after the base for another pass.
                    for (UTextOffset j = basePos+1; j < len; j++) {
                        explodeBuf += buffer[j];
                    }
                    buffer.truncate(basePos+1);
                    classesSeen = 0;
                    if (explodePos == EMPTY) explodePos = 0;
                }
            } else {
                // No combination with this character
                bubbleAppend(buffer, ch, cclass);
                classesSeen |= cclass;
            }
        }
        else if (index > minExplode) {
            // Single exploding character
            explode(explodeBuf, index);
            explodePos = 0;
        }
        else if (type == ComposeData::HANGUL && minExplode == 0) {
            // If we're in compatibility mode we need to decompose Hangul to Jamo,
            // because some of the Jamo might have compatibility decompositions.
            hangulToJamo(ch, explodeBuf, minDecomp);
            explodePos = 0;
        }
        else if (type == ComposeData::INITIAL_JAMO) {
            if (buffer.size() > 0 && chFromText && explodePos == EMPTY) {
                // When we hit a base char in the source text, we can return the text
                // that's been composed so far.  We'll re-process this char next time through.
                break;
            }
            classesSeen = 0;
            baseIndex = ComposeData::INITIAL_JAMO_INDEX;
            basePos = buffer.size();
            buffer += ch;
        }
        else if (type == ComposeData::MEDIAL_JAMO && classesSeen == 0
                    && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
            // If the last character was an initial jamo, we can combine it with this
            // one to create a Hangul character.
            uint16_t l = buffer[basePos] - JAMO_LBASE;
            uint16_t v = ch - JAMO_VBASE;
            UChar newCh = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
            buffer[basePos] = newCh;

            baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
        }
        else if (type == ComposeData::FINAL_JAMO && classesSeen == 0
                    && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
            // If the last character was a medial jamo that we turned into Hangul,
            // we can add this character too.
            UChar newCh = (UChar)(buffer[basePos] + (ch - JAMO_TBASE));
            buffer[basePos] = newCh;

            baseIndex = 0;
            basePos = -1;
            classesSeen = 0;
        } else {
            // TODO: deal with JAMO character types
            baseIndex = 0;
            basePos = -1;
            classesSeen = 0;
            buffer += ch;
        }

        // Fetch the next character: advance the text iterator unless there
        // are queued exploded characters still waiting to be processed.
        if (explodePos == EMPTY) {
            ch = text->next();
            chFromText = TRUE;
        } else {
            ch = explodeBuf[explodePos++];
            if (explodePos >= explodeBuf.size()) {
                explodePos = EMPTY;
                explodeBuf.truncate(0);
            }
            chFromText = FALSE;
        }
    }
    // Hand back the first buffered character; the remaining ones are served
    // by next() until bufferPos reaches bufferLimit.
    if (buffer.size() > 0) {
        bufferLimit = buffer.size() - 1;
        ch = buffer[0];
    } else {
        ch = DONE;
        bufferLimit = 0;
    }
    return ch;
}

/**
 * Compose starting with the input UChar just before the current position
 * and continuing backward until (and including) the previous base char.
 * <p>
 * <b>Input</b>:
 * <ul>
 *  <li>underlying char iter points just after last char to compose
 * </ul>
 * <p>
 * <b>Output:</b>
 * <ul>
 *  <li>returns last char of the resulting composed sequence, or DONE if
 *      nothing could be read
 *  <li>the composed sequence is left in <tt>buffer</tt>; when it is longer
 *      than one character, <tt>bufferPos</tt> and <tt>bufferLimit</tt> both
 *      index its last character
 * </ul>
 */
UChar Normalizer::prevCompose()
{
    UErrorCode status = U_ZERO_ERROR;
    initBuffer();

    // Walk backward, prepending each character so the buffer stays in text
    // order, and stop after the first character that can begin a sequence.
    for (UChar prev = curBackward(); prev != DONE; prev = curBackward()) {
        insert(buffer, 0, prev);

        const uint16_t type = composeLookup(prev) & ComposeData::TYPE_MASK;
        if (type == ComposeData::BASE || type == ComposeData::HANGUL
            || type == ComposeData::INITIAL_JAMO || type == ComposeData::IGNORE)
        {
            break;
        }
    }

    // Nothing was collected: report the start of the text.
    if (buffer.size() <= 0) {
        return DONE;
    }

    // TODO: The performance of this is awful; add a way to compose
    // a UnicodeString& in place.
    UnicodeString composed;
    compose(buffer, (fMode & COMPAT_BIT), fOptions, composed, status);
    buffer.truncate(0);
    buffer += composed;

    // A single character needs no buffer bookkeeping.
    if (buffer.size() <= 1) {
        return buffer[0];
    }

    // Position at the end so previous() can walk back through the rest.
    bufferPos = buffer.size() - 1;
    bufferLimit = bufferPos;
    return buffer[bufferPos];
}

/**
 * Inserts <tt>ch</tt> into <tt>target</tt>, moving it backward from the end
 * past any characters whose compose class is greater than <tt>cclass</tt>.
 * The scan never examines index 0, so the character is never placed before
 * the first character of a non-empty string.
 */
void Normalizer::bubbleAppend(UnicodeString& target, UChar ch, uint32_t cclass) {
    UTextOffset slot = target.size() - 1;
    while (slot > 0) {
        const uint32_t seen = getComposeClass(target[slot]);

        // 1 means combining class 0.  Either that or a class no greater than
        // ours is something we can't bubble past.
        if (seen == 1 || seen <= cclass) {
            break;
        }
        --slot;
    }
    // The new character goes immediately after position "slot"
    insert(target, slot + 1, ch);
}


/**
 * Returns the ComposeData::typeMask entry for a combining character
 * (composing or non-composing), or 0 for every other kind of character.
 */
uint32_t Normalizer::getComposeClass(UChar ch) {
    const uint16_t info = composeLookup(ch);
    const uint16_t kind = info & ComposeData::TYPE_MASK;

    if (kind != ComposeData::COMBINING && kind != ComposeData::NON_COMPOSING_COMBINING) {
        return 0;
    }
    return ComposeData::typeMask[info >> ComposeData::INDEX_SHIFT];
}

/**
 * Looks up the packed type/index word for <tt>ch</tt> in ComposeData::lookup.
 * Callers split it with TYPE_MASK and INDEX_SHIFT.
 */
uint16_t Normalizer::composeLookup(UChar ch) {
  const uint16_t info = ucmp16_getu(ComposeData::lookup, ch);
  return info;
}

/**
 * Looks up the action for combining the base with index <tt>baseIndex</tt>
 * and the combining character with index <tt>comIndex</tt>.  The two indices
 * are flattened into one table key: baseIndex + MAX_BASES * comIndex.
 */
uint16_t Normalizer::composeAction(uint16_t baseIndex, uint16_t comIndex)
{
  const UChar cell = (UChar)(baseIndex + ComposeData::MAX_BASES*comIndex);
  return ucmp16_getu(ComposeData::actions, cell);
}

/**
 * Appends to <tt>target</tt> the zero-terminated run of characters that
 * starts at <tt>index</tt> in ComposeData::replace (terminator excluded).
 */
void Normalizer::explode(UnicodeString& target, uint16_t index) {
    for (;;) {
        const UChar piece = ComposeData::replace[index++];
        if (piece == 0) {
            break;
        }
        target += piece;
    }
}

/**
 * Handles a pairwise explosion.  <tt>action</tt> (offset by MAX_COMPOSED)
 * selects an entry of ComposeData::actionIndex, which in turn points into
 * ComposeData::replace: the first character there is the new base, and the
 * zero-terminated run after it is appended to <tt>target</tt>.
 *
 * @return the new base character.
 */
UChar Normalizer::pairExplode(UnicodeString& target, uint16_t action) {
    const uint16_t start = ComposeData::actionIndex[action - ComposeData::MAX_COMPOSED];
    const UChar newBase = ComposeData::replace[start];
    explode(target, start + 1);
    return newBase;
}

//-------------------------------------------------------------------------
// Decompose methods
//-------------------------------------------------------------------------

void
Normalizer::decompose(const UnicodeString& source,
              bool_t compat,
              int32_t options,
              UnicodeString& result,
              UErrorCode &status)
{
  if (U_FAILURE(status)) {
    return;
  }
  bool_t     hangul = (options & IGNORE_HANGUL) == 0;
  uint16_t     limit  = compat ? 0 : DecompData::MAX_COMPAT;

  result.truncate(0);

  for (UTextOffset i = 0; i < source.size(); ++i) {
    UChar ch = source[i];

    uint16_t offset = ucmp16_getu(DecompData::offsets, ch);


    if (offset > limit) {
      doAppend(DecompData::contents, offset, result);
    } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) {
      hangulToJamo(ch, result, limit);
    } else {
      result += ch;
    }
  }
  fixCanonical(result);
}

/**
 * Decompose starting with current input character and continuing
 * until just before the next base char.
 * <p>
 * <b>Input</b>:
 * <ul>
 *  <li>underlying char iter points to first character to decompose
 * </ul>
 * <p>
 * <b>Output:</b>
 * <ul>
 *  <li>returns first char of decomposition or DONE if at end
 *  <li>Underlying char iter is pointing at next base char or past end
 *  <li>when more than the returned character was produced, the sequence is
 *      in <tt>buffer</tt> and <tt>bufferLimit</tt> indexes its last character
 * </ul>
 */
UChar Normalizer::nextDecomp()
{
  // Hangul syllables are expanded unless the IGNORE_HANGUL option is set.
  bool_t hangul = ((fOptions & IGNORE_HANGUL) == 0);
  UChar ch = curForward();

  uint16_t offset = ucmp16_getu(DecompData::offsets, ch);

  // Take the buffered path if the character decomposes in this mode, or if
  // it is not a base character (and so may need reordering).
  if (offset > minDecomp || ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
    {
      initBuffer();

      if (offset > minDecomp) {
    doAppend(DecompData::contents, offset, buffer);
      } else {
    buffer += ch;
      }
      bool_t needToReorder = FALSE;

      // Any other combining characters that immediately follow the decomposed
      // character must be included in the buffer too, because they're
      // conceptually part of the same logical character.
      //
      // TODO: Might these need to be decomposed too?
      // (i.e. are there non-BASE characters with decompositions?
      //
      while ((ch = text->next()) != DONE
         && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
        {
      needToReorder = TRUE;
      buffer += ch;
        }

      if (buffer.size() > 1 && needToReorder) {
    // If there is more than one combining character in the buffer,
    // put them into the canonical order.
    // But we don't need to sort if the only characters are the ones that
    // resulted from decomposing the base character.
    fixCanonical(buffer);
      }
      // Return the first buffered character; next() serves the rest.
      bufferLimit = buffer.size() - 1;
      ch = buffer[0];
    } else {
      // Just use this character, but first advance to the next one
      text->next();

      // Do Hangul -> Jamo decomposition if necessary
      if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
    initBuffer();
    hangulToJamo(ch, buffer, minDecomp);
    bufferLimit = buffer.size() - 1;
    ch = buffer[0];
      }
    }
  return ch;
}


/**
 * Decompose starting with the input char just before the current position
 * and continuing backward until (and including) the previous base char.
 * <p>
 * <b>Input</b>:
 * <ul>
 *  <li>underlying char iter points just after last char to decompose
 * </ul>
 * <p>
 * <b>Output:</b>
 * <ul>
 *  <li>returns last char of resulting decomposition sequence
 *  <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
 *  <li>when the buffered path is taken, <tt>bufferPos</tt> and
 *      <tt>bufferLimit</tt> both index the last character of <tt>buffer</tt>
 * </ul>
 */
UChar Normalizer::prevDecomp() {
    // Hangul syllables are expanded unless the IGNORE_HANGUL option is set.
    bool_t hangul = (fOptions & IGNORE_HANGUL) == 0;

    UChar ch = curBackward();

    uint16_t offset = ucmp16_getu(DecompData::offsets, ch);

    // Take the buffered path if the character decomposes in this mode, or if
    // it is not a base character.
    if (offset > minDecomp || ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
    {
        initBuffer();

        // Slurp up any combining characters till we get to a base char.
        // Each one is prepended so the buffer stays in text order.
        while (ch != DONE && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE) {
            insert(buffer, 0, ch);
            ch = text->previous();
        }

        // Now decompose this base character
        // NOTE(review): ch may be DONE here if the start of the text was
        // reached while slurping; the lookup is then done on DONE itself.
        offset = ucmp16_getu(DecompData::offsets, ch);
        if (offset > minDecomp) {
            doInsert(DecompData::contents, offset, buffer, 0);
        } else {
            // This is a base character that doesn't decompose
            // and isn't involved in reordering, so throw it back
            text->next();
        }

        if (buffer.size() > 1) {
            // If there is more than one combining character in the buffer,
            // put them into the canonical order.
            fixCanonical(buffer);
        }
        // Start at the end of the buffer; previous() walks back from here.
        bufferLimit = bufferPos = buffer.size() - 1;
        ch = buffer[bufferPos];
    }
    else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
        initBuffer();
        hangulToJamo(ch, buffer, minDecomp);
        bufferLimit = bufferPos = buffer.size() - 1;
        ch = buffer[bufferPos];
    }
    return ch;
}

/**
 * Returns the entry for <tt>ch</tt> in the DecompData::canonClass table.
 */
uint8_t Normalizer::getClass(UChar ch) {
    const uint8_t canonicalClass = ucmp8_get(DecompData::canonClass, ch);
    return canonicalClass;
}

/**
 * Fixes the sorting sequence of non-spacing characters according to
 * their combining class.  The algorithm is listed on p.3-11 in the
 * Unicode Standard 2.0.  The table of combining classes is on p.4-2
 * in the Unicode Standard 2.0.
 * <p>
 * The string is scanned from the end toward the start.  Whenever a character
 * has a higher class than its successor, and the successor is not a base
 * character, the two are swapped and the scan backs up to re-check the
 * position after the swap.
 * <p>
 * Strings with fewer than two characters are returned untouched: there is
 * nothing to reorder, and this also avoids indexing position -1 of an empty
 * string (decompose() passes an empty result here for empty input).
 *
 * @param result the string to fix.
 */
void Normalizer::fixCanonical(UnicodeString& result) {
    if (result.size() < 2) {
        return;
    }

    UTextOffset i = result.size() - 1;
    uint8_t currentType = getClass(result[i]);
    uint8_t lastType;

    for (--i; i >= 0; --i) {
        lastType = currentType;
        currentType = getClass(result[i]);

        //
        // a swap is presumed to be rare (and a double-swap very rare),
        // so don't worry about efficiency here.
        //
        if (currentType > lastType && lastType != DecompData::BASE) {
            // swap characters
            UChar temp = result[i];
            result[i] = result[i+1];
            result[i+1] = temp;

            // if not at end, backup (one further, to compensate for for-loop)
            if (i < result.size() - 2) {
                i += 2;
            }
            // reset type, since we swapped.
            currentType = getClass(result[i]);
        }
    }
}


//-------------------------------------------------------------------------
// CharacterIterator overrides
//-------------------------------------------------------------------------

/**
 * Return the current character in the normalized text.
 * <p>
 * The character is computed lazily: if <tt>currentChar</tt> is still DONE,
 * it is produced now according to the mode and cached.  That requires
 * mutating this object from a const method, hence the cast below.
 */
UChar Normalizer:: current() const
{
  // Already known: nothing to compute.
  if (currentChar != DONE) {
    return currentChar;
  }

  // TODO: make this method const and guarantee that currentChar is always set?
  Normalizer *self = (Normalizer*)this;

  switch (fMode) {
  case NO_OP:
    self->currentChar = text->current();
    break;
  case COMPOSE:
  case COMPOSE_COMPAT:
    self->currentChar = self->nextCompose();
    break;
  case DECOMP:
  case DECOMP_COMPAT:
    self->currentChar = self->nextDecomp();
    break;
  }
  return currentChar;
}

/**
 * Repositions this <tt>Normalizer</tt> at the start index of the input text
 * and returns the first character of the normalized text from there.
 */
UChar Normalizer::first() {
    const UTextOffset begin = text->startIndex();
    return setIndex(begin);
}

/**
 * Return the last character in the normalized text.  The input iterator is
 * moved to its end index, all cached output is discarded, and the result of
 * stepping backward once with previous() is returned.
 */
UChar Normalizer::last() {
  const UTextOffset end = text->endIndex();
  text->setIndex(end);

  // Nothing computed so far applies to the new position.
  currentChar = DONE;
  clearBuffer();

  return previous();
}

/**
 * Return the next character in the normalized text and advance
 * the iteration position by one.  Characters still waiting in the output
 * buffer are served first; otherwise a new run is produced from the input
 * according to the mode.  DONE is returned once the end of the text has
 * been reached.
 */
UChar Normalizer::next() {
  if (bufferPos >= bufferLimit) {
    // The buffer is used up (or was never filled), so it is out of date.
    bufferPos = 0;
    bufferLimit = 0;
    switch (fMode) {
    case NO_OP:
      currentChar = text->next();
      break;
    case COMPOSE:
    case COMPOSE_COMPAT:
      currentChar = nextCompose();
      break;
    case DECOMP:
    case DECOMP_COMPAT:
      currentChar = nextDecomp();
      break;
    }
  }
  else {
    // Step to the next buffered output character.
    ++bufferPos;
    currentChar = buffer[bufferPos];
  }
  return currentChar;
}

/**
 * Return the previous character in the normalized text and decrement
 * the iteration position by one.  Characters still waiting in the output
 * buffer are served first; otherwise a new run is produced by reading the
 * input backward according to the mode.  DONE is returned once the beginning
 * of the text has been reached.
 */
UChar Normalizer::previous()
{
  if (bufferPos <= 0) {
    // Nothing left before the buffer position, so the buffer is out of date.
    bufferPos = 0;
    bufferLimit = 0;
    switch (fMode) {
    case NO_OP:
      currentChar = text->previous();
      break;
    case COMPOSE:
    case COMPOSE_COMPAT:
      currentChar = prevCompose();
      break;
    case DECOMP:
    case DECOMP_COMPAT:
      currentChar = prevDecomp();
      break;
    }
  }
  else {
    // Step back to the preceding buffered output character.
    --bufferPos;
    currentChar = buffer[bufferPos];
  }
  return currentChar;
}

void Normalizer::reset()
{
    text->setIndex(text->startIndex());
    currentChar = DONE;     // The current char hasn't been processed
    clearBuffer();          // The buffer is empty too
}

/**
 * Set the iteration position in the input text that is being normalized
 * and return the first normalized character at that position.
 * <p>
 * <b>Note:</b> This method sets the position in the <em>input</em> text,
 * while next() and previous() iterate through characters in the normalized
 * <em>output</em>.  There is therefore not necessarily a one-to-one
 * correspondence between the characters they return and the indices passed
 * to setIndex() and returned from getIndex().
 * <p>
 * @param index the desired index in the input text.  Range checking is left
 *              to the underlying iterator's setIndex().
 *
 * @return      the first normalized character that is the result of iterating
 *              forward starting at the given index.
 */
UChar Normalizer::setIndex(UTextOffset index)
{
    // Move the input first (the iterator checks the range) ...
    text->setIndex(index);

    // ... then invalidate everything derived from the old position.
    currentChar = DONE;
    clearBuffer();

    return current();
}

/**
 * Retrieve the current iteration position in the input text that is
 * being normalized.  This method is useful in applications such as
 * searching, where you need to be able to determine the position in
 * the input text that corresponds to a given normalized output character.
 * <p>
 * <b>Note:</b> This method reports the position in the <em>input</em>, while
 * next() and previous() iterate through characters in the <em>output</em>.
 * There is therefore not necessarily a one-to-one correspondence between the
 * characters they return and the indices passed to setIndex() and returned
 * from getIndex().
 */
UTextOffset Normalizer::getIndex() const {
    const UTextOffset position = text->getIndex();
    return position;
}

/**
 * Retrieve the index of the start of the input text, as reported by the
 * underlying <tt>CharacterIterator</tt> over which this <tt>Normalizer</tt>
 * is iterating.
 */
UTextOffset Normalizer::startIndex() const {
    const UTextOffset begin = text->startIndex();
    return begin;
}

/**
 * Retrieve the index of the end of the input text, as reported by the
 * underlying <tt>CharacterIterator</tt> over which this <tt>Normalizer</tt>
 * is iterating.
 */
UTextOffset Normalizer::endIndex() const {
    const UTextOffset end = text->endIndex();
    return end;
}

//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------

/**
 * Select the normalization mode and keep <tt>minDecomp</tt> in step with it:
 * when the mode has <tt>COMPAT_BIT</tt> set, <tt>minDecomp</tt> becomes 0;
 * otherwise it becomes <tt>DecompData::MAX_COMPAT</tt>.
 *
 * @param newMode the mode to switch to.
 */
void
Normalizer::setMode(EMode newMode)
{
    fMode = newMode;

    if ((fMode & COMPAT_BIT) != 0) {
        minDecomp = 0;
    } else {
        minDecomp = DecompData::MAX_COMPAT;
    }
}

/**
 * Return the normalization mode currently stored in this object.
 */
Normalizer::EMode
Normalizer::getMode() const
{
    return fMode;
}

/**
 * Turn the option bit(s) given by <tt>option</tt> on or off.
 *
 * @param option the bit mask of the option(s) to change.
 * @param value  non-zero to set the bits, zero to clear them.
 */
void
Normalizer::setOption(int32_t option,
                      bool_t value)
{
    fOptions = value ? (fOptions | option)
                     : (fOptions & (~option));
}

/**
 * Report whether any of the bits in <tt>option</tt> is currently set.
 *
 * @param option the bit mask of the option(s) to test.
 * @return       non-zero if at least one of the requested bits is set.
 */
bool_t
Normalizer::getOption(int32_t option) const
{
    return 0 != (fOptions & option);
}

/**
 * Set the input text over which this <tt>Normalizer</tt> will iterate.
 * The iteration position is set to the beginning of the input text.
 */
void
Normalizer::setText(const UnicodeString& newText,
            UErrorCode &status)
{
  if (U_FAILURE(status)) {
    return;
  }
  CharacterIterator *newIter = new StringCharacterIterator(newText);
  if (newIter == NULL) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return;
  }
  delete text;
  text = newIter;
  reset();
}

/**
 * Set the input text over which this <tt>Normalizer</tt> will iterate.
 * The iteration position is set to the beginning of the string.
 */
void
Normalizer::setText(const CharacterIterator& newText,
            UErrorCode &status)
{
  if (U_FAILURE(status)) {
    return;
  }
  CharacterIterator *newIter = newText.clone();
  if (newIter == NULL) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return;
  }
  delete text;
  text = newIter;
  reset();
}


/**
 * Copies the text under iteration into the UnicodeString referred to by "result".
 * @param result Receives a copy of the text under iteration.
 */
void
Normalizer::getText(UnicodeString& result)
{
    // The input iterator knows how to copy out its own text.
    text->getText(result);
}


//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------


/**
 * Return the character the input iterator reports as its current one.
 */
UChar Normalizer::curForward()
{
    return text->current();
}

/**
 * Return the result of calling <tt>previous()</tt> on the input iterator.
 */
UChar Normalizer::curBackward()
{
    return text->previous();
}

/**
 * Append a run of characters from <tt>source</tt> to <tt>dest</tt>.
 *
 * <tt>offset</tt> packs two fields: the start index
 * (<tt>offset >> STR_INDEX_SHIFT</tt>) and the run length
 * (<tt>offset & STR_LENGTH_MASK</tt>).  A length of zero means the run is
 * terminated by a 0x0000 character instead of being counted.
 *
 * @param source the table of characters to copy from.
 * @param offset the packed index/length value.
 * @param dest   the string that receives the characters.
 */
void Normalizer::doAppend(const UChar source[], uint16_t offset, UnicodeString& dest)
{
    uint16_t index  = offset >> STR_INDEX_SHIFT;
    uint16_t length = offset & STR_LENGTH_MASK;

    if (length != 0) {
        // Counted run: copy exactly `length` characters.
        for (; length > 0; --length) {
            dest += source[index++];
        }
        return;
    }

    // Zero-terminated run: copy up to, but not including, the terminator.
    for (UChar ch = source[index++]; ch != 0x0000; ch = source[index++]) {
        dest += ch;
    }
}

/**
 * Insert a run of characters from <tt>source</tt> into <tt>dest</tt>,
 * starting at <tt>pos</tt> and keeping the run's order.
 *
 * <tt>offset</tt> packs two fields: the start index
 * (<tt>offset >> STR_INDEX_SHIFT</tt>) and the run length
 * (<tt>offset & STR_LENGTH_MASK</tt>).  A length of zero means the run is
 * terminated by a 0x0000 character instead of being counted.
 *
 * @param source the table of characters to copy from.
 * @param offset the packed index/length value.
 * @param dest   the string that receives the characters.
 * @param pos    the position in <tt>dest</tt> of the first inserted character.
 */
void Normalizer::doInsert(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos)
{
    uint16_t index  = offset >> STR_INDEX_SHIFT;
    uint16_t length = offset & STR_LENGTH_MASK;

    if (length != 0) {
        // Counted run: insert exactly `length` characters.
        for (; length > 0; --length) {
            insert(dest, pos++, source[index++]);
        }
        return;
    }

    // Zero-terminated run: insert up to, but not including, the terminator.
    for (UChar ch = source[index++]; ch != 0x0000; ch = source[index++]) {
        insert(dest, pos++, ch);
    }
}

/**
 * Empty the output buffer and reset the buffer position and limit.
 */
void Normalizer::initBuffer()
{
    buffer.truncate(0);   // discard the buffered characters
    clearBuffer();        // and rewind the bookkeeping indices
}

/**
 * Reset the buffer position and limit to zero.  The contents of
 * <tt>buffer</tt> itself are not touched here.
 */
void Normalizer::clearBuffer()
{
    bufferPos   = 0;
    bufferLimit = 0;
}

//-----------------------------------------------------------------------------
// Hangul / Jamo conversion utilities for internal use
// See section 3.10 of The Unicode Standard, v 2.0.
//
/**
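 * Convert a single Hangul syllable into one or more Jamo characters.
 *
 * @param ch          the Hangul syllable to decompose.
 * @param result      the string that the resulting Jamo are appended to.
 * @param decompLimit passed through to <tt>jamoAppend</tt>: a Jamo whose
 *                    decomposition offset is greater than this limit is
 *                    appended in decomposed form rather than as-is.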
 */
void Normalizer::hangulToJamo(UChar ch, UnicodeString& result, uint16_t decompLimit)
{
    // Zero-based position of the syllable, counted from HANGUL_BASE.
    const UChar syllable = (UChar)(ch - HANGUL_BASE);

    // Split that position into leading, vowel and trailing components.
    const UChar lead  = (UChar)(JAMO_LBASE + syllable / JAMO_NCOUNT);
    const UChar vowel = (UChar)(JAMO_VBASE +
                                (syllable % JAMO_NCOUNT) / JAMO_TCOUNT);
    const UChar trail = (UChar)(JAMO_TBASE + (syllable % JAMO_TCOUNT));

    jamoAppend(lead, decompLimit, result);
    jamoAppend(vowel, decompLimit, result);

    // A trailing component equal to JAMO_TBASE means there is none to emit.
    if (trail == JAMO_TBASE) {
        return;
    }
    jamoAppend(trail, decompLimit, result);
}

/**
 * Append a single Jamo to <tt>dest</tt>, either as-is or decomposed.
 *
 * The character's entry in <tt>DecompData::offsets</tt> is looked up; if it
 * is greater than <tt>decompLimit</tt>, the corresponding run from
 * <tt>DecompData::contents</tt> is appended, otherwise <tt>ch</tt> itself is.
 *
 * @param ch          the Jamo to append.
 * @param decompLimit the threshold the looked-up offset is compared against.
 * @param dest        the string that receives the output.
 */
void Normalizer::jamoAppend(UChar ch, uint16_t decompLimit, UnicodeString& dest)
{
    const uint16_t offset = ucmp16_getu(DecompData::offsets, ch);

    if (offset <= decompLimit) {
        dest += ch;
        return;
    }
    doAppend(DecompData::contents, offset, dest);
}

/**
 * Compose conjoining Jamo in <tt>buffer</tt> into Hangul syllables, in place.
 *
 * Starting at <tt>start</tt>, every leading-consonant + vowel pair (optionally
 * followed by a trailing consonant) is replaced by the single precomposed
 * syllable; all other characters are copied through unchanged.  The buffer is
 * truncated to the new, possibly shorter, length.
 *
 * @param buffer the text to compose; modified in place.
 * @param start  the index of the first character to examine.  Characters
 *               before it are left untouched.
 */
void Normalizer::jamoToHangul(UnicodeString& buffer, UTextOffset start) {
    UTextOffset out = start;
    // Last index that still has a successor; a leading consonant at the very
    // end of the buffer cannot start a syllable.
    UTextOffset limit = buffer.size() - 1;

    UTextOffset in;

    for (in = start; in < limit; in++) {
        UChar ch = buffer[in];

        // The indices are signed so that a character below the respective
        // base yields a negative value and fails the range check directly,
        // instead of relying on unsigned wraparound.
        int32_t l = ch - JAMO_LBASE;
        int32_t v = -1;
        if (l >= 0 && l < JAMO_LCOUNT) {
            v = buffer[in+1] - JAMO_VBASE;
        }

        if (v >= 0 && v < JAMO_VCOUNT) {
            //
            // We've found a leading consonant + vowel pair to compose.
            // Consume the vowel and see if there's also a trailing consonant.
            //
            in++;

            int32_t t = 0;  // 0 means "no trailing consonant"
            if (in < limit) {
                t = buffer.charAt(in+1) - JAMO_TBASE;

                // Index 0 is JAMO_TBASE itself, which is the "no trailing
                // consonant" slot rather than a real trailing consonant, so
                // it must not be consumed (it would be silently dropped).
                if (t > 0 && t < JAMO_TCOUNT) {
                    in++;   // Consume the trailing consonant too
                } else {
                    t = 0;  // No trailing consonant
                }
            }
            buffer[out++] = (UChar)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT + t + HANGUL_BASE);
        } else {
            buffer[out++] = ch;
        }
    }

    // Copy whatever is left (at most the final character) unchanged.
    while (in < buffer.size()) {
        buffer[out++] = buffer[in++];
    }

    buffer.truncate(out);
}