scuffed-code/icu4c/source/i18n/coleitr.cpp

/*
*****************************************************************************************
*                                                                                       *
* COPYRIGHT:                                                                            *
*   (C) Copyright Taligent, Inc.,  1996                                                 *
*   (C) Copyright International Business Machines Corporation,  1996-1998               *
*   Licensed Material - Program-Property of IBM - All Rights Reserved.                  *
*   US Government Users Restricted Rights - Use, duplication, or disclosure             *
*   restricted by GSA ADP Schedule Contract with IBM Corp.                              *
*                                                                                       *
*****************************************************************************************
*/
//=============================================================================
//
// File coleitr.cpp
//
// 
//
// Created by: Helena Shih
//
// Modification History:
//
//  Date         Name          Description
//
//  6/23/97     helena      Adding comments to make code more readable.
// 08/03/98     erm         Synched with 1.2 version of CollationElementIterator.java
//=============================================================================

#include "sortkey.h"
#include "coleitr.h"

#include "chariter.h"
#include "tables.h"
#include "normlzr.h"
#include "unicode.h"

// Sentinel order returned by next()/previous() when there is nothing (more)
// to return.
const int32_t CollationElementIterator::NULLORDER = 0xffffffff;

// Marker order returned by next() immediately before the order of a character
// for which the collator has no mapping.
const int32_t CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;


// Default constructor: every member starts out zero/NULL, so the iterator has
// neither source text nor a collator.
// (Original note: this private method will never be called, but it makes the
// linker happy.)

CollationElementIterator::CollationElementIterator()
    : expIndex(0),
      text(NULL),
      swapOrder(0),
      bufferAlias(NULL),
      orderAlias(NULL)
{
}

// Constructs an iterator that is bound to the given collator but has no source
// text yet (text stays NULL).
// (Original note: this private method will never be called, but it makes the
// linker happy.)

CollationElementIterator::CollationElementIterator(const RuleBasedCollator* order)
    : expIndex(0),
      text(NULL),
      bufferAlias(NULL),
      swapOrder(0),
      orderAlias(order)
{
}

// The "real" constructor for string input: builds an iterator over sourceText
// that uses the given collator.
//
// Nothing is allocated when status already signals a failure on entry or when
// sourceText is empty; in both cases text stays NULL.  If the Normalizer
// cannot be allocated, status is set to U_MEMORY_ALLOCATION_ERROR.
CollationElementIterator::CollationElementIterator( const UnicodeString& sourceText,
                                                    const RuleBasedCollator* order,
                                                    UErrorCode& status)
: expIndex(0),
  swapOrder(0),
  text(NULL),
  bufferAlias(NULL),
  orderAlias(order)
{
    if (FAILURE(status) || sourceText.size() == 0) {
        return;
    }

    //
    // A CollationElementIterator works in two layers.  A Normalizer first
    // rewrites the source text so that "composed" Unicode characters (for
    // example a precomposed accented letter) are split into a base character
    // followed by a combining accent.  On top of that, this class does its
    // own processing for expanding and contracting collation sequences,
    // ignorables, and so on.
    //
    // With IDENTICAL strength the normalizer is created in NO_OP mode;
    // otherwise it uses the collator's decomposition mode.
    //
    Normalizer::EMode decomp;
    if (order->getStrength() == Collator::IDENTICAL) {
        decomp = Normalizer::NO_OP;
    } else {
        decomp = order->getDecomposition();
    }

    text = new Normalizer(sourceText, decomp);
    if (text == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}


// The "real" constructor for CharacterIterator input: builds an iterator over
// sourceText that uses the given collator.
//
// Nothing is allocated when status already signals a failure on entry or when
// sourceText.endIndex() is 0; in both cases text stays NULL.  If the
// Normalizer cannot be allocated, status is set to U_MEMORY_ALLOCATION_ERROR.
CollationElementIterator::CollationElementIterator( const CharacterIterator& sourceText,
                                                    const RuleBasedCollator* order,
                                                    UErrorCode& status)
: expIndex(0),
  swapOrder(0),
  text(NULL),
  bufferAlias(NULL),
  orderAlias(order)
{
    if (FAILURE(status)) {
        return;
    }

    // **** open question carried over from the original author:
    //      should this end-index test just be dropped? ****
    if (sourceText.endIndex() == 0) {
        return;
    }

    //
    // A CollationElementIterator works in two layers.  A Normalizer first
    // rewrites the source text so that "composed" Unicode characters (for
    // example a precomposed accented letter) are split into a base character
    // followed by a combining accent.  On top of that, this class does its
    // own processing for expanding and contracting collation sequences,
    // ignorables, and so on.
    //
    // With IDENTICAL strength the normalizer is created in NO_OP mode;
    // otherwise it uses the collator's decomposition mode.
    //
    Normalizer::EMode decomp;
    if (order->getStrength() == Collator::IDENTICAL) {
        decomp = Normalizer::NO_OP;
    } else {
        decomp = order->getDecomposition();
    }

    text = new Normalizer(sourceText, decomp);
    if (text == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

// Copy constructor.
//
// The Normalizer is owned by each iterator (the destructor deletes it), so it
// is cloned rather than shared.  bufferAlias and orderAlias are aliases and
// are copied as plain pointers.
//
// other.text is NULL when the source iterator was built over empty text (the
// text constructors leave it NULL in that case), so it must be checked before
// cloning.
CollationElementIterator::CollationElementIterator(const    CollationElementIterator& other)
    : expIndex(other.expIndex),
      text(NULL),
      swapOrder(other.swapOrder),
      bufferAlias(other.bufferAlias),
      orderAlias(other.orderAlias)
{
    if (other.text != NULL)
    {
        text = (Normalizer*) other.text->clone();
    }
}

// Assignment operator.
//
// Replaces this iterator's state with a copy of other's.  The Normalizer is
// owned and therefore cloned; bufferAlias and orderAlias are aliases and are
// copied as plain pointers.  Self-assignment is a no-op.
//
// other.text is NULL when the source iterator was built over empty text, so
// it must be checked before cloning.
const   CollationElementIterator&
CollationElementIterator::operator=(const   CollationElementIterator& other)
{
    if (this != &other)
    {
        expIndex = other.expIndex;
        swapOrder = other.swapOrder;

        // Build the replacement first, then release the old normalizer.
        Normalizer* replacement = NULL;
        if (other.text != NULL)
        {
            replacement = (Normalizer*)other.text->clone();
        }

        delete text;
        text = replacement;

        bufferAlias = other.bufferAlias;
        orderAlias = other.orderAlias;
    }

    return *this;
}

// Destructor.  Only the Normalizer is deleted here; bufferAlias and orderAlias
// are aliases and are merely cleared.
CollationElementIterator::~CollationElementIterator()
{
    orderAlias = NULL;
    bufferAlias = NULL;

    delete text;
    text = NULL;
}

// Equality comparison.
//
// Two iterators are equal when their normalizers compare equal, they have the
// same pending unmapped-character state (swapOrder), the same expansion
// buffer contents and position (bufferAlias / expIndex), and they alias the
// same collator object.
//
// text is NULL for iterators built over empty text and bufferAlias is NULL
// whenever no expansion is in progress, so neither pointer may be
// dereferenced without a check.  A NULL pointer is only equal to another
// NULL pointer.
bool_t
CollationElementIterator::operator==(const CollationElementIterator& that) const
{
    if (this == &that)
    {
        return TRUE;
    }

    // Exactly one side has a normalizer -> different.
    if ((text == NULL) != (that.text == NULL))
    {
        return FALSE;
    }

    if (text != NULL && *text != *(that.text))
    {
        return FALSE;
    }

    if (swapOrder != that.swapOrder)
    {
        return FALSE;
    }

    // Exactly one side is in the middle of an expansion -> different.
    if ((bufferAlias == NULL) != (that.bufferAlias == NULL))
    {
        return FALSE;
    }

    if (bufferAlias != NULL && *bufferAlias != *(that.bufferAlias))
    {
        return FALSE;
    }

    if (expIndex != that.expIndex)
    {
        return FALSE;
    }

    if (orderAlias != that.orderAlias)
    {
        return FALSE;
    }

    return TRUE;
}

// Inequality comparison: the exact negation of operator==.
bool_t
CollationElementIterator::operator!=(const CollationElementIterator& other) const
{
    return !operator==(other);
}

/**
 * Moves the cursor back to the beginning of the string and discards any
 * pending expansion or unmapped-character state.  If a normalizer exists it
 * is reset and its mode is refreshed from the collator's decomposition mode.
 */
void
CollationElementIterator::reset()
{
    // Forget any half-delivered expansion / unmapped character.
    bufferAlias = NULL;
    expIndex = 0;
    swapOrder = 0;

    if (text == NULL)
    {
        return;
    }

    text->reset();
    text->setMode(orderAlias->getDecomposition());
}

// Sets the source to the new source string.
void
CollationElementIterator::setText(const UnicodeString&  source,
                                        UErrorCode&      status)
{
    if (FAILURE(status))
    {
        return;
    }

    bufferAlias = 0;
    swapOrder = 0;
    expIndex = 0;

    if (text == NULL)
    {
        text = new Normalizer(source, orderAlias->getDecomposition());
    }
    else
    {
        text->setText(source, status);
        text->setMode(orderAlias->getDecomposition());
    }
}

// Sets the source to the new character iterator.
//
// Does nothing if status already signals a failure on entry.  Otherwise any
// pending expansion / unmapped-character state is discarded and the
// normalizer is either created (if this iterator had none) or re-targeted at
// the new text, using the collator's decomposition mode.
//
// If the normalizer has to be created and the allocation fails, status is set
// to U_MEMORY_ALLOCATION_ERROR, matching what the constructors do.
void
CollationElementIterator::setText(CharacterIterator&  source,
                                        UErrorCode&      status)
{
    if (FAILURE(status)) {
        return;
    }

    bufferAlias = 0;
    swapOrder = 0;
    expIndex = 0;

    if (text == NULL) {
        text = new Normalizer(source, orderAlias->getDecomposition());
        if (text == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    else
    {
        text->setMode(orderAlias->getDecomposition());
        text->setText(source, status);
    }
}

/**
 * Get the ordering priority of the next character in the string.
 *
 * A single call returns exactly one order.  Characters that expand to
 * several orders, and unmapped characters (which are reported as two
 * orders), are delivered over consecutive calls using the bufferAlias /
 * expIndex / swapOrder members as carry-over state.
 *
 * @param status the error code status.  If it already signals a failure on
 * entry, NULLORDER is returned and nothing is consumed.
 * @return the next character's ordering.  Returns NULLORDER if
 * the end of string is reached or if this iterator has no text.
 */
int32_t
CollationElementIterator::next(UErrorCode& status)
{
    if (text == NULL || FAILURE(status))
    {
        return NULLORDER;
    }

    // Update the decomposition mode if necessary.
    // NOTE(review): this unconditionally applies the collator's decomposition
    // mode, which replaces the NO_OP mode the text constructors choose for
    // IDENTICAL strength -- confirm that is intended.
    text->setMode(orderAlias->getDecomposition());
    
    if (bufferAlias != NULL)
    {
        // bufferAlias needs a bit of an explanation.
        // When we hit an expanding character in the text, we call the order's
        // getExpandValueList method to retrieve the list of orderings for all
        // of the characters in the expansion (see the end of this method).
        // The first ordering is returned, and an alias to the orderings list
        // is saved so that the remaining orderings can be returned on subsequent
        // calls to next.  So, if the expanding buffer is not exhausted, 
        // all we have to do here is return the next ordering in the buffer.  
        if (expIndex < bufferAlias->size())
        {
            return strengthOrder(bufferAlias->at(expIndex++));
        }
        else
        {
            // Expansion fully delivered: drop the alias and fall through to
            // read a fresh character from the text.
            bufferAlias = NULL;
            expIndex = 0;
        }
    }
    else if (swapOrder != 0)
    {
        // If we find a character with no order, we return the marking
        // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character 
        // itself shifted left 16 bits as orders.  At this point, the
        // UNMAPPEDCHARVALUE flag has already been returned by the code
        // below, so just return the shifted character here.
        // Note that this value is returned as-is, without strengthOrder().
        int32_t order = swapOrder << 16;

        swapOrder = 0;

        return order;
    }

    // Gets the next character from the string using decomposition iterator:
    // read the character at the cursor, then advance past it.
    UChar ch = text->current();
    text->next();

    // NOTE(review): status is not passed to any call between the entry check
    // and this point, so this check looks redundant -- confirm before removing.
    if (FAILURE(status))
    {
        return NULLORDER;
    }

    if (ch == Normalizer::DONE)
    {
        return NULLORDER;
    }
    
    // Ask the collator for this character's ordering.
    int32_t value = orderAlias->getUnicodeOrder(ch);

    if (value == RuleBasedCollator::UNMAPPED)
    {
        // Returned an "unmapped" flag and save the character so it can be 
        // returned next time this method is called.
        // An unmapped U+0000 is special-cased: it yields a single order of 0
        // with no UNMAPPEDCHARVALUE marker (swapOrder == 0 means "nothing
        // pending", so it could not be carried over anyway).
        if (ch == 0x0000) return ch;
        swapOrder = ch;  // \u0000 is not valid in C++'s UnicodeString
        return UNMAPPEDCHARVALUE;
    }
    
    // Values at or above CONTRACTCHARINDEX mean ch may start a contracting
    // sequence; nextContractChar() resolves it and may consume more text.
    if (value >= RuleBasedCollator::CONTRACTCHARINDEX)
    {
        value = nextContractChar(ch, status);
    }

    // Values at or above EXPANDCHARINDEX identify an expansion: return its
    // first order now and keep the list so later calls deliver the rest.
    if (value >= RuleBasedCollator::EXPANDCHARINDEX)
    {
        bufferAlias = orderAlias->getExpandValueList(value);
        expIndex = 0;
        value = bufferAlias->at(expIndex++);
    }

    return strengthOrder(value);
}

 /**
  * Get the ordering priority of the previous collation element in the string.
  *
  * Mirrors next(): a single call returns exactly one order, and the
  * bufferAlias / expIndex / swapOrder members carry state between calls.
  * Expansions are delivered from their last order towards their first.
  *
  * @param status the error code status.  If it already signals a failure on
  * entry, NULLORDER is returned and nothing is consumed.
  * @return the previous element's ordering.  Returns NULLORDER if
  * the beginning of string is reached or if this iterator has no text.
  */
int32_t
CollationElementIterator::previous(UErrorCode& status)
{
    if (text == NULL || FAILURE(status))
    {
        return NULLORDER;
    }

    // Refresh the normalizer's mode from the collator on every call.
    text->setMode(orderAlias->getDecomposition());

    if (bufferAlias != NULL)
    {
        // An expansion is in progress: hand out the order just before the
        // current position in the expansion list.
        if (expIndex > 0)
        {
            return strengthOrder(bufferAlias->at(--expIndex));
        }

        // Expansion fully delivered: drop the alias and fall through to read
        // a fresh character from the text.
        bufferAlias = NULL;
        expIndex = 0;
    }
    else if (swapOrder != 0)
    {
        // Second half of an unmapped character (see below); returned as-is,
        // without strengthOrder().
        int32_t order = swapOrder << 16;

        swapOrder = 0;
        return order;
    }

    // Step the normalizer backwards and fetch the character it lands on.
    UChar ch = text->previous();

    if (ch == Normalizer::DONE)
    {
        return NULLORDER;
    }

    // Ask the collator for this character's ordering.
    int32_t value = orderAlias->getUnicodeOrder(ch);

    if (value == RuleBasedCollator::UNMAPPED)
    {
        // An unmapped U+0000 yields a single order of 0.
        if (ch == 0x0000) return ch;
        // NOTE(review): this is not symmetric with next().  next() returns
        // UNMAPPEDCHARVALUE and then (ch << 16); here the raw, unshifted ch
        // is returned first and UNMAPPEDCHARVALUE is stashed in swapOrder,
        // which the branch above then shifts left by 16 before returning it.
        // Confirm whether callers rely on this before changing it.
        swapOrder = UNMAPPEDCHARVALUE;
        return ch;
    }
    
    // Values at or above CONTRACTCHARINDEX mean ch may end a contracting
    // sequence; prevContractChar() resolves it and may consume more text.
    if (value >= RuleBasedCollator::CONTRACTCHARINDEX)
    {
        value = prevContractChar(ch, status);
    }

    // Values at or above EXPANDCHARINDEX identify an expansion: return its
    // last order now and keep the list so later calls deliver the rest in
    // reverse.
    if (value >= RuleBasedCollator::EXPANDCHARINDEX)
    {
        bufferAlias = orderAlias->getExpandValueList(value);
        expIndex = bufferAlias->size();
        value = bufferAlias->at(--expIndex);
    }

    return strengthOrder(value);
}

// Masks a raw collation order down to the differences that matter for the
// collator's current strength: PRIMARY and SECONDARY strengths each apply
// their mask, any other strength returns the order unchanged.
int32_t
CollationElementIterator::strengthOrder(int32_t order) const
{
    switch (orderAlias->getStrength())
    {
    case Collator::PRIMARY:
        return order & RuleBasedCollator::PRIMARYDIFFERENCEONLY;

    case Collator::SECONDARY:
        return order & RuleBasedCollator::SECONDARYDIFFERENCEONLY;

    default:
        // Nothing to mask off.
        return order;
    }
}

// Returns the current position in the source text.  The normalizer does the
// actual iteration, so its index is reported; an iterator without text
// reports 0.
UTextOffset
CollationElementIterator::getOffset() const
{
    if (text == NULL)
    {
        return 0;
    }

    return text->getIndex();
}

// Moves the iterator to newOffset in the source text and discards any pending
// expansion or unmapped-character state.  Does nothing if status already
// signals a failure on entry.
void
CollationElementIterator::setOffset(UTextOffset newOffset,
                                    UErrorCode& status)
{
    if (FAILURE(status))
    {
        return;
    }

    // Forget any half-delivered expansion / unmapped character.
    bufferAlias = NULL;
    expIndex = 0;
    swapOrder = 0;

    if (text == NULL)
    {
        return;
    }

    text->setIndex(newOffset);
}

//============================================================
// privates
//============================================================


/**
 * Resolves the ordering of a contracting character sequence while iterating
 * forwards.
 *
 * Starts from the ordering stored for ch alone (entry 0 of the collator's
 * contraction list for ch) and then keeps appending the following text
 * characters to the lookup key for as long as the key is still found in that
 * list, so the ordering of the longest match wins.  Only characters that
 * extend the match are consumed from the text.
 *
 * @param ch the starting character of a contracting character token
 * @param status the error code status; if it signals a failure while more
 * text is being examined, NULLORDER is returned.
 * @return the ordering of the longest matching contraction.
 */
int32_t
CollationElementIterator::nextContractChar(UChar ch,
                                           UErrorCode& status)
{
    // Ordering of the single character on its own.
    VectorOfPToContractElement *contractions = orderAlias->getContractValues(ch);
    int32_t bestOrder = ((EntryPair *)contractions->at(0))->value;

    // The key accumulates the candidate sequence, starting with ch.
    key.remove();
    key += ch;

    for (;;)
    {
        ch = text->current();
        if (ch == Normalizer::DONE)
        {
            break;
        }

        if (FAILURE(status))
        {
            return NULLORDER;
        }

        key += ch;

        const int32_t index = RuleBasedCollator::getEntry(contractions, key, TRUE);
        if (index == RuleBasedCollator::UNMAPPED)
        {
            // The longer key is not a contraction; leave the character unread.
            break;
        }

        // The longer key matched: consume the character and remember its order.
        text->next();
        bestOrder = ((EntryPair *)contractions->at(index))->value;
    }

    return bestOrder;
}

/**
 * Resolves the ordering of a contracting character sequence while iterating
 * backwards.
 *
 * Starts from the ordering stored for ch alone (entry 0 of the collator's
 * contraction list for ch) and then keeps appending the preceding text
 * characters to the lookup key, in the order they are visited, for as long
 * as the key is still found in that list; the ordering of the longest match
 * wins.  When a character does not extend the match, the normalizer is
 * stepped forward again before returning.
 *
 * @param ch the character at which the backwards scan starts
 * @param status the error code status; if it signals a failure at the point
 * where the match ends, NULLORDER is returned.
 * @return the ordering of the longest matching contraction.
 */
int32_t CollationElementIterator::prevContractChar(UChar ch,
                                                   UErrorCode &status)
{
    // Ordering of the single character on its own.
    VectorOfPToContractElement *contractions = orderAlias->getContractValues(ch);
    int32_t bestOrder = ((EntryPair *)contractions->at(0))->value;

    // The key accumulates the candidate sequence, starting with ch.
    key.remove();
    key += ch;

    for (;;)
    {
        ch = text->previous();
        if (ch == Normalizer::DONE)
        {
            break;
        }

        key += ch;

        const int32_t index = RuleBasedCollator::getEntry(contractions, key, FALSE);
        if (index == RuleBasedCollator::UNMAPPED)
        {
            // The longer key is not a contraction: step forward again.
            text->next();

            if (FAILURE(status))
            {
                return NULLORDER;
            }

            break;
        }

        // The longer key matched: remember its order and keep scanning.
        bestOrder = ((EntryPair *)contractions->at(index))->value;
    }

    return bestOrder;
}