b942cae860
X-SVN-Rev: 18
569 lines
15 KiB
C++
569 lines
15 KiB
C++
/*
*****************************************************************************************
* *
* COPYRIGHT: *
* (C) Copyright Taligent, Inc., 1996 *
* (C) Copyright International Business Machines Corporation, 1996-1998 *
* Licensed Material - Program-Property of IBM - All Rights Reserved. *
* US Government Users Restricted Rights - Use, duplication, or disclosure *
* restricted by GSA ADP Schedule Contract with IBM Corp. *
* *
*****************************************************************************************
*/
//=============================================================================
//
// File coleitr.cpp
//
//
//
// Created by: Helena Shih
//
// Modification History:
//
// Date Name Description
//
// 6/23/97 helena Adding comments to make code more readable.
// 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
//=============================================================================
|
||
|
||
#include "sortkey.h"
|
||
#include "coleitr.h"
|
||
|
||
#include "chariter.h"
|
||
#include "tables.h"
|
||
#include "normlzr.h"
|
||
#include "unicode.h"
|
||
|
||
// Sentinel handed back by next()/previous() when there is no (further)
// collation element to report.
const int32_t CollationElementIterator::NULLORDER = 0xffffffff;

// Marker element that next() emits for a character the collator reports as
// unmapped; the character itself follows, shifted left 16 bits.
const int32_t CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;
|
||
|
||
|
||
// This private method will never be called, but it makes the linker happy
|
||
|
||
CollationElementIterator::CollationElementIterator()
|
||
: expIndex(0),
|
||
text(0),
|
||
swapOrder(0),
|
||
bufferAlias(0),
|
||
orderAlias(0)
|
||
{
|
||
}
|
||
|
||
// This private method will never be called, but it makes the linker happy
|
||
|
||
CollationElementIterator::CollationElementIterator(const RuleBasedCollator* order)
|
||
: expIndex(0),
|
||
text(0),
|
||
bufferAlias(0),
|
||
swapOrder(0),
|
||
orderAlias(order)
|
||
{
|
||
}
|
||
|
||
// This is the "real" constructor for this class; it constructs an iterator
|
||
// over the source text using the specified collator
|
||
CollationElementIterator::CollationElementIterator( const UnicodeString& sourceText,
|
||
const RuleBasedCollator* order,
|
||
UErrorCode& status)
|
||
: expIndex(0),
|
||
swapOrder(0),
|
||
text(NULL),
|
||
bufferAlias(NULL),
|
||
orderAlias(order)
|
||
{
|
||
if (FAILURE(status)) {
|
||
return;
|
||
}
|
||
|
||
if ( sourceText.size() != 0 ) {
|
||
//
|
||
// A CollationElementIterator is really a two-layered beast.
|
||
// Internally it uses a Normalizer to munge the source text
|
||
// into a form where all "composed" Unicode characters (such as <20>) are
|
||
// split into a normal character and a combining accent character.
|
||
// Afterward, CollationElementIterator does its own processing to handle
|
||
// expanding and contracting collation sequences, ignorables, and so on.
|
||
//
|
||
Normalizer::EMode decomp = (order->getStrength() == Collator::IDENTICAL)
|
||
? Normalizer::NO_OP
|
||
: order->getDecomposition();
|
||
|
||
text = new Normalizer(sourceText, decomp);
|
||
if (text == NULL) {
|
||
status = U_MEMORY_ALLOCATION_ERROR;
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
// This is the "real" constructor for this class; it constructs an iterator
|
||
// over the source text using the specified collator
|
||
CollationElementIterator::CollationElementIterator( const CharacterIterator& sourceText,
|
||
const RuleBasedCollator* order,
|
||
UErrorCode& status)
|
||
: expIndex(0),
|
||
swapOrder(0),
|
||
text(NULL),
|
||
bufferAlias(NULL),
|
||
orderAlias(order)
|
||
{
|
||
if (FAILURE(status)) {
|
||
return;
|
||
}
|
||
|
||
// **** should I just drop this test? ****
|
||
if ( sourceText.endIndex() != 0 )
|
||
{
|
||
//
|
||
// A CollationElementIterator is really a two-layered beast.
|
||
// Internally it uses a Normalizer to munge the source text
|
||
// into a form where all "composed" Unicode characters (such as <20>) are
|
||
// split into a normal character and a combining accent character.
|
||
// Afterward, CollationElementIterator does its own processing to handle
|
||
// expanding and contracting collation sequences, ignorables, and so on.
|
||
//
|
||
Normalizer::EMode decomp = order->getStrength() == Collator::IDENTICAL
|
||
? Normalizer::NO_OP
|
||
: order->getDecomposition();
|
||
|
||
text = new Normalizer(sourceText, decomp);
|
||
if (text == NULL) {
|
||
status = U_MEMORY_ALLOCATION_ERROR;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Copy constructor.
//
// The Normalizer is owned by each iterator, so it is cloned; bufferAlias and
// orderAlias are non-owning aliases and are copied as plain pointers.  The
// scratch string `key` is not copied: the contraction helpers clear it before
// every use.
//
// other.text is NULL when the source iterator was built over empty text (the
// constructors leave it NULL in that case), so the clone must be guarded.
CollationElementIterator::CollationElementIterator(const CollationElementIterator& other)
  : expIndex(other.expIndex),
    text(NULL),
    swapOrder(other.swapOrder),
    bufferAlias(other.bufferAlias),
    orderAlias(other.orderAlias)
{
    if (other.text != NULL) {
        text = (Normalizer*) other.text->clone();
    }
}
|
||
|
||
// Assignment operator.
//
// Copies the iteration state of `other` into this iterator.  The Normalizer
// is owned, so the old one is destroyed and replaced by a clone of other's;
// bufferAlias and orderAlias are non-owning aliases and are copied as plain
// pointers.  Self-assignment is a no-op.
//
// other.text may legitimately be NULL (iterator over empty text), in which
// case this iterator ends up with a NULL text as well.
const CollationElementIterator&
CollationElementIterator::operator=(const CollationElementIterator& other)
{
    if (this != &other)
    {
        // Build the replacement before releasing the old Normalizer so that
        // `text` never points at freed memory.
        Normalizer* replacement = NULL;
        if (other.text != NULL)
        {
            replacement = (Normalizer*) other.text->clone();
        }

        delete text;
        text = replacement;

        expIndex    = other.expIndex;
        swapOrder   = other.swapOrder;
        bufferAlias = other.bufferAlias;
        orderAlias  = other.orderAlias;
    }

    return *this;
}
|
||
|
||
// Destructor.  Only the Normalizer is owned and therefore deleted; the two
// alias pointers are merely cleared.
CollationElementIterator::~CollationElementIterator()
{
    orderAlias = NULL;
    bufferAlias = NULL;

    delete text;
    text = NULL;
}
|
||
|
||
// Equality: two iterators are equal when they have the same collator alias,
// the same pending-state fields (swapOrder, expIndex), equal Normalizers and
// equal expansion buffers.
//
// Both `text` and `bufferAlias` can be NULL (bufferAlias is NULL whenever no
// expansion is in progress), so each is dereferenced only when non-NULL on
// both sides; a NULL on exactly one side makes the iterators unequal.
bool_t
CollationElementIterator::operator==(const CollationElementIterator& that) const
{
    if (this == &that)
    {
        return TRUE;
    }

    // Cheap scalar / pointer comparisons first.
    if (swapOrder != that.swapOrder ||
        expIndex != that.expIndex ||
        orderAlias != that.orderAlias)
    {
        return FALSE;
    }

    // Normalizers: compare by value when both exist.
    if (text == NULL || that.text == NULL)
    {
        if (text != that.text)
        {
            return FALSE;
        }
    }
    else if (*text != *(that.text))
    {
        return FALSE;
    }

    // Expansion buffers: compare by value when both exist.
    if (bufferAlias == NULL || that.bufferAlias == NULL)
    {
        if (bufferAlias != that.bufferAlias)
        {
            return FALSE;
        }
    }
    else if (*bufferAlias != *(that.bufferAlias))
    {
        return FALSE;
    }

    return TRUE;
}
|
||
|
||
// Inequality: the logical negation of operator==.
bool_t
CollationElementIterator::operator!=(const CollationElementIterator& other) const
{
    const bool_t same = (*this == other);
    return !same;
}
|
||
|
||
/**
|
||
* Resets the cursor to the beginning of the string.
|
||
*/
|
||
void
|
||
CollationElementIterator::reset()
|
||
{
|
||
if (text != NULL)
|
||
{
|
||
text->reset();
|
||
text->setMode(orderAlias->getDecomposition());
|
||
}
|
||
|
||
bufferAlias = NULL;
|
||
expIndex = 0;
|
||
swapOrder = 0;
|
||
}
|
||
|
||
// Sets the source to the new source string.
|
||
void
|
||
CollationElementIterator::setText(const UnicodeString& source,
|
||
UErrorCode& status)
|
||
{
|
||
if (FAILURE(status))
|
||
{
|
||
return;
|
||
}
|
||
|
||
bufferAlias = 0;
|
||
swapOrder = 0;
|
||
expIndex = 0;
|
||
|
||
if (text == NULL)
|
||
{
|
||
text = new Normalizer(source, orderAlias->getDecomposition());
|
||
}
|
||
else
|
||
{
|
||
text->setText(source, status);
|
||
text->setMode(orderAlias->getDecomposition());
|
||
}
|
||
}
|
||
|
||
// Sets the source to the new character iterator.
|
||
void
|
||
CollationElementIterator::setText(CharacterIterator& source,
|
||
UErrorCode& status)
|
||
{
|
||
if (FAILURE(status)) {
|
||
return;
|
||
}
|
||
|
||
bufferAlias = 0;
|
||
swapOrder = 0;
|
||
expIndex = 0;
|
||
|
||
if (text == NULL) {
|
||
text = new Normalizer(source, orderAlias->getDecomposition());
|
||
}
|
||
else
|
||
{
|
||
text->setMode(orderAlias->getDecomposition());
|
||
text->setText(source, status);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Get the ordering priority of the next character in the string.
|
||
* @return the next character's ordering. Returns NULLORDER if
|
||
* the end of string is reached.
|
||
*/
|
||
int32_t
|
||
CollationElementIterator::next(UErrorCode& status)
|
||
{
|
||
if (text == NULL || FAILURE(status))
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
// Update the decomposition mode if necessary.
|
||
text->setMode(orderAlias->getDecomposition());
|
||
|
||
if (bufferAlias != NULL)
|
||
{
|
||
// bufferAlias needs a bit of an explanation.
|
||
// When we hit an expanding character in the text, we call the order's
|
||
// getExpandValues method to retrieve an array of the orderings for all
|
||
// of the characters in the expansion (see the end of this method).
|
||
// The first ordering is returned, and an alias to the orderings array
|
||
// is saved so that the remaining orderings can be returned on subsequent
|
||
// calls to next. So, if the expanding buffer is not exhausted,
|
||
// all we have to do here is return the next ordering in the buffer.
|
||
if (expIndex < bufferAlias->size())
|
||
{
|
||
return strengthOrder(bufferAlias->at(expIndex++));
|
||
}
|
||
else
|
||
{
|
||
bufferAlias = NULL;
|
||
expIndex = 0;
|
||
}
|
||
}
|
||
else if (swapOrder != 0)
|
||
{
|
||
// If we find a character with no order, we return the marking
|
||
// flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character
|
||
// itself shifted left 16 bits as orders. At this point, the
|
||
// UNMAPPEDCHARVALUE flag has already been returned by the code
|
||
// below, so just return the shifted character here.
|
||
int32_t order = swapOrder << 16;
|
||
|
||
swapOrder = 0;
|
||
|
||
return order;
|
||
}
|
||
|
||
// Gets the next character from the string using decomposition iterator.
|
||
UChar ch = text->current();
|
||
text->next();
|
||
|
||
if (FAILURE(status))
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
if (ch == Normalizer::DONE)
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
// Ask the collator for this character's ordering.
|
||
int32_t value = orderAlias->getUnicodeOrder(ch);
|
||
|
||
if (value == RuleBasedCollator::UNMAPPED)
|
||
{
|
||
// Returned an "unmapped" flag and save the character so it can be
|
||
// returned next time this method is called.
|
||
if (ch == 0x0000) return ch;
|
||
swapOrder = ch; // \u0000 is not valid in C++'s UnicodeString
|
||
return UNMAPPEDCHARVALUE;
|
||
}
|
||
|
||
if (value >= RuleBasedCollator::CONTRACTCHARINDEX)
|
||
{
|
||
value = nextContractChar(ch, status);
|
||
}
|
||
|
||
if (value >= RuleBasedCollator::EXPANDCHARINDEX)
|
||
{
|
||
bufferAlias = orderAlias->getExpandValueList(value);
|
||
expIndex = 0;
|
||
value = bufferAlias->at(expIndex++);
|
||
}
|
||
|
||
return strengthOrder(value);
|
||
}
|
||
|
||
/**
|
||
* Get the ordering priority of the previous collation element in the string.
|
||
* @param status the error code status.
|
||
* @return the previous element's ordering. Returns NULLORDER if
|
||
* the beginning of string is reached.
|
||
*/
|
||
int32_t
|
||
CollationElementIterator::previous(UErrorCode& status)
|
||
{
|
||
if (text == NULL || FAILURE(status))
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
text->setMode(orderAlias->getDecomposition());
|
||
|
||
if (bufferAlias != NULL)
|
||
{
|
||
if (expIndex > 0)
|
||
{
|
||
return strengthOrder(bufferAlias->at(--expIndex));
|
||
}
|
||
|
||
bufferAlias = NULL;
|
||
expIndex = 0;
|
||
}
|
||
else if (swapOrder != 0)
|
||
{
|
||
int32_t order = swapOrder << 16;
|
||
|
||
swapOrder = 0;
|
||
return order;
|
||
}
|
||
|
||
UChar ch = text->previous();
|
||
|
||
if (ch == Normalizer::DONE)
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
int32_t value = orderAlias->getUnicodeOrder(ch);
|
||
|
||
if (value == RuleBasedCollator::UNMAPPED)
|
||
{
|
||
if (ch == 0x0000) return ch;
|
||
swapOrder = UNMAPPEDCHARVALUE;
|
||
return ch;
|
||
}
|
||
|
||
if (value >= RuleBasedCollator::CONTRACTCHARINDEX)
|
||
{
|
||
value = prevContractChar(ch, status);
|
||
}
|
||
|
||
if (value >= RuleBasedCollator::EXPANDCHARINDEX)
|
||
{
|
||
bufferAlias = orderAlias->getExpandValueList(value);
|
||
expIndex = bufferAlias->size();
|
||
value = bufferAlias->at(--expIndex);
|
||
}
|
||
|
||
return strengthOrder(value);
|
||
}
|
||
|
||
// Masks a collation element according to the collator's strength: PRIMARY
// and SECONDARY strengths AND the element with the corresponding
// RuleBasedCollator mask; any other strength returns it unchanged.
int32_t
CollationElementIterator::strengthOrder(int32_t order) const
{
    switch (orderAlias->getStrength())
    {
    case Collator::PRIMARY:
        return order & RuleBasedCollator::PRIMARYDIFFERENCEONLY;

    case Collator::SECONDARY:
        return order & RuleBasedCollator::SECONDARYDIFFERENCEONLY;

    default:
        return order;
    }
}
|
||
|
||
// Reports the current position in the text.  The Normalizer does the actual
// walking, so its index is the answer; without text the offset is 0.
UTextOffset
CollationElementIterator::getOffset() const
{
    if (text == NULL)
    {
        return 0;
    }

    return text->getIndex();
}
|
||
|
||
void
|
||
CollationElementIterator::setOffset(UTextOffset newOffset,
|
||
UErrorCode& status)
|
||
{
|
||
if (FAILURE(status))
|
||
{
|
||
return;
|
||
}
|
||
|
||
if (text != NULL)
|
||
{
|
||
text->setIndex(newOffset);
|
||
}
|
||
|
||
bufferAlias = NULL;
|
||
expIndex = 0;
|
||
swapOrder = 0;
|
||
}
|
||
|
||
//============================================================
|
||
// privates
|
||
//============================================================
|
||
|
||
|
||
/**
|
||
* Get the ordering priority of the next contracting character in the
|
||
* string.
|
||
* @param ch the starting character of a contracting character token
|
||
* @return the next contracting character's ordering. Returns NULLORDER
|
||
* if the end of string is reached.
|
||
*/
|
||
int32_t
|
||
CollationElementIterator::nextContractChar(UChar ch,
|
||
UErrorCode& status)
|
||
{
|
||
// First get the ordering of this single character
|
||
VectorOfPToContractElement *list = orderAlias->getContractValues(ch);
|
||
EntryPair *pair = (EntryPair *)list->at(0);
|
||
int32_t order = pair->value;
|
||
|
||
// Now iterate through the chars following it and
|
||
// look for the longest match
|
||
key.remove();
|
||
key += ch;
|
||
|
||
while ((ch = text->current()) != Normalizer::DONE)
|
||
{
|
||
if (FAILURE(status))
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
key += ch;
|
||
|
||
int32_t n = RuleBasedCollator::getEntry(list, key, TRUE);
|
||
|
||
if (n == RuleBasedCollator::UNMAPPED)
|
||
{
|
||
break;
|
||
}
|
||
text->next();
|
||
|
||
pair = (EntryPair *)list->at(n);
|
||
order = pair->value;
|
||
}
|
||
|
||
return order;
|
||
}
|
||
|
||
/**
|
||
* Get the ordering priority of the previous contracting character in the
|
||
* string.
|
||
* @param ch the starting character of a contracting character token
|
||
* @return the next contracting character's ordering. Returns NULLORDER
|
||
* if the end of string is reached.
|
||
*/
|
||
int32_t CollationElementIterator::prevContractChar(UChar ch,
|
||
UErrorCode &status)
|
||
{
|
||
// First get the ordering of this single character
|
||
VectorOfPToContractElement *list = orderAlias->getContractValues(ch);
|
||
EntryPair *pair = (EntryPair *)list->at(0);
|
||
int32_t order = pair->value;
|
||
|
||
// Now iterate through the chars following it and
|
||
// look for the longest match
|
||
key.remove();
|
||
key += ch;
|
||
|
||
while ((ch = text->previous()) != Normalizer::DONE)
|
||
{
|
||
key += ch;
|
||
|
||
int32_t n = RuleBasedCollator::getEntry(list, key, FALSE);
|
||
|
||
if (n == RuleBasedCollator::UNMAPPED)
|
||
{
|
||
ch = text->next();
|
||
|
||
if (FAILURE(status))
|
||
{
|
||
return NULLORDER;
|
||
}
|
||
|
||
break;
|
||
}
|
||
|
||
pair = (EntryPair *)list->at(n);
|
||
order = pair->value;
|
||
}
|
||
|
||
return order;
|
||
}
|