scuffed-code/icu4c/source/i18n/mergecol.cpp
1999-12-08 02:11:04 +00:00

521 lines
14 KiB
C++

/*******************************************************************************
* Copyright (C) 1996-1999, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
//=============================================================================
//
// File mergecol.cpp
//
// Contains MergeCollation. This classes job is to take one or more
// strings that represent the orderings in a collation, in the form
// "a , A < b , B ....". MergeCollation parses the string into a list of
// PatternEntry objects that are sorted by their position in the collation
// ordering. The input string is allowed to have elements out of order, e.g.
// "... b < c < d < e ..... c < ch". After being parsed by MergeCollation,
// the pattern entries will be in the proper order: "b", "c", "ch", "d", "e"
//
// Created by: Helena Shih
//
// Modification History:
//
// Date Name Description
// 3/5/97 mark Cleaned up fixEntry(). Added constants BYTEPOWER
// and BYTEMASK to replace BYTESIZE.
// 6/17/97 helena In getPattern, added the queue-up list for entries
// with the same extension chars.
// 6/23/97 helena Adding comments to make code more readable.
// 8/13/98 erm Synched up with 1.2 version of MergeCollation.java
// 04/23/99 stephen Removed EDecompositionMode, merged with
// Normalizer::EMode
//=============================================================================
#include "mergecol.h"
#include "tables.h"
#ifdef _DEBUG
#include "unistrm.h"
#endif
const int32_t MergeCollation::BITARRAYSIZE = 8192;
const uint8_t MergeCollation::BITARRAYMASK = 0x1;
const int32_t MergeCollation::BYTEPOWER = 3;
const int32_t MergeCollation::BYTEMASK = (1 << BYTEPOWER) - 1;
/**
* Creates from a pattern.
* If the input pattern is incorrect, error code will be set.
* @param pattern the merge collation pattern
* @param success error code input/output parameter.
*/
MergeCollation::MergeCollation(const UnicodeString& pattern,
Normalizer::EMode decompMode,
UErrorCode& status)
: lastEntry(NULL), saveEntry(NULL)
{
patterns = new VectorOfPointersToPatternEntry();
if (patterns == NULL)
{
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
statusArray = new uint8_t[BITARRAYSIZE];
if (statusArray == NULL)
{
delete patterns;
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t i;
for (i = 0; i < BITARRAYSIZE; i += 1)
{
statusArray[i] = 0;
}
setPattern(pattern, decompMode, status);
if (U_FAILURE(status))
{
delete [] statusArray;
statusArray = NULL;
}
}
/**
* Copy constructor
* @param other the source merge collation object to be constructed with
*/
MergeCollation::MergeCollation(const MergeCollation& other)
: lastEntry(NULL), saveEntry(NULL)
{
// This copy ctor does a deep copy - it duplicates the PatternEntry
// objects as well as the vector object
patterns = new VectorOfPointersToPatternEntry(*other.patterns);
int32_t i;
statusArray = new uint8_t[BITARRAYSIZE];
for (i = 0; i < BITARRAYSIZE; i += 1)
{
statusArray[i] = other.statusArray[i];
}
}
// Assignment operator. Does a deep copy.
const MergeCollation&
MergeCollation::operator=(const MergeCollation& other)
{
if (this != &other)
{
*patterns = *other.patterns;
lastEntry = 0;
saveEntry = 0;
int32_t i;
for (i = 0; i < BITARRAYSIZE; i += 1)
{
statusArray[i] = other.statusArray[i];
}
}
return *this;
}
/**
* Destructor
*/
MergeCollation::~MergeCollation()
{
delete patterns;
delete [] statusArray;
}
/**
* recovers current pattern as a string.
* Basically, this runs through the PatternEntry array and outputs
* @param result the string into which the pattern is recovered
* the proper string for each element.
* @param withWhiteSpace puts spacing around the entries, and \n
* before & and <
*/
UnicodeString&
MergeCollation::getPattern(UnicodeString& result, bool_t withWhiteSpace) const
{
int32_t i;
PatternEntry *tmp = NULL;
VectorOfPointer *extList = NULL;
result.remove();
for (i = 0; i < patterns->size(); i += 1)
{
PatternEntry* entry = patterns->at(i);
if (entry != NULL)
{
// if the entry is an expanding ligature, queue up the entries until
// the last same ligature has been processed.
if (entry->extension.length() != 0)
{
if (extList == NULL)
{
extList = new VectorOfPointer();
}
extList->atInsert(extList->size(), (const void*&)entry);
}
else
{
// Process the queue-up list in reverse order to get the correct
// pattern.
if (extList != NULL)
{
const PatternEntry *last = findLastWithNoExtension(i - 1);
for (int32_t j = extList->size() - 1; j >= 0 ; j -= 1)
{
tmp = (PatternEntry*)(extList->at(j));
tmp->addToBuffer(result, FALSE, withWhiteSpace, last);
}
delete extList;
extList = NULL;
}
entry->addToBuffer(result, FALSE, withWhiteSpace, NULL);
}
}
}
// Complete the queue-up list if it isn't empty
if (extList != NULL)
{
const PatternEntry *last = findLastWithNoExtension(i - 1);
for (int32_t j = extList->size() - 1; j >= 0 ; j -= 1)
{
tmp = (PatternEntry*)(extList->at(j));
tmp->addToBuffer(result, FALSE, withWhiteSpace, last);
}
delete extList;
}
return result;
}
/**
* emits the pattern for collation builder.
* @param result the string into which the pattern is recovered
* @param withWhiteSpace puts spacing around the entries, and \n
* before & and <
* @return emits the string in the format understable to the collation
* builder.
*/
UnicodeString&
MergeCollation::emitPattern(UnicodeString& result, bool_t withWhiteSpace) const
{
int32_t i;
result.remove();
for (i = 0; i < patterns->size(); i += 1)
{
PatternEntry *entry = (PatternEntry *)patterns->at(i);
if (entry != NULL)
{
entry->addToBuffer(result, TRUE, withWhiteSpace, NULL);
}
}
return result;
}
/**
* sets the pattern.
*/
void MergeCollation::setPattern(const UnicodeString& pattern,
Normalizer::EMode decompMode,
UErrorCode& success)
{
if (U_FAILURE(success))
{
return;
}
patterns->clear();
addPattern(pattern, decompMode, success);
if (U_FAILURE(success))
{
delete patterns;
patterns = NULL;
}
}
/**
* adds a pattern string to the current list of patterns
* @param pattern the new pattern to be added
*/
void MergeCollation::addPattern(const UnicodeString& pattern,
Normalizer::EMode decompMode,
UErrorCode& success)
{
if (U_FAILURE(success) || (pattern.length() == 0))
{
return;
}
PatternEntry::Parser *parser = new PatternEntry::Parser(pattern, decompMode);
PatternEntry *entry = parser->next(success);
while (entry != NULL)
{
if (U_FAILURE(success))
{
delete entry;
break;
}
fixEntry(entry, success);
if (U_FAILURE(success))
{
delete entry;
break;
}
entry = parser->next(success);
}
}
/**
* gets count of separate entries
* @return the size of pattern entries
*/
int32_t
MergeCollation::getCount() const {
return patterns->size();
}
/**
* gets count of separate entries
* @param index the offset of the desired pattern entry
* @return the requested pattern entry
*/
const PatternEntry* MergeCollation::getItemAt(UTextOffset index) const {
return patterns->at(index);
}
// Find the last no-extension entry.
const PatternEntry* MergeCollation::findLastWithNoExtension(int32_t i) const {
for (--i;i >= 0; --i) {
PatternEntry* entry = patterns->at(i);
if ((entry != 0) && (entry->extension.length() == 0)) {
return entry;
}
}
return 0;
}
// Add a new PatternEntry to this MergeCollation's ordered list
// of entries.
//
// If the strength is RESET, then just change the lastEntry to
// be the current. (If the current is not in patterns, signal an error).
//
// If not, then remove the current entry, and add it after lastEntry
// (which is usually at the end).
//
void MergeCollation::fixEntry(PatternEntry* newEntry,
UErrorCode& success)
{
UnicodeString excess;
bool_t changeLastEntry = TRUE;
if (newEntry->strength != PatternEntry::RESET)
{
int32_t oldIndex = -1;
// Use statusArray to mark if a unicode character has been
// added in the table or not. The same later entry will
// replace the previous one. This will improve the single
// char entries dramatically which is the majority of the
// entries.
if (newEntry->chars.length() == 1)
{
UChar c = newEntry->chars[0];
int32_t statusIndex = c >> BYTEPOWER;
uint8_t bitClump = statusArray[statusIndex];
uint8_t setBit = BITARRAYMASK << (c & BYTEMASK);
if (bitClump != 0 && (bitClump & setBit) != 0)
{
int32_t i = 0;
// Find the previous entry with the same key
for (i = patterns->size() - 1; i >= 0; i -= 1)
{
PatternEntry *entry = patterns->at(i);
if ((entry != 0) &&
(entry->chars == newEntry->chars))
{
oldIndex = i;
break;
}
}
}
else
{
// We're going to add an element that starts with this
// character, so go ahead and set its bit.
statusArray[statusIndex] = (uint8_t)(bitClump | setBit);
}
}
else
{
oldIndex = patterns->lastIndexOf(newEntry);
}
if (oldIndex != -1)
{
PatternEntry *p = patterns->orphanAt(oldIndex);
delete p;
}
// Find the insertion point for the new entry.
int32_t lastIndex = findLastEntry(lastEntry, excess, success);
if (U_FAILURE(success))
{
return;
}
// Do not change the last entry if the new entry is a expanding character
if (excess.length() != 0)
{
// newEntry.extension = excess + newEntry.extensions;
newEntry->extension.insert(0, excess);
if (lastIndex != patterns->size())
{
lastEntry = saveEntry;
changeLastEntry = FALSE;
}
}
// Add the entry at the end or insert it in the middle
if (lastIndex == patterns->size())
{
patterns->atPut(lastIndex, newEntry);
saveEntry = newEntry;
}
else
{
patterns->atInsert(lastIndex, newEntry); // add at end
}
}
if (changeLastEntry)
{
lastEntry = newEntry;
}
}
int32_t
MergeCollation::findLastEntry(const PatternEntry* lastEntry,
UnicodeString& excess,
UErrorCode& success) const
{
if (U_FAILURE(success))
{
return -1;
}
if (lastEntry == NULL)
{
return 0;
}
else if (lastEntry->strength != PatternEntry::RESET)
{
int32_t oldIndex = -1;
// If the last entry is a single char entry and has been installed,
// that means the last index is the real last index.
if (lastEntry->chars.length() == 1)
{
int32_t index = lastEntry->chars[0] >> BYTEPOWER;
if ((statusArray[index] &
(uint8_t)(BITARRAYMASK << (lastEntry->chars[0] & BYTEMASK))) != 0)
{
oldIndex = patterns->lastIndexOf(lastEntry);
}
}
else
{
oldIndex = patterns->lastIndexOf(lastEntry);
}
// must exist!
if (oldIndex == -1)
{
success = U_INVALID_FORMAT_ERROR;
return oldIndex;
}
return oldIndex + 1;
}
else
{
// We're doing a reset, i.e. inserting a new ordering at the position
// just after the entry corresponding to lastEntry's first character
int32_t i;
// Search backwards for string that contains this one;
// most likely entry is last one
for (i = patterns->size() - 1; i >= 0; i -= 1)
{
PatternEntry* entry = patterns->at(i);
UnicodeString buffer;
if (entry != 0)
{
//
// Look for elements with the same beginning key. The extra
// characters will be the expanding portion. This handles cases like
// "& Question-mark < '?'". We find the existing PatternEntry that matches
// the longest possible substring of "Question-mark", which will probably
// be 'Q'. We save the characters that didn't match ("uestion-mark" in
// this case), and then return the next index.
//
if (entry->chars.compareBetween(0, entry->chars.length(),
lastEntry->chars,0,entry->chars.length()) == 0)
{
lastEntry->chars.extractBetween(entry->chars.length(),
lastEntry->chars.length(),
buffer);
excess += buffer;
break;
}
}
}
if (i == -1)
{
success = U_INVALID_FORMAT_ERROR;
return i;
}
return i + 1;
}
}