14e6e92642
X-SVN-Rev: 220
521 lines
14 KiB
C++
521 lines
14 KiB
C++
/*******************************************************************************
* Copyright (C) 1996-1999, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
//=============================================================================
//
// File mergecol.cpp
//
// Contains MergeCollation.  This class's job is to take one or more
// strings that represent the orderings in a collation, in the form
// "a , A < b , B ....".  MergeCollation parses the string into a list of
// PatternEntry objects that are sorted by their position in the collation
// ordering.  The input string is allowed to have elements out of order, e.g.
// "... b < c < d < e ..... c < ch".  After being parsed by MergeCollation,
// the pattern entries will be in the proper order: "b", "c", "ch", "d", "e"
//
// Created by: Helena Shih
//
// Modification History:
//
//  Date        Name        Description
//  3/5/97      mark        Cleaned up fixEntry().  Added constants BYTEPOWER
//                          and BYTEMASK to replace BYTESIZE.
//  6/17/97     helena      In getPattern, added the queue-up list for entries
//                          with the same extension chars.
//  6/23/97     helena      Adding comments to make code more readable.
//  8/13/98     erm         Synched up with 1.2 version of MergeCollation.java
//  04/23/99    stephen     Removed EDecompositionMode, merged with
//                          Normalizer::EMode
//=============================================================================
|
|
|
|
#include "mergecol.h"
|
|
|
|
#include "tables.h"
|
|
|
|
#ifdef _DEBUG
|
|
#include "unistrm.h"
|
|
#endif
|
|
|
|
// Number of bytes in statusArray.  fixEntry() and findLastEntry() index the
// array with (UChar >> BYTEPOWER), i.e. each byte holds the flags for eight
// consecutive character values.
const int32_t MergeCollation::BITARRAYSIZE = 8192;

// Single-bit mask; it is shifted left by (c & BYTEMASK) to select the bit
// that belongs to character c inside its byte of statusArray.
const uint8_t MergeCollation::BITARRAYMASK = 0x01;

// Shift count that converts a character value into a statusArray byte index.
const int32_t MergeCollation::BYTEPOWER = 3;

// Mask that keeps the low BYTEPOWER bits of a character value, i.e. the bit
// position of that character inside its statusArray byte.
const int32_t MergeCollation::BYTEMASK = (1 << BYTEPOWER) - 1;
|
|
|
|
/**
|
|
* Creates from a pattern.
|
|
* If the input pattern is incorrect, error code will be set.
|
|
* @param pattern the merge collation pattern
|
|
* @param success error code input/output parameter.
|
|
*/
|
|
MergeCollation::MergeCollation(const UnicodeString& pattern,
|
|
Normalizer::EMode decompMode,
|
|
UErrorCode& status)
|
|
: lastEntry(NULL), saveEntry(NULL)
|
|
{
|
|
patterns = new VectorOfPointersToPatternEntry();
|
|
|
|
|
|
if (patterns == NULL)
|
|
{
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
return;
|
|
}
|
|
|
|
statusArray = new uint8_t[BITARRAYSIZE];
|
|
|
|
if (statusArray == NULL)
|
|
{
|
|
delete patterns;
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
return;
|
|
}
|
|
|
|
int32_t i;
|
|
for (i = 0; i < BITARRAYSIZE; i += 1)
|
|
{
|
|
statusArray[i] = 0;
|
|
}
|
|
|
|
setPattern(pattern, decompMode, status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
delete [] statusArray;
|
|
statusArray = NULL;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Copy constructor
|
|
* @param other the source merge collation object to be constructed with
|
|
*/
|
|
MergeCollation::MergeCollation(const MergeCollation& other)
|
|
: lastEntry(NULL), saveEntry(NULL)
|
|
{
|
|
// This copy ctor does a deep copy - it duplicates the PatternEntry
|
|
// objects as well as the vector object
|
|
patterns = new VectorOfPointersToPatternEntry(*other.patterns);
|
|
|
|
int32_t i;
|
|
statusArray = new uint8_t[BITARRAYSIZE];
|
|
for (i = 0; i < BITARRAYSIZE; i += 1)
|
|
{
|
|
statusArray[i] = other.statusArray[i];
|
|
}
|
|
}
|
|
|
|
// Assignment operator. Does a deep copy.
|
|
const MergeCollation&
|
|
MergeCollation::operator=(const MergeCollation& other)
|
|
{
|
|
if (this != &other)
|
|
{
|
|
*patterns = *other.patterns;
|
|
lastEntry = 0;
|
|
saveEntry = 0;
|
|
|
|
int32_t i;
|
|
for (i = 0; i < BITARRAYSIZE; i += 1)
|
|
{
|
|
statusArray[i] = other.statusArray[i];
|
|
}
|
|
}
|
|
|
|
return *this;
|
|
}
|
|
|
|
/**
|
|
* Destructor
|
|
*/
|
|
MergeCollation::~MergeCollation()
|
|
{
|
|
delete patterns;
|
|
delete [] statusArray;
|
|
}
|
|
|
|
/**
|
|
* recovers current pattern as a string.
|
|
* Basically, this runs through the PatternEntry array and outputs
|
|
* @param result the string into which the pattern is recovered
|
|
* the proper string for each element.
|
|
* @param withWhiteSpace puts spacing around the entries, and \n
|
|
* before & and <
|
|
*/
|
|
UnicodeString&
|
|
MergeCollation::getPattern(UnicodeString& result, bool_t withWhiteSpace) const
|
|
{
|
|
|
|
int32_t i;
|
|
PatternEntry *tmp = NULL;
|
|
VectorOfPointer *extList = NULL;
|
|
|
|
result.remove();
|
|
|
|
for (i = 0; i < patterns->size(); i += 1)
|
|
{
|
|
PatternEntry* entry = patterns->at(i);
|
|
|
|
if (entry != NULL)
|
|
{
|
|
// if the entry is an expanding ligature, queue up the entries until
|
|
// the last same ligature has been processed.
|
|
if (entry->extension.size() != 0)
|
|
{
|
|
if (extList == NULL)
|
|
{
|
|
extList = new VectorOfPointer();
|
|
}
|
|
|
|
extList->atInsert(extList->size(), (const void*&)entry);
|
|
}
|
|
else
|
|
{
|
|
// Process the queue-up list in reverse order to get the correct
|
|
// pattern.
|
|
if (extList != NULL)
|
|
{
|
|
const PatternEntry *last = findLastWithNoExtension(i - 1);
|
|
|
|
for (int32_t j = extList->size() - 1; j >= 0 ; j -= 1)
|
|
{
|
|
tmp = (PatternEntry*)(extList->at(j));
|
|
tmp->addToBuffer(result, FALSE, withWhiteSpace, last);
|
|
}
|
|
|
|
delete extList;
|
|
extList = NULL;
|
|
}
|
|
|
|
entry->addToBuffer(result, FALSE, withWhiteSpace, NULL);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Complete the queue-up list if it isn't empty
|
|
if (extList != NULL)
|
|
{
|
|
const PatternEntry *last = findLastWithNoExtension(i - 1);
|
|
|
|
for (int32_t j = extList->size() - 1; j >= 0 ; j -= 1)
|
|
{
|
|
tmp = (PatternEntry*)(extList->at(j));
|
|
tmp->addToBuffer(result, FALSE, withWhiteSpace, last);
|
|
}
|
|
|
|
delete extList;
|
|
}
|
|
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* emits the pattern for collation builder.
|
|
* @param result the string into which the pattern is recovered
|
|
* @param withWhiteSpace puts spacing around the entries, and \n
|
|
* before & and <
|
|
* @return emits the string in the format understable to the collation
|
|
* builder.
|
|
*/
|
|
UnicodeString&
|
|
MergeCollation::emitPattern(UnicodeString& result, bool_t withWhiteSpace) const
|
|
{
|
|
int32_t i;
|
|
|
|
result.remove();
|
|
|
|
for (i = 0; i < patterns->size(); i += 1)
|
|
{
|
|
PatternEntry *entry = (PatternEntry *)patterns->at(i);
|
|
|
|
if (entry != NULL)
|
|
{
|
|
entry->addToBuffer(result, TRUE, withWhiteSpace, NULL);
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* sets the pattern.
|
|
*/
|
|
void MergeCollation::setPattern(const UnicodeString& pattern,
|
|
Normalizer::EMode decompMode,
|
|
UErrorCode& success)
|
|
{
|
|
if (U_FAILURE(success))
|
|
{
|
|
return;
|
|
}
|
|
|
|
patterns->clear();
|
|
|
|
addPattern(pattern, decompMode, success);
|
|
if (U_FAILURE(success))
|
|
{
|
|
delete patterns;
|
|
patterns = NULL;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* adds a pattern string to the current list of patterns
|
|
* @param pattern the new pattern to be added
|
|
*/
|
|
void MergeCollation::addPattern(const UnicodeString& pattern,
|
|
Normalizer::EMode decompMode,
|
|
UErrorCode& success)
|
|
{
|
|
if (U_FAILURE(success) || (pattern.size() == 0))
|
|
{
|
|
return;
|
|
}
|
|
|
|
PatternEntry::Parser *parser = new PatternEntry::Parser(pattern, decompMode);
|
|
|
|
PatternEntry *entry = parser->next(success);
|
|
|
|
while (entry != NULL)
|
|
{
|
|
if (U_FAILURE(success))
|
|
{
|
|
delete entry;
|
|
break;
|
|
}
|
|
|
|
fixEntry(entry, success);
|
|
|
|
if (U_FAILURE(success))
|
|
{
|
|
delete entry;
|
|
break;
|
|
}
|
|
|
|
entry = parser->next(success);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* gets count of separate entries
|
|
* @return the size of pattern entries
|
|
*/
|
|
int32_t
|
|
MergeCollation::getCount() const {
|
|
return patterns->size();
|
|
}
|
|
|
|
/**
|
|
* gets count of separate entries
|
|
* @param index the offset of the desired pattern entry
|
|
* @return the requested pattern entry
|
|
*/
|
|
const PatternEntry* MergeCollation::getItemAt(UTextOffset index) const {
|
|
return patterns->at(index);
|
|
}
|
|
|
|
// Find the last no-extension entry.
|
|
const PatternEntry* MergeCollation::findLastWithNoExtension(int32_t i) const {
|
|
for (--i;i >= 0; --i) {
|
|
PatternEntry* entry = patterns->at(i);
|
|
if ((entry != 0) && (entry->extension.size() == 0)) {
|
|
return entry;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Add a new PatternEntry to this MergeCollation's ordered list
|
|
// of entries.
|
|
//
|
|
// If the strength is RESET, then just change the lastEntry to
|
|
// be the current. (If the current is not in patterns, signal an error).
|
|
//
|
|
// If not, then remove the current entry, and add it after lastEntry
|
|
// (which is usually at the end).
|
|
//
|
|
void MergeCollation::fixEntry(PatternEntry* newEntry,
|
|
UErrorCode& success)
|
|
{
|
|
UnicodeString excess;
|
|
bool_t changeLastEntry = TRUE;
|
|
|
|
if (newEntry->strength != PatternEntry::RESET)
|
|
{
|
|
int32_t oldIndex = -1;
|
|
|
|
// Use statusArray to mark if a unicode character has been
|
|
// added in the table or not. The same later entry will
|
|
// replace the previous one. This will improve the single
|
|
// char entries dramatically which is the majority of the
|
|
// entries.
|
|
if (newEntry->chars.size() == 1)
|
|
{
|
|
UChar c = newEntry->chars[0];
|
|
int32_t statusIndex = c >> BYTEPOWER;
|
|
uint8_t bitClump = statusArray[statusIndex];
|
|
uint8_t setBit = BITARRAYMASK << (c & BYTEMASK);
|
|
|
|
if (bitClump != 0 && (bitClump & setBit) != 0)
|
|
{
|
|
int32_t i = 0;
|
|
|
|
// Find the previous entry with the same key
|
|
for (i = patterns->size() - 1; i >= 0; i -= 1)
|
|
{
|
|
PatternEntry *entry = patterns->at(i);
|
|
|
|
if ((entry != 0) &&
|
|
(entry->chars == newEntry->chars))
|
|
{
|
|
oldIndex = i;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// We're going to add an element that starts with this
|
|
// character, so go ahead and set its bit.
|
|
statusArray[statusIndex] = (uint8_t)(bitClump | setBit);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
oldIndex = patterns->lastIndexOf(newEntry);
|
|
}
|
|
|
|
if (oldIndex != -1)
|
|
{
|
|
PatternEntry *p = patterns->orphanAt(oldIndex);
|
|
delete p;
|
|
}
|
|
|
|
// Find the insertion point for the new entry.
|
|
int32_t lastIndex = findLastEntry(lastEntry, excess, success);
|
|
|
|
if (U_FAILURE(success))
|
|
{
|
|
return;
|
|
}
|
|
|
|
// Do not change the last entry if the new entry is a expanding character
|
|
if (excess.size() != 0)
|
|
{
|
|
// newEntry.extension = excess + newEntry.extensions;
|
|
newEntry->extension.insert(0, excess);
|
|
if (lastIndex != patterns->size())
|
|
{
|
|
lastEntry = saveEntry;
|
|
changeLastEntry = FALSE;
|
|
}
|
|
}
|
|
|
|
// Add the entry at the end or insert it in the middle
|
|
if (lastIndex == patterns->size())
|
|
{
|
|
patterns->atPut(lastIndex, newEntry);
|
|
saveEntry = newEntry;
|
|
|
|
}
|
|
else
|
|
{
|
|
patterns->atInsert(lastIndex, newEntry); // add at end
|
|
}
|
|
}
|
|
|
|
if (changeLastEntry)
|
|
{
|
|
lastEntry = newEntry;
|
|
}
|
|
}
|
|
|
|
int32_t
|
|
MergeCollation::findLastEntry(const PatternEntry* lastEntry,
|
|
UnicodeString& excess,
|
|
UErrorCode& success) const
|
|
{
|
|
if (U_FAILURE(success))
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
if (lastEntry == NULL)
|
|
{
|
|
return 0;
|
|
}
|
|
else if (lastEntry->strength != PatternEntry::RESET)
|
|
{
|
|
int32_t oldIndex = -1;
|
|
|
|
// If the last entry is a single char entry and has been installed,
|
|
// that means the last index is the real last index.
|
|
if (lastEntry->chars.size() == 1)
|
|
{
|
|
int32_t index = lastEntry->chars[0] >> BYTEPOWER;
|
|
|
|
if ((statusArray[index] &
|
|
(uint8_t)(BITARRAYMASK << (lastEntry->chars[0] & BYTEMASK))) != 0)
|
|
{
|
|
oldIndex = patterns->lastIndexOf(lastEntry);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
oldIndex = patterns->lastIndexOf(lastEntry);
|
|
}
|
|
|
|
// must exist!
|
|
if (oldIndex == -1)
|
|
{
|
|
success = U_INVALID_FORMAT_ERROR;
|
|
return oldIndex;
|
|
}
|
|
|
|
return oldIndex + 1;
|
|
}
|
|
else
|
|
{
|
|
// We're doing a reset, i.e. inserting a new ordering at the position
|
|
// just after the entry corresponding to lastEntry's first character
|
|
int32_t i;
|
|
|
|
// Search backwards for string that contains this one;
|
|
// most likely entry is last one
|
|
for (i = patterns->size() - 1; i >= 0; i -= 1)
|
|
{
|
|
PatternEntry* entry = patterns->at(i);
|
|
UnicodeString buffer;
|
|
if (entry != 0)
|
|
{
|
|
//
|
|
// Look for elements with the same beginning key. The extra
|
|
// characters will be the expanding portion. This handles cases like
|
|
// "& Question-mark < '?'". We find the existing PatternEntry that matches
|
|
// the longest possible substring of "Question-mark", which will probably
|
|
// be 'Q'. We save the characters that didn't match ("uestion-mark" in
|
|
// this case), and then return the next index.
|
|
//
|
|
if (entry->chars.compareBetween(0, entry->chars.size(),
|
|
lastEntry->chars,0,entry->chars.size()) == 0)
|
|
{
|
|
lastEntry->chars.extractBetween(entry->chars.size(),
|
|
lastEntry->chars.size(),
|
|
buffer);
|
|
excess += buffer;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (i == -1)
|
|
{
|
|
success = U_INVALID_FORMAT_ERROR;
|
|
return i;
|
|
}
|
|
|
|
return i + 1;
|
|
}
|
|
}
|