scuffed-code/icu4c/source/i18n/tblcoll.cpp
1999-10-07 00:07:53 +00:00

2219 lines
68 KiB
C++
Raw Blame History

/*
*******************************************************************************
* *
* COPYRIGHT: *
* (C) Copyright Taligent, Inc., 1996 *
* (C) Copyright International Business Machines Corporation, 1996-1999 *
* Licensed Material - Program-Property of IBM - All Rights Reserved. *
* US Government Users Restricted Rights - Use, duplication, or disclosure *
* restricted by GSA ADP Schedule Contract with IBM Corp. *
* *
*******************************************************************************
*
* File tblcoll.cpp
*
* Created by: Helena Shih
*
* Modification History:
*
* Date Name Description
* 2/5/97 aliu Added streamIn and streamOut methods. Added
* constructor which reads RuleBasedCollator object from
* a binary file. Added writeToFile method which streams
* RuleBasedCollator out to a binary file. The streamIn
* and streamOut methods use istream and ostream objects
* in binary mode.
* 2/11/97 aliu Moved declarations out of for loop initializer.
* Added Mac compatibility #ifdef for ios::nocreate.
* 2/12/97 aliu Modified to use TableCollationData sub-object to
* hold invariant data.
* 2/13/97 aliu Moved several methods into this class from Collation.
* Added a private RuleBasedCollator(Locale&) constructor,
* to be used by Collator::getInstance(). General
* clean up. Made use of UErrorCode variables consistent.
* 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
* constructor and getDynamicClassID.
* 3/5/97 aliu Changed compaction cycle to improve performance. We
* use the maximum allowable value which is kBlockCount.
* Modified getRules() to load rules dynamically. Changed
* constructFromFile() call to accommodate this (added
* parameter to specify whether binary loading is to
* take place).
* 05/06/97 helena Added memory allocation error check.
* 6/20/97 helena Java class name change.
* 6/23/97 helena Adding comments to make code more readable.
* 09/03/97 helena Added createCollationKeyValues().
* 06/26/98 erm Changes for CollationKeys using byte arrays.
* 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
* 04/23/99 stephen Removed EDecompositionMode, merged with
* Normalizer::EMode
* 06/14/99 stephen Removed kResourceBundleSuffix
* 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
* files are no longer used.
*******************************************************************************
*/
#include "ucmp32.h"
#include "tcoldata.h"
#include "tblcoll.h"
#include "coleitr.h"
#include "locid.h"
#include "unicode.h"
#include "tables.h"
#include "normlzr.h"
#include "mergecol.h"
#include "resbund.h"
#include "filestrm.h"
#ifdef _DEBUG
#include "unistrm.h"
#endif
#include "compitr.h"
#include <string.h>
// Helper class that groups the two routines used to read and write the
// flattened (binary) form of a RuleBasedCollator through a FileStream.
// Only the declarations appear here; the definitions live elsewhere in
// this file.
class RuleBasedCollatorStreamer
{
public:
// Reads a collator from 'is' into 'collator'. Used by
// RuleBasedCollator::constructFromFile(const char*, UErrorCode&), which
// inspects the stream's error flag afterwards to decide success.
static void streamIn(RuleBasedCollator* collator, FileStream* is);
// Counterpart of streamIn(): writes 'collator' to 'os'.
static void streamOut(const RuleBasedCollator* collator, FileStream* os);
};
//===========================================================================================
// The following diagram shows the data structure of the RuleBasedCollator object.
// Suppose we have the rule, where 'o-umlaut' is the unicode char 0x00F6.
// "a, A < b, B < c, C, ch, cH, Ch, CH < d, D ... < o, O; 'o-umlaut'/E, 'O-umlaut'/E ...".
// What the rule says is, sorts 'ch'ligatures and 'c' only with tertiary difference and
// sorts 'o-umlaut' as if it's always expanded with 'e'.
//
// mapping table contracting list expanding list
// (contains all unicode char
// entries) ___ _____________ _________________________
// ________ |==>|_*_|-->|'c' |v('c') | |==>|v('o')|v('umlaut')|v('e')|
// |_\u0001_|--> v('\u0001') | |_:_| |-------------| | |-------------------------|
// |_\u0002_|--> v('\u0002') | |_:_| |'ch' |v('ch')| | | : |
// |____:___| | |_:_| |-------------| | |-------------------------|
// |____:___| | |'cH' |v('cH')| | | : |
// |__'a'___|--> v('a') | |-------------| | |-------------------------|
// |__'b'___|--> v('b') | |'Ch' |v('Ch')| | | : |
// |____:___| | |-------------| | |-------------------------|
// |____:___| | |'CH' |v('CH')| | | : |
// |___'c'__|------------------- ------------- | |-------------------------|
// |____:___| | | : |
// |o-umlaut|------------------------------------------------ |_________________________|
// |____:___|
//
//
// Noted by Helena Shih on 6/23/97 with pending design changes (slimming collation).
//============================================================================================
// Definitions of RuleBasedCollator's static constants: marker values for
// entries in the mapping table, the increments/masks/shifts used to split a
// collation order into primary, secondary and tertiary parts, and the
// identifiers used for the binary collation file.
// NOTE(review): 0xFFFFFFFF, 0xffff0000 and 0xffffff00 do not fit in a signed
// int32_t; they are presumably meant as bit patterns, but the conversion is
// implementation-defined on older compilers -- confirm this is intended.
const int32_t RuleBasedCollator::CHARINDEX = 0x70000000; // need look up in .commit()
const int32_t RuleBasedCollator::EXPANDCHARINDEX = 0x7E000000; // Expand index follows
const int32_t RuleBasedCollator::CONTRACTCHARINDEX = 0x7F000000; // contract indexes follows
const int32_t RuleBasedCollator::UNMAPPED = 0xFFFFFFFF; // unmapped character values
const int32_t RuleBasedCollator::PRIMARYORDERINCREMENT = 0x00010000; // primary strength increment
const int32_t RuleBasedCollator::SECONDARYORDERINCREMENT = 0x00000100; // secondary strength increment
const int32_t RuleBasedCollator::TERTIARYORDERINCREMENT = 0x00000001; // tertiary strength increment
const int32_t RuleBasedCollator::MAXIGNORABLE = 0x00010000; // maximum ignorable char order value
const int32_t RuleBasedCollator::PRIMARYORDERMASK = 0xffff0000; // mask off anything but primary order
const int32_t RuleBasedCollator::SECONDARYORDERMASK = 0x0000ff00; // mask off anything but secondary order
const int32_t RuleBasedCollator::TERTIARYORDERMASK = 0x000000ff; // mask off anything but tertiary order
const int32_t RuleBasedCollator::SECONDARYRESETMASK = 0x0000ffff; // mask off secondary and tertiary order
const int32_t RuleBasedCollator::IGNORABLEMASK = 0x0000ffff; // mask off ignorable char order
const int32_t RuleBasedCollator::PRIMARYDIFFERENCEONLY = 0xffff0000; // use only the primary difference
const int32_t RuleBasedCollator::SECONDARYDIFFERENCEONLY = 0xffffff00; // use only the primary and secondary difference
const int32_t RuleBasedCollator::PRIMARYORDERSHIFT = 16; // primary order shift
const int32_t RuleBasedCollator::SECONDARYORDERSHIFT = 8; // secondary order shift
const int32_t RuleBasedCollator::SORTKEYOFFSET = 1; // minimum sort key offset
const int32_t RuleBasedCollator::CONTRACTCHAROVERFLOW = 0x7FFFFFFF; // Indicates the char is a contract char
const int16_t RuleBasedCollator::FILEID = 0x5443; // unique file id for parity check
const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension
char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id
//===============================================================================
// Default constructor: creates an empty collator with no table data
// (data == 0, not owned), no pending rule pattern and no cached cursors.
// The construct*() methods fill the object in afterwards; getRules() uses
// this constructor for its temporary collator.
RuleBasedCollator::RuleBasedCollator()
: Collator(),
isOverIgnore(FALSE),
mPattern(0),
sourceCursor(0),
targetCursor(0),
data(0),
dataIsOwned(FALSE)
{
}
// Copy constructor. The table data is NOT copied: the new object aliases
// that.data and records dataIsOwned = FALSE, so it will never delete it.
// The rule pattern and the cached cursors are not copied either; they
// start out null.
// NOTE(review): nothing visible here keeps that.data alive for the copy's
// lifetime -- presumably the data lives in a cache; confirm before relying on it.
RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
: Collator(that),
isOverIgnore(that.isOverIgnore),
mPattern(0),
sourceCursor(0),
targetCursor(0),
dataIsOwned(FALSE),
data(that.data) // Alias the data pointer
{
}
// Equality test. Two collators are equal when they are the same object, or
// when they have the same dynamic class, compare equal at the Collator
// level, agree on isOverIgnore and share the very same table data pointer.
bool_t
RuleBasedCollator::operator==(const Collator& that) const
{
    // Same object: trivially equal.
    if (this == &that)
    {
        return TRUE;
    }

    // Different class, or the base-class state differs: not equal.
    if (this->getDynamicClassID() != that.getDynamicClassID()
        || !Collator::operator==(that))
    {
        return FALSE;
    }

    // Safe to downcast now that the dynamic class is known to match.
    const RuleBasedCollator& other = (const RuleBasedCollator&)that;
    const bool_t sameState = (isOverIgnore == other.isOverIgnore)
                             && (data == other.data);
    return sameState ? TRUE : FALSE;
}
// Assignment. Copies the Collator-level state and isOverIgnore, discards
// this object's own table data (if owned) and pending rule pattern, and
// then aliases the right-hand side's table data without taking ownership.
RuleBasedCollator&
RuleBasedCollator::operator=(const RuleBasedCollator& that)
{
    // Self-assignment changes nothing.
    if (this == &that)
    {
        return *this;
    }

    Collator::operator=(that);
    isOverIgnore = that.isOverIgnore;

    // Release what this object currently holds.
    if (dataIsOwned)
    {
        delete data;
    }
    delete mPattern;
    mPattern = 0;

    // Share the other collator's data; we do not own it.
    data = that.data;
    dataIsOwned = FALSE;

    return *this;
}
// Builds a collator from a rule string, using the base class defaults for
// strength and decomposition. Nothing is built if 'status' already
// indicates a failure on entry.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     UErrorCode& status)
    : Collator(),
      isOverIgnore(FALSE),
      mPattern(0),
      sourceCursor(0),
      targetCursor(0),
      data(0),
      dataIsOwned(FALSE)
{
    if (SUCCESS(status))
    {
        constructFromRules(rules, status);
    }
}
// Builds a collator from a rule string with an explicit collation strength;
// the decomposition mode passed to the base class is Normalizer::NO_OP.
// Nothing is built if 'status' already indicates a failure on entry.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     ECollationStrength collationStrength,
                                     UErrorCode& status)
    : Collator(collationStrength, Normalizer::NO_OP),
      isOverIgnore(FALSE),
      mPattern(0),
      sourceCursor(0),
      targetCursor(0),
      data(0),
      dataIsOwned(FALSE)
{
    if (SUCCESS(status))
    {
        constructFromRules(rules, status);
    }
}
// Builds a collator from a rule string with an explicit decomposition mode;
// the strength passed to the base class is TERTIARY.
// Nothing is built if 'status' already indicates a failure on entry.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     Normalizer::EMode decompositionMode,
                                     UErrorCode& status)
    : Collator(TERTIARY, decompositionMode),
      isOverIgnore(FALSE),
      mPattern(0),
      sourceCursor(0),
      targetCursor(0),
      data(0),
      dataIsOwned(FALSE)
{
    if (SUCCESS(status))
    {
        constructFromRules(rules, status);
    }
}
// Builds a collator from a rule string with both the collation strength and
// the decomposition mode supplied by the caller.
// Nothing is built if 'status' already indicates a failure on entry.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     ECollationStrength collationStrength,
                                     Normalizer::EMode decompositionMode,
                                     UErrorCode& status)
    : Collator(collationStrength, decompositionMode),
      isOverIgnore(FALSE),
      mPattern(0),
      sourceCursor(0),
      targetCursor(0),
      data(0),
      dataIsOwned(FALSE)
{
    if (SUCCESS(status))
    {
        constructFromRules(rules, status);
    }
}
// Construct this collator's rule set from its string representation.
//
// rules   the collation rules to build from; a bogus string is reported as
//         U_MEMORY_ALLOCATION_ERROR.
// status  in/out error code. Nothing happens if it already indicates a
//         failure. Set to U_MEMORY_ALLOCATION_ERROR if the table data
//         cannot be allocated; otherwise it carries whatever build() reports.
//
// Any table data this object owns is released first, the strength is reset
// to TERTIARY, and on success the freshly allocated table data is owned by
// this object.
void RuleBasedCollator::constructFromRules(const UnicodeString& rules,
                                           UErrorCode& status)
{
    if (FAILURE(status))
    {
        return;
    }

    if (rules.isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (dataIsOwned)
    {
        delete data;
        data = 0;
    }

    isOverIgnore = FALSE;
    setStrength(Collator::TERTIARY);

    data = new TableCollationData;
    // Guard against a failed allocation as well as a half-built object;
    // 'delete' on a null pointer is a no-op.
    if (data == 0 || data->isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete data;
        data = 0;
        return;
    }

    // We constructed the data using the build method, so we own it.
    dataIsOwned = TRUE;

    // This method can run more than once on the same object (for example
    // when a locale's rules fail to parse and the default rules are used
    // instead). Release the pattern left over from the previous run rather
    // than just dropping the pointer, which would leak it.
    delete mPattern;
    mPattern = 0;

    // Now that we've got all the buffers allocated, do the actual work
    build(rules, status);
}
// Reads in a flattened RuleBasedCollator that was previously streamed out
// using the streamOut() method.
//
// fileName  full pathname, valid on the local environment, of the binary
//           collation file.
// status    in/out error code. Nothing happens if it already indicates a
//           failure. On return it is one of:
//             U_ZERO_ERROR              the stream reported no error;
//             U_FILE_ACCESS_ERROR       the file could not be opened;
//             U_MEMORY_ALLOCATION_ERROR the streamed-in data is bogus;
//             U_MISSING_RESOURCE_ERROR  any other stream failure.
//
// Whatever table data and rule pattern the object held before is given up:
// owned data is deleted, aliased (non-owned) data is simply forgotten. On a
// failed load 'data' is left null.
void
RuleBasedCollator::constructFromFile(const char* fileName,
                                     UErrorCode& status)
{
    if (FAILURE(status))
    {
        return;
    }

    if (dataIsOwned)
    {
        delete data;
    }
    // Drop the pointer even when we did not own it. The code below deletes
    // 'data' on failure and claims ownership afterwards, which must never
    // apply to table data that is merely aliased from another collator.
    data = 0;
    dataIsOwned = FALSE;

    // The pattern is owned by this object (the destructor deletes it), so
    // release it instead of just clearing the pointer.
    delete mPattern;
    mPattern = 0;

    isOverIgnore = FALSE;
    setStrength(Collator::TERTIARY); // This is the default strength

    FileStream* ifs = T_FileStream_open(fileName, "rb");
    if (ifs == 0) {
        status = U_FILE_ACCESS_ERROR;
        return;
    }

    // The streamIn function does the actual work here...
    RuleBasedCollatorStreamer::streamIn(this, ifs);

    if (!T_FileStream_error(ifs))
    {
        status = U_ZERO_ERROR;
    }
    else if (data && data->isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete data;
        data = 0;
    }
    else
    {
        status = U_MISSING_RESOURCE_ERROR;
        delete data;
        data = 0;
    }

#ifdef COLLDEBUG
    fprintf(stderr, "binary read %s size %d, %s\n", fileName, T_FileStream_size(ifs), errorName(status));
#endif

    // We constructed the data when streaming it in, so we own it
    dataIsOwned = TRUE;

    T_FileStream_close(ifs);
}
// Locale-based constructor.
//
// desiredLocale  the locale whose collation is wanted.
// status         in/out error code. Nothing is loaded if it already
//                indicates a failure. It is set to U_USING_FALLBACK_ERROR
//                when a less specific form of the desired locale is tried,
//                to U_USING_DEFAULT_ERROR once the default locale or the
//                default collation is tried, and to
//                U_MEMORY_ALLOCATION_ERROR (followed by an immediate
//                return) when a load attempt runs out of memory.
//
// Each pass of the loop tries the current locale name first in the
// in-memory cache and then on disk; on failure the name is chopped to a
// less specific one. When the name becomes empty the 'next' state machine
// moves on to the default locale, then to ResourceBundle::kDefaultFilename,
// and finally builds the collator from DEFAULTRULES.
RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale,
UErrorCode& status)
: Collator(),
isOverIgnore(FALSE),
dataIsOwned(FALSE),
data(0),
sourceCursor(0),
targetCursor(0),
mPattern(0)
{
if (FAILURE(status))
{
return;
}
// Try to load, in order:
// 1. The desired locale's collation.
// 2. A fallback of the desired locale.
// 3. The default locale's collation.
// 4. A fallback of the default locale.
// 5. The default collation rules, which contains en_US collation rules.
// To reiterate, we try:
// Specific:
// language+country+variant
// language+country
// language
// Default:
// language+country+variant
// language+country
// language
// Root: (aka DEFAULTRULES)
UnicodeString localeName;
desiredLocale.getName(localeName);
// 'next' records which fallback stage to enter the next time the locale
// name has been chopped down to nothing.
enum { eTryDefaultLocale, eTryDefaultCollation, eDone } next = eTryDefaultLocale;
for (;;)
{
if (localeName.size() == 0)
{
if (next == eDone)
{
// We've failed to load a locale, but should never return U_MISSING_RESOURCE_ERROR
UErrorCode intStatus = U_ZERO_ERROR;
constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus);
if (intStatus == U_ZERO_ERROR)
{
status = U_USING_DEFAULT_ERROR;
}
else
{
status = intStatus; // bubble back
}
// NOTE(review): only an allocation failure returns here; any other
// failure from constructFromRules() still goes on to dereference
// 'data' below -- confirm that 'data' is always valid in that case.
if (status == U_MEMORY_ALLOCATION_ERROR)
{
return;
}
data->desiredLocale = desiredLocale;
desiredLocale.getName(localeName);
data->realLocaleName = localeName;
addToCache(localeName);
setDecomposition(Normalizer::NO_OP);
// The returned reference is not used; the call is kept for the work
// getRules() does (it emits the pending pattern into data->ruleTable).
const UnicodeString& rules = getRules();
break;
}
// We've exhausted our inheritance attempts with this locale.
// Try the next step.
switch (next)
{
case eTryDefaultLocale:
status = U_USING_DEFAULT_ERROR;
Locale::getDefault().getName(localeName);
next = eTryDefaultCollation;
break;
case eTryDefaultCollation:
// There is no distinction between this condition of
// using a default collation object and the condition of
// using a default locale to get a collation object currently.
// That is, the caller can't distinguish based on UErrorCode.
status = U_USING_DEFAULT_ERROR;
localeName = ResourceBundle::kDefaultFilename;
next = eDone;
break;
}
}
// First try to load the collation from the in-memory static cache.
// Note that all of the caching logic is handled here, and in the
// call to RuleBasedCollator::addToCache, below.
UErrorCode intStatus = U_ZERO_ERROR;
constructFromCache(localeName, intStatus);
if (SUCCESS(intStatus))
{
break; // Done!
}
// The collation we want is not in the cache. The second thing
// to try is loading from a file, either binary or ASCII. So:
// Try to load the locale's collation data. This will try to load
// a binary collation file, or if that is unavailable, it will go
// to the text resource bundle file (with the corresponding name)
// and try to get the collation table there.
intStatus = U_ZERO_ERROR;
constructFromFile(desiredLocale, localeName, TRUE, intStatus);
if (SUCCESS(intStatus))
{
// If we succeeded in loading the collation from a file, now is the
// time to add it to the in-memory cache. We record the real
// location at which the collation data was found, so we can reload
// the rule table quickly, if it is requested, in the future.
// See getRules().
data->desiredLocale = desiredLocale;
data->realLocaleName = localeName;
addToCache(localeName);
setDecomposition(Normalizer::NO_OP);
break; // Done!
}
if (intStatus == U_MEMORY_ALLOCATION_ERROR)
{
status = intStatus;
return;
}
// Having failed, chop off the end of the locale name, making
// it less specific, and try again. Indicate the use of a
// fallback locale, unless we've already fallen through to
// a default locale -- then leave the status as is.
if (status == U_ZERO_ERROR)
{
status = U_USING_FALLBACK_ERROR;
}
chopLocale(localeName);
}
}
// Creates the collation data by reading from a file named after
// 'localeFileName'.
//
// locale          the locale the caller originally asked for (not consulted
//                 by this method).
// localeFileName  base name of the file(s) to look in.
// tryBinaryFile   if TRUE, try the binary collation file first, and write
//                 one out after a successful load from the text source.
// status          in/out error code. Nothing happens if it already
//                 indicates a failure. U_MEMORY_ALLOCATION_ERROR is
//                 reported for allocation failures,
//                 U_MISSING_RESOURCE_ERROR when the bundle has no
//                 "CollationElements" entry, and U_USING_DEFAULT_ERROR when
//                 the locale's rules could not be built and DEFAULTRULES
//                 were used instead.
//
// It does not employ the usual FILE search mechanism with locales, default
// locales, and base locales. Instead, it tries to look only in files with
// the given localeFileName. It does, however, employ the LOCALE search
// mechanism.
//
// This method maintains the binary collation files. If a collation is not
// present in binary form, but is present in text form (in a resource bundle
// file), it will be loaded in text form, and then written to disk.
//
// The path buffer returned by createPathName() is released on every exit
// path.
void
RuleBasedCollator::constructFromFile(const Locale& locale,
                                     const UnicodeString& localeFileName,
                                     bool_t tryBinaryFile,
                                     UErrorCode& status)
{
    if(FAILURE(status)) {
        return;
    }

    if(dataIsOwned) {
        delete data;
        data = 0;
    }

    char *binaryFilePath = createPathName(Locale::getDataDirectory(),
                                          localeFileName, kFilenameSuffix);

    if(tryBinaryFile) {
        // Try to load up the collation from a binary file first
        constructFromFile(binaryFilePath, status);
#ifdef COLLDEBUG
        cerr << localeFileName << " binary load " << errorName(status) << endl;
#endif
        if(SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
            delete [] binaryFilePath;
            return;
        }
    }

    // Now try to load it up from a resource bundle text source file
    // NOTE(review): after a failed binary load 'status' still holds that
    // failure when it is handed to the ResourceBundle constructor --
    // presumably the constructor overwrites it; confirm.
    ResourceBundle bundle(Locale::getDataDirectory(), localeFileName, status);

    // if there is no resource bundle file for the given locale, break out
    if(FAILURE(status)) {
        delete [] binaryFilePath;
        return;
    }

#ifdef COLLDEBUG
    cerr << localeFileName << " ascii load " << errorName(status) << endl;
#endif

    // check and see if this resource bundle contains collation data
    UnicodeString colString;
    UErrorCode intStatus = U_ZERO_ERROR;

    bundle.getString("CollationElements", colString, intStatus);
    if(colString.isBogus()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete [] binaryFilePath;
        return;
    }

    // if this bundle doesn't contain collation data, break out
    if(FAILURE(intStatus)) {
        status = U_MISSING_RESOURCE_ERROR;
        delete [] binaryFilePath;
        return;
    }

    // Having loaded the collation from the resource bundle text file,
    // now retrieve the CollationElements tagged data, merged with the
    // default rules. If that fails, use the default rules alone.
    colString.insert(0, DEFAULTRULES);
    if(colString.isBogus()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete [] binaryFilePath;
        return;
    }

    constructFromRules(colString, intStatus);
    if(intStatus == U_MEMORY_ALLOCATION_ERROR) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete [] binaryFilePath;
        return;
    }

    if(intStatus != U_ZERO_ERROR) {
        status = U_USING_DEFAULT_ERROR;

        // predefined tables should contain correct grammar
        intStatus = U_ZERO_ERROR;
        constructFromRules(DEFAULTRULES, intStatus);
        if(intStatus != U_ZERO_ERROR) {
            status = intStatus;
        }
    }

#ifdef COLLDEBUG
    cerr << localeFileName << " ascii load " << (SUCCESS(status) ? "OK" : "Failed") << endl;
#endif

    if(SUCCESS(status) && tryBinaryFile) {
        // If we get a RuleBasedCollator result, even if it is derived
        // from a default or a fallback, then we write it out as a
        // binary file to the disk. The next time the system wants to
        // get this collation, it will load up very quickly from the
        // binary file.
        bool_t ok = writeToFile(binaryFilePath);
#ifdef COLLDEBUG
        cerr << localeFileName << " binary write " << (ok? "OK" : "Failed") << endl;
#endif
    }

    // Always release the path buffer, not only after a binary write.
    delete [] binaryFilePath;
}
// Destructor. The table data is deleted only when this object owns it
// (copies and assigned-to collators merely alias it); the cached source and
// target cursors and the pending rule pattern are always deleted.
RuleBasedCollator::~RuleBasedCollator()
{
// Aliased data belongs to someone else and must not be freed here.
if (dataIsOwned)
{
delete data;
}
data = 0;
delete sourceCursor;
sourceCursor = 0;
delete targetCursor;
targetCursor = 0;
delete mPattern;
mPattern = 0;
}
// Returns a newly allocated copy of this collator, made with the copy
// constructor. The caller receives the raw pointer.
Collator*
RuleBasedCollator::clone() const
{
    Collator* copy = new RuleBasedCollator(*this);
    return copy;
}
// Create a CollationElementIterator object that will iterate over the
// elements in a string, using the collation rules defined in this
// RuleBasedCollator.
//
// source  the string to iterate over.
// Returns a newly allocated iterator, or NULL if the iterator could not be
// allocated or its construction reported a failure. A partially
// constructed iterator is deleted before NULL is returned, so nothing leaks.
CollationElementIterator*
RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const
{
    UErrorCode status = U_ZERO_ERROR;
    CollationElementIterator *newCursor =
        new CollationElementIterator(source, this, status);

    if (newCursor == NULL)
    {
        return NULL;
    }

    if (FAILURE(status))
    {
        // Construction failed: release the object instead of leaking it.
        delete newCursor;
        return NULL;
    }

    return newCursor;
}
// Create a CollationElementIterator object that will iterate over the
// elements supplied by a CharacterIterator, using the collation rules
// defined in this RuleBasedCollator.
//
// source  the character iterator providing the text.
// Returns a newly allocated iterator, or NULL if the iterator could not be
// allocated or its construction reported a failure. A partially
// constructed iterator is deleted before NULL is returned, so nothing leaks.
CollationElementIterator*
RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const
{
    UErrorCode status = U_ZERO_ERROR;
    CollationElementIterator *newCursor =
        new CollationElementIterator(source, this, status);

    if (newCursor == NULL)
    {
        return NULL;
    }

    if (FAILURE(status))
    {
        // Construction failed: release the object instead of leaking it.
        delete newCursor;
        return NULL;
    }

    return newCursor;
}
// Return a string representation of this collator's rules.
// The string can later be passed to the constructor that takes a
// UnicodeString argument, which will construct a collator that's
// functionally identical to this one.
// You can also allow users to edit the string in order to change
// the collation data, or you can print it out for inspection, or whatever.
//
// The rule text is produced lazily, in one of two ways:
//  - if a rule pattern (mPattern) is still pending, it is emitted into
//    data->ruleTable and then deleted;
//  - otherwise, if the rule table has not been loaded yet, a temporary
//    collator is built from the text resource for data->realLocaleName and
//    its rules are copied into data->ruleTable.
// Although the method is const, it updates data->ruleTable and
// data->isRuleTableLoaded and clears mPattern.
// NOTE(review): 'data' is dereferenced without a null check; a
// default-constructed collator has data == 0 -- confirm callers never reach
// this method in that state.
const UnicodeString&
RuleBasedCollator::getRules() const
{
if (mPattern != 0)
{
// Obtain a writable reference to the member so it can be deleted and
// cleared from inside this const method.
MergeCollation*& nonConstMPattern = *(MergeCollation**)&mPattern;
mPattern->emitPattern(data->ruleTable);
data->isRuleTableLoaded = TRUE;
delete nonConstMPattern;
nonConstMPattern = 0;
}
else if (!data->isRuleTableLoaded)
{
// At this point the caller wants the rules, but the rule table data
// is not loaded. Furthermore, there is no mPattern object to load
// the rules from. Therefore, we fetch the rules off the disk.
// Notice that we pass in a tryBinaryFile value of FALSE, since
// by design the binary file has NO rules in it!
RuleBasedCollator temp;
UErrorCode status = U_ZERO_ERROR;
temp.constructFromFile(data->desiredLocale, data->realLocaleName, FALSE, status);
// We must check that mPattern is nonzero here, or we run the risk
// of an infinite loop.
if (SUCCESS(status) && temp.mPattern != 0)
{
// temp has a pending pattern, so this nested call takes the first
// branch above and cannot come back here.
data->ruleTable = temp.getRules();
data->isRuleTableLoaded = TRUE;
#ifdef _DEBUG
// the following is useful for specific debugging purposes
// UnicodeString name;
// cerr << "Table collation rules loaded dynamically for "
// << data->desiredLocale.getName(name)
// << " at "
// << data->realLocaleName
// << ", " << dec << data->ruleTable.size() << " characters"
// << endl;
#endif
}
else
{
// Loading failed: outside debug builds this is silent and the rule
// table is returned as it currently stands.
#ifdef _DEBUG
UnicodeString name;
cerr << "Unable to load table collation rules dynamically for "
<< data->desiredLocale.getName(name)
<< " at "
<< data->realLocaleName
<< endl;
cerr << "Status " << errorName(status) << ", mPattern " << temp.mPattern << endl;
#endif
}
}
return data->ruleTable;
}
// Compares at most the first 'length' characters of 'source' and 'target'.
// Each string is cut down to min(length, its own size) characters, and the
// two prefixes are then compared with the two-argument compare().
Collator::EComparisonResult
RuleBasedCollator::compare(const UnicodeString& source,
                           const UnicodeString& target,
                           int32_t length) const
{
    UTextOffset start = 0;

    // Extract the leading part of the source string.
    UnicodeString sourcePrefix;
    source.extract(start, icu_min(length, source.size()), sourcePrefix);

    // Extract the leading part of the target string.
    UnicodeString targetPrefix;
    target.extract(start, icu_min(length, target.size()), targetPrefix);

    return RuleBasedCollator::compare(sourcePrefix, targetPrefix);
}
// Compare two strings using this collator
//
// source, target  the strings to compare.
// Returns LESS, EQUAL or GREATER for source relative to target.
// EQUAL is also returned when either string is bogus, when a cursor cannot
// be created, or when iteration reports a failure -- the caller cannot
// tell these cases apart from a genuine tie.
//
// The collation element iterators are created on first use and cached in
// sourceCursor/targetCursor through a cast that removes const.
// NOTE(review): because of that cached, mutable state, concurrent calls on
// one collator presumably interfere with each other -- confirm.
Collator::EComparisonResult
RuleBasedCollator::compare(const UnicodeString& source,
const UnicodeString& target) const
{
// check if source and target are valid strings
if (source.isBogus() || target.isBogus())
{
return Collator::EQUAL;
}
Collator::EComparisonResult result = Collator::EQUAL;
UErrorCode status = U_ZERO_ERROR;
// The basic algorithm here is that we use CollationElementIterators
// to step through both the source and target strings. We compare each
// collation element in the source string against the corresponding one
// in the target, checking for differences.
//
// If a difference is found, we set <result> to LESS or GREATER to
// indicate whether the source string is less or greater than the target.
//
// However, it's not that simple. If we find a tertiary difference
// (e.g. 'A' vs. 'a') near the beginning of a string, it can be
// overridden by a primary difference (e.g. "A" vs. "B") later in
// the string. For example, "AA" < "aB", even though 'A' > 'a'.
//
// To keep track of this, we use checkSecTer and checkTertiary to keep
// track of the strength of the most significant difference that has been
// found so far. When we find a difference whose strength is greater than
// the previous ones, it overrides the last difference (if any) that
// was found.
//
if (sourceCursor == NULL)
{
((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source);
}
else
{
sourceCursor->setText(source, status);
}
if (sourceCursor == NULL || FAILURE(status))
{
return Collator::EQUAL;
}
if (targetCursor == NULL)
{
((RuleBasedCollator *)this)->targetCursor = createCollationElementIterator(target);
}
else
{
targetCursor->setText(target, status);
}
if (targetCursor == NULL || FAILURE(status))
{
return Collator::EQUAL;
}
int32_t sOrder, tOrder;
// gets/gett say whether to fetch a new source/target element on the next
// pass; both start TRUE, so sOrder and tOrder are assigned before use.
bool_t gets = TRUE, gett = TRUE;
bool_t initialCheckSecTer = getStrength() >= Collator::SECONDARY;
bool_t checkSecTer = initialCheckSecTer;
bool_t checkTertiary = getStrength() >= Collator::TERTIARY;
bool_t isFrenchSec = data->isFrenchSec;
uint32_t pSOrder, pTOrder;
while(TRUE)
{
// Get the next collation element in each of the strings, unless
// we've been requested to skip it.
if (gets)
{
sOrder = sourceCursor->next(status);
if (FAILURE(status))
{
return Collator::EQUAL;
}
}
gets = TRUE;
if (gett)
{
tOrder = targetCursor->next(status);
if (FAILURE(status))
{
return Collator::EQUAL;
}
}
gett = TRUE;
// If we've hit the end of one of the strings, jump out of the loop
if ((sOrder == CollationElementIterator::NULLORDER)||
(tOrder == CollationElementIterator::NULLORDER))
{
break;
}
// If there's no difference at this position, we can skip to the
// next one.
pSOrder = CollationElementIterator::primaryOrder(sOrder);
pTOrder = CollationElementIterator::primaryOrder(tOrder);
if (sOrder == tOrder)
{
if (isFrenchSec && pSOrder != 0)
{
if (!checkSecTer)
{
// in french, a secondary difference more to the right is stronger,
// so accents have to be checked with each base element
checkSecTer = initialCheckSecTer;
// but tertiary differences are less important than the first
// secondary difference, so checking tertiary remains disabled
checkTertiary = FALSE;
}
}
continue;
}
// Compare primary differences first.
if (pSOrder != pTOrder)
{
if (sOrder == 0)
{
// The entire source element is ignorable.
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
continue;
}
if (tOrder == 0)
{
// The entire target element is ignorable.
// Skip to the next target element, but don't fetch another source element.
gets = FALSE;
continue;
}
// The source and target elements aren't ignorable, but it's still possible
// for the primary component of one of the elements to be ignorable....
if (pSOrder == 0) // primary order in source is ignorable
{
// The source's primary is ignorable, but the target's isn't. We treat ignorables
// as a secondary difference, so remember that we found one.
if (checkSecTer)
{
result = Collator::GREATER; // (strength is SECONDARY)
checkSecTer = FALSE;
}
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
}
else if (pTOrder == 0)
{
// record differences - see the comment above.
if (checkSecTer)
{
result = Collator::LESS; // (strength is SECONDARY)
checkSecTer = FALSE;
}
// Skip to the next target element, but don't fetch another source element.
gets = FALSE;
}
else
{
// Neither of the orders is ignorable, and we already know that the primary
// orders are different because of the (pSOrder != pTOrder) test above.
// Record the difference and stop the comparison.
if (pSOrder < pTOrder)
{
return Collator::LESS; // (strength is PRIMARY)
}
return Collator::GREATER; // (strength is PRIMARY)
}
}
else
{ // else of if ( pSOrder != pTOrder )
// primary order is the same, but complete order is different. So there
// are no base elements at this point, only ignorables (Since the strings are
// normalized)
if (checkSecTer)
{
// a secondary or tertiary difference may still matter
uint32_t secSOrder = CollationElementIterator::secondaryOrder(sOrder);
uint32_t secTOrder = CollationElementIterator::secondaryOrder(tOrder);
if (secSOrder != secTOrder)
{
// there is a secondary difference
result = (secSOrder < secTOrder) ? Collator::LESS : Collator::GREATER;
// (strength is SECONDARY)
checkSecTer = FALSE;
// (even in french, only the first secondary difference within
// a base character matters)
}
else
{
if (checkTertiary)
{
// a tertiary difference may still matter
uint32_t terSOrder = CollationElementIterator::tertiaryOrder(sOrder);
uint32_t terTOrder = CollationElementIterator::tertiaryOrder(tOrder);
if (terSOrder != terTOrder)
{
// there is a tertiary difference
result = (terSOrder < terTOrder) ? Collator::LESS : Collator::GREATER;
// (strength is TERTIARY)
checkTertiary = FALSE;
}
}
}
} // if (checkSecTer)
} // if ( pSOrder != pTOrder )
} // while()
if (sOrder != CollationElementIterator::NULLORDER)
{
// (tOrder must be CollationElementIterator::NULLORDER,
// since this point is only reached when sOrder or tOrder is NULLORDER.)
// The source string has more elements, but the target string hasn't.
// NOTE(review): unlike the main loop, 'status' is not checked after
// next() in this trailing loop -- confirm that is acceptable.
do
{
if (CollationElementIterator::primaryOrder(sOrder) != 0)
{
// We found an additional non-ignorable base character in the source string.
// This is a primary difference, so the source is greater
return Collator::GREATER; // (strength is PRIMARY)
}
if (CollationElementIterator::secondaryOrder(sOrder) != 0)
{
// Additional secondary elements mean the source string is greater
if (checkSecTer)
{
result = Collator::GREATER; // (strength is SECONDARY)
checkSecTer = FALSE;
}
}
}
while ((sOrder = sourceCursor->next(status)) != CollationElementIterator::NULLORDER);
}
else if (tOrder != CollationElementIterator::NULLORDER)
{
// The target string has more elements, but the source string hasn't.
// NOTE(review): 'status' is not checked after next() here either.
do
{
if (CollationElementIterator::primaryOrder(tOrder) != 0)
{
// We found an additional non-ignorable base character in the target string.
// This is a primary difference, so the source is less
return Collator::LESS; // (strength is PRIMARY)
}
if (CollationElementIterator::secondaryOrder(tOrder) != 0)
{
// Additional secondary elements in the target mean the source string is less
if (checkSecTer)
{
result = Collator::LESS; // (strength is SECONDARY)
checkSecTer = FALSE;
}
}
}
while ((tOrder = targetCursor->next(status)) != CollationElementIterator::NULLORDER);
}
// For IDENTICAL comparisons, we use a bitwise character comparison
// as a tiebreaker if all else is equal
// NOTE: The java code compares result with 0, and
// puts the result of the string comparison directly into result
if (result == Collator::EQUAL && getStrength() == IDENTICAL)
{
UnicodeString sourceDecomp, targetDecomp;
int8_t comparison;
// NOTE(review): the status set by normalize() is not inspected before
// the normalized strings are compared.
Normalizer::normalize(source, getDecomposition(),
0, sourceDecomp, status);
Normalizer::normalize(target, getDecomposition(),
0, targetDecomp, status);
comparison = sourceDecomp.compare(targetDecomp);
if (comparison < 0)
{
result = Collator::LESS;
}
else if (comparison == 0)
{
result = Collator::EQUAL;
}
else
{
result = Collator::GREATER;
}
}
return result;
}
// Retrieve a collation key for the specified string
// The key can be compared with other collation keys using a bitwise comparison
// (e.g. memcmp) to find the ordering of their respective source strings.
// This is handy when doing a sort, where each sort key must be compared
// many times.
//
// The basic algorithm here is to find all of the collation elements for each
// character in the source string, convert them to an ASCII representation,
// and put them into the collation key. But it's trickier than that.
// Each collation element in a string has three components: primary ('A' vs 'B'),
// secondary ('u' vs 'u-umlaut'), and tertiary ('A' vs 'a'), and a primary difference
// at the end of a string takes precedence over a secondary or tertiary
// difference earlier in the string.
//
// To account for this, we put all of the primary orders at the beginning of the
// string, followed by the secondary and tertiary orders. Each set of orders is
// terminated by nulls so that a key for a string which is an initial substring of
// another key will compare less without any special case.
//
// Here's a hypothetical example, with the collation element represented as
// a three-digit number, one digit for primary, one for secondary, etc.
//
// String: A a B 'E-acute'
// Collation Elements: 101 100 201 511
// Collation Key: 1125<null>0001<null>1011<null>
//
// To make things even trickier, secondary differences (accent marks) are compared
// starting at the *end* of the string in languages with French secondary ordering.
// But when comparing the accent marks on a single base character, they are compared
// from the beginning. To handle this, we reverse all of the accents that belong
// to each base character, then we reverse the entire string of secondary orderings
// at the end.
//
// Computes the collation key for `source`, stores it in `sortkey` and returns
// a reference to `sortkey`.
//
// source   the string to build a key for
// sortkey  receives the key; set to bogus when the key cannot be built
// status   in/out error code.  A failure on entry is replaced by
//          U_ILLEGAL_ARGUMENT_ERROR; a bogus source or a failed buffer
//          allocation is reported as U_MEMORY_ALLOCATION_ERROR.
//
// The source is walked twice: the first pass counts the entries of each
// strength level so the key buffer can be sized, the second pass writes them.
CollationKey&
RuleBasedCollator::getCollationKey( const UnicodeString& source,
                                    CollationKey& sortkey,
                                    UErrorCode& status) const
{
    if (FAILURE(status))
    {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return sortkey.setToBogus();
    }

    if (source.isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        return sortkey.setToBogus();
    }

    // An empty string gets a reset (not bogus) key.
    if (source.size() == 0)
    {
        return sortkey.reset();
    }

    // The element iterator is cached on the collator and reused between calls;
    // the cast is needed because this method is const.
    if (sourceCursor == NULL)
    {
        ((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source);
    }
    else
    {
        sourceCursor->setText(source, status);
    }

    if (sourceCursor == NULL || FAILURE(status))
    {
        return sortkey.setToBogus();
    }

    bool_t  compareSec   = (getStrength() >= Collator::SECONDARY);
    bool_t  compareTer   = (getStrength() >= Collator::TERTIARY);
    bool_t  compareIdent = (getStrength() == Collator::IDENTICAL);
    int32_t order        = 0;
    int32_t totalPrimary = 0;
    int32_t totalSec     = 0;
    int32_t totalTer     = 0;
    int32_t totalIdent   = 0;
    UnicodeString decomp;

    // Pass 1: iterate over the source, counting primary, secondary, and tertiary entries
    while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER)
    {
        if (FAILURE(status))
        {
            return sortkey.setToBogus();
        }

        int32_t secOrder = CollationElementIterator::secondaryOrder(order);
        int32_t terOrder = CollationElementIterator::tertiaryOrder(order);

        if (! CollationElementIterator::isIgnorable(order))
        {
            totalPrimary += 1;

            if (compareSec)
            {
                totalSec += 1;
            }

            if (compareTer)
            {
                totalTer += 1;
            }
        }
        else
        {
            // Ignorable elements contribute only the levels on which they differ.
            if (compareSec && secOrder != 0)
            {
                totalSec += 1;
            }

            if (compareTer && terOrder != 0)
            {
                totalTer += 1;
            }
        }
    }

    // count the null terminators after the entries
    totalPrimary += 1;

    if (compareSec)
    {
        totalSec += 1;
    }

    if (compareTer)
    {
        totalTer += 1;
    }

    if (compareIdent)
    {
        Normalizer::normalize(source, getDecomposition(),
                              0, decomp, status);

        // The identical-level section is sized from decomp and written from it
        // below.  If normalization failed there is no valid text to size or
        // write, so give up rather than store into an undersized buffer.
        if (FAILURE(status))
        {
            return sortkey.setToBogus();
        }

        totalIdent = decomp.size() + 1;
    }

    // Compute total number of bytes to hold the entries (two bytes are reserved
    // per entry) and make sure the key can hold them
    uint32_t size = 2 * (totalPrimary + totalSec + totalTer + totalIdent);

    sortkey.ensureCapacity(size);

    if (sortkey.isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        return sortkey;
    }

    // Each level is written into its own region of the key, in the order
    // primary, secondary, tertiary, identical.
    int32_t primaryCursor = 0;
    int32_t secCursor     = 2 * totalPrimary;
    int32_t secBase       = secCursor;
    int32_t preSecIgnore  = secBase;    // start of the accents trailing the current base character
    int32_t terCursor     = secCursor + (2 * totalSec);
    int32_t identCursor   = terCursor + (2 * totalTer);

    // reset source to the beginning
    sourceCursor->reset();

    // Pass 2: iterate over the source again, storing the actual entries
    while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER)
    {
        if (FAILURE(status))
        {
            return sortkey.reset();
        }

        int32_t primaryOrder = CollationElementIterator::primaryOrder(order);
        int32_t secOrder     = CollationElementIterator::secondaryOrder(order);
        int32_t terOrder     = CollationElementIterator::tertiaryOrder(order);

        if (! CollationElementIterator::isIgnorable(order))
        {
            primaryCursor = sortkey.storeBytes(primaryCursor, primaryOrder + SORTKEYOFFSET);

            if (compareSec)
            {
                // French secondary ordering: reverse the accents collected for
                // the previous base character before starting a new one.
                if (data->isFrenchSec && (preSecIgnore < secCursor))
                {
                    sortkey.reverseBytes(preSecIgnore, secCursor);
                }

                secCursor = sortkey.storeBytes(secCursor, secOrder + SORTKEYOFFSET);

                preSecIgnore = secCursor;
            }

            if (compareTer)
            {
                terCursor = sortkey.storeBytes(terCursor, terOrder + SORTKEYOFFSET);
            }
        }
        else
        {
            // Ignorable elements are stored shifted by the maximum order of
            // their level in addition to SORTKEYOFFSET.
            if (compareSec && secOrder != 0)
            {
                secCursor = sortkey.storeBytes(secCursor, secOrder + data->maxSecOrder + SORTKEYOFFSET);
            }

            if (compareTer && terOrder != 0)
            {
                terCursor = sortkey.storeBytes(terCursor, terOrder + data->maxTerOrder + SORTKEYOFFSET);
            }
        }
    }

    // append 0 at the end of each portion.
    sortkey.storeBytes(primaryCursor, 0);

    if (compareSec)
    {
        if (data->isFrenchSec)
        {
            // Reverse the accents of the last base character, then the whole
            // secondary section, as described in the comment above this method.
            if (preSecIgnore < secCursor)
            {
                sortkey.reverseBytes(preSecIgnore, secCursor);
            }

            sortkey.reverseBytes(secBase, secCursor);
        }

        sortkey.storeBytes(secCursor, 0);
    }

    if (compareTer)
    {
        sortkey.storeBytes(terCursor, 0);
    }

    if (compareIdent)
    {
        sortkey.storeUnicodeString(identCursor, decomp);
    }

    return sortkey;
}
// Build this collator's rule tables based on a string representation of the rules
// See the big diagram at the top of this file for an overview of how the tables
// are organized.
void
RuleBasedCollator::build(const UnicodeString& pattern,
UErrorCode& status)
{
if (FAILURE(status))
{
return;
}
// This array maps Unicode characters to their collation ordering
data->mapping = ucmp32_open(UNMAPPED);
if (data->mapping->fBogus)
{
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
Collator::ECollationStrength aStrength = Collator::IDENTICAL;
bool_t isSource = TRUE;
int32_t i = 0;
UnicodeString lastGroupChars;
UnicodeString expChars;
UnicodeString groupChars;
if (pattern.size() == 0)
{
status = U_INVALID_FORMAT_ERROR;
return;
}
// Build the merged collation entries
// Since rules can be specified in any order in the string
// (e.g. "c , C < d , D < e , E .... C < CH")
// this splits all of the rules in the string out into separate
// objects and then sorts them. In the above example, it merges the
// "C < CH" rule in just before the "C < D" rule.
mPattern = new MergeCollation(pattern, getDecomposition(), status);
if (FAILURE(status))
{
ucmp32_close(data->mapping);
data->mapping = 0;
delete mPattern;
mPattern = 0;
return;
}
int32_t order = 0;
// Walk through each entry
for (i = 0; i < mPattern->getCount(); ++i)
{
const PatternEntry* entry = mPattern->getItemAt(i);
groupChars.remove();
expChars.remove();
// if entry is valid
if (entry != NULL)
{
entry->getChars(groupChars);
// check if french secondary needs to be turned on
if ((groupChars.size() > 1) &&
(groupChars[groupChars.size()-(T_INT32(1))] == 0x0040))
{
data->isFrenchSec = TRUE;
groupChars.remove(groupChars.size()-(T_INT32(1)));
}
order = increment((Collator::ECollationStrength)entry->getStrength(), order);
if (entry->getExtension(expChars).size() != 0)
{
// encountered an expanding character, where one character on input
// expands to several sort elements (e.g. '<27>' --> 'o' 'e')
addExpandOrder(groupChars, expChars, order, status);
if (FAILURE(status))
{
return;
}
}
else if (groupChars.size() > 1)
{
// encountered a contracting character, where several characters on input
// contract into one sort order. For example, "ch" is treated as a single
// character in traditional Spanish sorting.
addContractOrder(groupChars, order, status);
if (FAILURE(status))
{
return;
}
}
else
{
// Nothing out of the ordinary -- one character maps to one sort order
addOrder(groupChars[0], order, status);
if (FAILURE(status))
{
return;
}
}
}
}
// add expanding entries for pre-composed characters
addComposedChars();
// Fill in all the expanding chars values
commit();
// Compact the data mapping table
ucmp32_compact(data->mapping, 1);
}
/**
* Add expanding entries for pre-composed unicode characters so that this
* collator can be used reasonably well with decomposition turned off.
*/
void RuleBasedCollator::addComposedChars()
{
UnicodeString buf;
UErrorCode status = U_ZERO_ERROR;
// Iterate through all of the pre-composed characters in Unicode
ComposedCharIter iter;
UnicodeString decomp;
while (iter.hasNext())
{
UChar c = iter.next();
if (getCharOrder(c) == UNMAPPED)
{
//
// We don't already have an ordering for this pre-composed character.
//
// First, see if the decomposed string is already in our
// tables as a single contracting-string ordering.
// If so, just map the precomposed character to that order.
//
// TODO: What we should really be doing here is trying to find the
// longest initial substring of the decomposition that is present
// in the tables as a contracting character sequence, and find its
// ordering. Then do this recursively with the remaining chars
// so that we build a list of orderings, and add that list to
// the expansion table.
// That would be more correct but also significantly slower, so
// I'm not totally sure it's worth doing.
//
iter.getDecomposition(decomp);
int contractOrder = getContractOrder(decomp);
if (contractOrder != UNMAPPED)
{
addOrder(c, contractOrder, status);
}
else
{
//
// We don't have a contracting ordering for the entire string
// that results from the decomposition, but if we have orders
// for each individual character, we can add an expanding
// table entry for the pre-composed character
//
bool_t allThere = TRUE;
int32_t i;
for (i = 0; i < decomp.size(); i += 1)
{
if (getCharOrder(decomp[i]) == UNMAPPED)
{
allThere = FALSE;
break;
}
}
if (allThere)
{
buf.remove();
buf += c;
addExpandOrder(buf, decomp, UNMAPPED, status);
}
}
}
}
}
// When the expanding character tables are built by addExpandOrder,
// it doesn't know what the final ordering of each character
// in the expansion will be. Instead, it just puts the raw character
// code into the table, adding CHARINDEX as a flag. Now that we've
// finished building the mapping table, we can go back and look up
// that character to see what its real collation order is and
// stick that into the expansion table. That lets us avoid doing
// a two-stage lookup later.
void
RuleBasedCollator::commit()
{
// if there are any expanding characters
if (data->expandTable != NULL)
{
int32_t i;
for (i = 0; i < data->expandTable->size(); i += 1)
{
VectorOfInt* valueList = data->expandTable->at(i);
int32_t j;
for (j = 0; j < valueList->size(); j++)
{
// found a expanding character
// the expanding char value is not filled in yet
if ((valueList->at(j) < EXPANDCHARINDEX) &&
(valueList->at(j) > CHARINDEX))
{
// Get the real values for the non-filled entry
UChar ch = (UChar)(valueList->at(j) - CHARINDEX);
int32_t realValue = ucmp32_get(data->mapping, ch);
if (realValue == UNMAPPED)
{
// The real value is still unmapped, maybe it'signorable
valueList->atPut(j, IGNORABLEMASK & ch);
}
// fill in the value
else
{
valueList->atPut(j, realValue);
}
}
}
}
}
}
/**
 * Compute the next collation order after lastValue for a rule of the given
 * strength.  Strengths other than PRIMARY, SECONDARY and TERTIARY return
 * lastValue unchanged.
 */
int32_t
RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t lastValue)
{
    int32_t bumped = lastValue;

    if (aStrength == Collator::PRIMARY)
    {
        // step the primary order and clear the secondary and tertiary parts
        bumped += PRIMARYORDERINCREMENT;
        bumped &= PRIMARYORDERMASK;
        isOverIgnore = TRUE;
    }
    else if (aStrength == Collator::SECONDARY)
    {
        // step the secondary order and clear the tertiary part
        bumped += SECONDARYORDERINCREMENT;
        bumped &= SECONDARYDIFFERENCEONLY;

        // until the first primary rule is seen, count the ignorable
        // characters that have a secondary difference
        if (!isOverIgnore)
        {
            data->maxSecOrder += 1;
        }
    }
    else if (aStrength == Collator::TERTIARY)
    {
        // step the tertiary order
        bumped += TERTIARYORDERINCREMENT;

        // until the first primary rule is seen, count the ignorable
        // characters that have a tertiary difference
        if (!isOverIgnore)
        {
            data->maxTerOrder += 1;
        }
    }

    return bumped;
}
// Record the collation order of a single character -- the plain case with
// neither expansion nor contraction.  Does nothing if status already
// indicates a failure.
void
RuleBasedCollator::addOrder(UChar ch,
                            int32_t anOrder,
                            UErrorCode& status)
{
    if (FAILURE(status))
    {
        return;
    }

    // what the mapping table currently holds for this character
    int32_t existing = ucmp32_get(data->mapping, ch);

    if (existing < CONTRACTCHARINDEX)
    {
        // Ordinary slot: store the order directly; a later entry for the
        // same character replaces the earlier one.
        ucmp32_set(data->mapping, ch, anOrder);
        return;
    }

    // The character already heads a contracting-character table, so its own
    // order has to live in that table rather than in the mapping table.
    key.remove();
    key += ch;

    if (key.isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    addContractOrder(key, anOrder, status);
}
// Add an expanding-character entry to the table.
void
RuleBasedCollator::addExpandOrder( const UnicodeString& contractChars,
const UnicodeString& expandChars,
int32_t anOrder,
UErrorCode& status)
{
if (FAILURE(status))
{
return;
}
// Create an expansion table entry
int32_t tableIndex = addExpansion(anOrder, expandChars);
// And add its index into the main mapping table
if (contractChars.size() > 1)
{
addContractOrder(contractChars, tableIndex, status);
}
else
{
addOrder(contractChars[0], tableIndex, status);
}
}
// Append a new list of collation orders to the expansion table and return
// the value (EXPANDCHARINDEX + position in the table) that refers to it.
//
// anOrder      if not UNMAPPED, stored as the first element of the list
// expandChars  the characters whose orders make up the rest of the list
//
// Returns 0 if the expansion table or the new list cannot be allocated; this
// method has no status parameter to report the failure through.
int32_t RuleBasedCollator::addExpansion(int32_t anOrder, const UnicodeString &expandChars)
{
    if (data->expandTable == NULL)
    {
        data->expandTable = new VectorOfPToExpandTable();
        if (data->expandTable == NULL)
        {
            return 0;
        }
    }

    // If anOrder is valid, we want to add it at the beginning of the list
    int32_t offset = (anOrder == UNMAPPED) ? 0 : 1;

    VectorOfInt *valueList = new VectorOfInt(expandChars.size() + offset);
    if (valueList == NULL)
    {
        // Same failure convention as for the table itself; avoids writing
        // through a null list below.
        return 0;
    }

    if (offset == 1)
    {
        valueList->atPut(0, anOrder);
    }

    int32_t i;
    for (i = 0; i < expandChars.size(); i += 1)
    {
        UChar ch = expandChars[i];
        int32_t mapValue = getCharOrder(ch);

        if (mapValue != UNMAPPED)
        {
            valueList->atPut(i + offset, mapValue);
        }
        else
        {
            // can't find it in the table, will be filled in by commit().
            valueList->atPut(i + offset, CHARINDEX + (int32_t)ch);
        }
    }

    // Add the expanding char list into the expansion table.
    int32_t tableIndex = EXPANDCHARINDEX + data->expandTable->size();
    data->expandTable->atPut(data->expandTable->size(), valueList);

    return tableIndex;
}
// Add a string of characters that contracts into a single ordering.
//
// groupChars  the contracting string, e.g. "ch"
// anOrder     the order (or table index) to associate with the string
// fwd         TRUE for the forward mapping; when TRUE, a second entry for the
//             reversed string is added with fwd == FALSE
// status      in/out error code; nothing is done if it already indicates
//             failure.  Allocation failures set U_MEMORY_ALLOCATION_ERROR.
//
// NOTE(review): several failure paths below delete entryTable and then the
// whole contract table even after entryTable has been stored in it, and the
// mapping table may still refer to the deleted table.  Whether the table owns
// its elements is not visible here -- confirm these paths cannot double-delete.
void
RuleBasedCollator::addContractOrder(const   UnicodeString&  groupChars,
                                            int32_t         anOrder,
                                            bool_t          fwd,
                                            UErrorCode&     status)
{
    if (FAILURE(status))
    {
        return;
    }

    if (data->contractTable == NULL)
    {
        data->contractTable = new VectorOfPToContractTable();

        // A failed allocation must be caught before isBogus() is called on it.
        if (data->contractTable == NULL)
        {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        if (data->contractTable->isBogus())
        {
            delete data->contractTable;
            data->contractTable = NULL;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }

    // See if the initial character of the string already has a contract table.
    // e.g. for "ch", look for 'c'.
    int32_t entry = ucmp32_get(data->mapping, groupChars[0]);
    VectorOfPToContractElement *entryTable = getContractValues(entry - CONTRACTCHARINDEX);

    if (entryTable == NULL)
    {
        // We need to create a new table of contract entries for this base char
        int32_t tableIndex = CONTRACTCHARINDEX + data->contractTable->size();
        EntryPair *pair = NULL;
        UnicodeString substring;

        entryTable = new VectorOfPToContractElement();

        // Deleting a null entryTable is harmless, so the allocation failure
        // shares the cleanup of the bogus case.
        if (entryTable == NULL || entryTable->isBogus())
        {
            delete entryTable;
            delete data->contractTable;
            data->contractTable = NULL;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        data->contractTable->atPut(data->contractTable->size(), entryTable);
        if (data->contractTable->isBogus())
        {
            delete entryTable;
            delete data->contractTable;
            data->contractTable = NULL;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        // Add the initial character's current ordering first. then
        // update its mapping to point to this contract table
        groupChars.extract(0, 1, substring);
        if (substring.isBogus())
        {
            delete entryTable;
            delete data->contractTable;
            data->contractTable = NULL;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        pair = new EntryPair(substring, entry);
        entryTable->atPut(0, pair);
        if (entryTable->isBogus())
        {
            delete entryTable;
            delete data->contractTable;
            data->contractTable = NULL;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        ucmp32_set(data->mapping, groupChars[0], tableIndex);
    }

    // Now add (or replace) this string in the table
    int32_t index = getEntry(entryTable, groupChars, fwd);

    if (index != UNMAPPED)
    {
        EntryPair *pair = (EntryPair *) entryTable->at(index);
        pair->value = anOrder;
    }
    else
    {
        EntryPair *pair = new EntryPair(groupChars, anOrder, fwd);
        entryTable->atPut(entryTable->size(), pair);
    }

    // If this was a forward mapping for a contracting string, also add a
    // reverse mapping for it, so that CollationElementIterator::previous
    // can work right
    if (fwd)
    {
        UnicodeString reverse(groupChars);
        if (reverse.isBogus())
        {
            delete entryTable;
            delete data->contractTable;
            data->contractTable = NULL;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        // fwd is FALSE here, so this recursion goes exactly one level deep.
        addContractOrder(reverse.reverse(), anOrder, FALSE, status);
    }
}
/**
 * Look up groupChars among the forward contracting strings that start with
 * its first character.  Returns the order stored for it, or UNMAPPED when
 * there is no contract table, no table for that first character, or no
 * matching entry.
 */
int32_t RuleBasedCollator::getContractOrder(const UnicodeString &groupChars) const
{
    if (data->contractTable == NULL)
    {
        return UNMAPPED;
    }

    VectorOfPToContractElement *candidates = getContractValues(groupChars[0]);
    if (candidates == NULL)
    {
        return UNMAPPED;
    }

    int32_t slot = getEntry(candidates, groupChars, TRUE);
    if (slot == UNMAPPED)
    {
        return UNMAPPED;
    }

    EntryPair *match = candidates->at(slot);
    return match->value;
}
// Return the collation order of a single character.  When the mapping table
// entry refers to a contracting-character table, the character's own order
// is the value of the first entry of that table.
int32_t RuleBasedCollator::getCharOrder(UChar ch) const
{
    int32_t mapped = ucmp32_get(data->mapping, ch);

    if (mapped < CONTRACTCHARINDEX)
    {
        return mapped;
    }

    VectorOfPToContractElement *contractions = getContractValues(mapped - CONTRACTCHARINDEX);
    EntryPair *base = contractions->at(0);

    return base->value;
}
// Create a hash code for this collation.  Just hash the main rule table --
// that should be good enough for almost any use.
//
// At most the last 64 characters of the rules are mixed in, walking backwards
// from the end.  The result is never 0: a computed 0 is returned as 1.
int32_t
RuleBasedCollator::hashCode() const
{
    // The mixing is done on unsigned values so that the shifts and the
    // addition wrap around instead of overflowing a signed integer.
    uint32_t value = 0;
    uint32_t c;
    int32_t count = getRules().size();
    UTextOffset pos = count - 1;

    if (count > 64)
    {
        count = 64; // only hash upto limit
    }

    int16_t i = 0;

    while (i < count)
    {
        c = data->ruleTable[pos];
        value = ((value << (c & 0x0f)) ^ (c << 8)) + (c ^ value);
        i += 1;
        pos -= 1;
    }

    if (value == 0)
    {
        value = 1;
    }

    return (int32_t)value;
}
// Search list for the entry whose name equals `name` and whose direction flag
// equals `fwd`.  Returns its index, or UNMAPPED when list is NULL or holds no
// such entry.
int32_t
RuleBasedCollator::getEntry(VectorOfPToContractElement* list,
                            const UnicodeString& name,
                            bool_t fwd)
{
    if (list == NULL)
    {
        return RuleBasedCollator::UNMAPPED;
    }

    int32_t slot = 0;
    while (slot < list->size())
    {
        EntryPair *candidate = list->at(slot);

        // empty slots and entries of the other direction never match
        if ((candidate != NULL) && (candidate->fwd == fwd))
        {
            if (candidate->entryName == name)
            {
                return slot;
            }
        }

        slot += 1;
    }

    return RuleBasedCollator::UNMAPPED;
}
// Return the list of contracting entries that begin with ch, found through
// the character's mapping-table value; NULL if there is none.
VectorOfPToContractElement*
RuleBasedCollator::getContractValues(UChar ch) const
{
    int32_t mapped = ucmp32_get(data->mapping, ch);
    int32_t tableSlot = mapped - CONTRACTCHARINDEX;

    return getContractValues(tableSlot);
}
// Return the list of contracting entries stored at position `index` of the
// contract table.  Returns NULL when there is no contract table or the index
// is negative.
VectorOfPToContractElement*
RuleBasedCollator::getContractValues(int32_t index) const
{
    if (data->contractTable == NULL || index < 0)
    {
        return NULL;
    }

    return data->contractTable->at(index);
}
/**
 * Return the maximum length of any expansion sequence that ends
 * with the specified comparison order.
 *
 * @param order a collation order returned by previous or next.
 * @return the length of the longest expansion sequence ending
 *         with the specified order, or 1 if there is none.
 *
 * @see CollationElementIterator#getMaxExpansion
 */
int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
{
    int32_t longest = 1;

    if (data->expandTable == NULL)
    {
        return longest;
    }

    // This is a linear scan of the whole expansion table.  A collator with
    // a large number of expansions could make this slow, but in practice
    // that rarely happens.
    int32_t row = 0;
    while (row < data->expandTable->size())
    {
        VectorOfInt *sequence = data->expandTable->at(row);
        int32_t count = sequence->size();

        row += 1;

        // only sequences longer than the best so far are worth inspecting
        if (count <= longest)
        {
            continue;
        }

        if (sequence->at(count - 1) == order)
        {
            longest = count;
        }
    }

    return longest;
}
/**
 * Get the list of collation orders that an expanding entry stands for.
 * @param order a mapping value referring to the expansion table, i.e.
 *              EXPANDCHARINDEX plus the position of the list in the table
 * @return the value list at that position, or NULL if this collator has
 *         no expansion table
 */
VectorOfInt *RuleBasedCollator::getExpandValueList(int32_t order) const
{
    // The expansion table is only created once the first expansion is added.
    if (data->expandTable == NULL)
    {
        return NULL;
    }

    return data->expandTable->at(order - EXPANDCHARINDEX);
}
// Return the raw value stored for ch in the mapping table, without
// following contracting-table references.
int32_t
RuleBasedCollator::getUnicodeOrder(UChar ch) const
{
    int32_t rawOrder = ucmp32_get(data->mapping, ch);

    return rawOrder;
}
// Read a collator from a binary stream.  The layout, as written by streamOut,
// is: a 16-bit file ID, a one-byte flag telling whether the data object is
// absent, the data object itself (if present), and the file ID again as an
// end marker.  Any problem is reported by setting the stream's error flag.
//
// NOTE(review): collator->data is deleted or overwritten here regardless of
// collator->dataIsOwned -- confirm callers never pass a collator whose data
// is shared with the cache.
void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, FileStream* is)
{
    if (T_FileStream_error(is))
    {
        return;
    }

    // Check that this is the correct file type.  The variable is initialized
    // so that a short read is not compared as uninitialized memory
    // (presumably FILEID is non-zero -- TODO confirm).
    int16_t id = 0;
    T_FileStream_read(is, &id, sizeof(id));
    if (T_FileStream_error(is) || id != collator->FILEID)
    {
        // This isn't the right type of file.  Mark the ios
        // as failing and return.
        T_FileStream_setError(is); // force the stream to set its error flag
        return;
    }

    // Stream in large objects
    char isNull = 0;
    T_FileStream_read(is, &isNull, sizeof(isNull));
    if (T_FileStream_error(is))
    {
        return;
    }

    if (isNull)
    {
        delete collator->data;
        collator->data = NULL;
    }
    else
    {
        if (collator->data == NULL)
        {
            collator->data = new TableCollationData;
            if (collator->data == NULL)
            {
                T_FileStream_setError(is); // allocation failed; report through the stream
                return;
            }
        }

        collator->data->streamIn(is);
        if (collator->data->isBogus()) {
            T_FileStream_setError(is); // force the stream to set its error flag
            return;
        }
    }

    // Verify that the end marker is present
    id = 0;
    T_FileStream_read(is, &id, sizeof(id));
    if (T_FileStream_error(is) || id != collator->FILEID)
    {
        // This isn't the right type of file.  Mark the ios
        // as failing and return.
        T_FileStream_setError(is); // force the stream to set its error flag
        return;
    }

    // Reset other data members
    collator->isOverIgnore = FALSE;
    collator->lastChar = 0;
    delete collator->mPattern;
    collator->mPattern = 0;
    collator->key.remove();
    collator->dataIsOwned = TRUE;
}
void RuleBasedCollatorStreamer::streamOut(const RuleBasedCollator* collator, FileStream* os)
{
if (!T_FileStream_error(os))
{
// We use a 16-bit ID code to identify this file.
int16_t id = collator->FILEID;
T_FileStream_write(os, &id, sizeof(id));
// Stream out the data
char isNull;
isNull = (collator->data == 0);
T_FileStream_write(os, &isNull, sizeof(isNull));
if (!isNull)
{
collator->data->streamOut(os);
}
// Write out the ID to indicate the end
T_FileStream_write(os, &id, sizeof(id));
}
}
// Stream this collator out to the named file in binary form.
// Returns TRUE if the file could be opened and the stream reports no error
// after writing, FALSE otherwise.
bool_t RuleBasedCollator::writeToFile(const char* fileName) const
{
    FileStream* ofs = T_FileStream_open(fileName, "wb");

    // Without a stream there is nothing to write, query, or close.
    if (ofs == 0)
    {
        return FALSE;
    }

    RuleBasedCollatorStreamer::streamOut(this, ofs);

#ifdef COLLDEBUG
    fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs),
            (!T_FileStream_error(ofs) ? ", OK" : ", FAIL"));
#endif

    bool_t succeeded = T_FileStream_error(ofs) == 0;

    T_FileStream_close(ofs);

    return succeeded;
}
void RuleBasedCollator::addToCache(const UnicodeString& key)
{
    // The collator object itself is never cached.  What goes into the
    // TableCollationData cache, under the given key, is this collator's data
    // object; from then on the collator treats that data as not its own.
    dataIsOwned = FALSE;
    TableCollationData::addToCache(key, data);
}
// Attempt to (re)initialize this RuleBasedCollator from cached
// TableCollationData stored under `key`.
//
// key     the cache key to look up
// status  in/out error code; nothing is done if it already indicates failure.
//         Set to U_MISSING_RESOURCE_ERROR if no data is cached under key.
void
RuleBasedCollator::constructFromCache(const UnicodeString& key,
                                      UErrorCode& status)
{
    if (FAILURE(status)) return;

    // Drop the current data only if this collator owns it.
    if (dataIsOwned)
    {
        delete data;
        data = NULL;
    }

    isOverIgnore = FALSE;
    lastChar = 0;

    // Release any merged pattern left over from an earlier build before
    // forgetting it, as streamIn does.
    delete mPattern;
    mPattern = 0;

    setStrength(Collator::TERTIARY);

    // Data that comes from the cache is not owned by this collator.
    dataIsOwned = FALSE;
    data = TableCollationData::findInCache(key);
    if (data == NULL)
    {
        status = U_MISSING_RESOURCE_ERROR;
    }
}
char*
RuleBasedCollator::createPathName( const UnicodeString& prefix,
const UnicodeString& name,
const UnicodeString& suffix)
{
// Concatenate three elements to form a file name, and return it.
UnicodeString workingName(prefix);
int32_t size;
char* returnVal;
workingName += name;
workingName += suffix;
size = workingName.size();
returnVal = new char[size + 1];
workingName.extract(0, size, returnVal);
returnVal[size] = 0;
return returnVal;
}
// Remove the final element from a locale string, in place.
// For instance, "de_CH" becomes "de", and "de" becomes "".
// "" remains "".
void
RuleBasedCollator::chopLocale(UnicodeString& localeName)
{
    int32_t length = localeName.size();
    int32_t cut = length - 1;

    // Scan backwards for the last underscore (0x005F); index 0 is never
    // tested, so a string without a later underscore is emptied completely.
    while (cut > 0 && localeName[cut] != 0x005F)
    {
        cut -= 1;
    }

    // an empty string leaves cut at -1
    if (cut < 0)
    {
        cut = 0;
    }

    localeName.remove(cut, length - cut);
}
//eof