1999-08-16 21:50:52 +00:00
|
|
|
|
/*
|
|
|
|
|
*******************************************************************************
|
1999-11-22 20:25:35 +00:00
|
|
|
|
* Copyright (C) 1996-1999, International Business Machines Corporation and *
|
|
|
|
|
* others. All Rights Reserved. *
|
1999-08-16 21:50:52 +00:00
|
|
|
|
*******************************************************************************
|
|
|
|
|
*
|
|
|
|
|
* File tblcoll.cpp
|
|
|
|
|
*
|
|
|
|
|
* Created by: Helena Shih
|
|
|
|
|
*
|
|
|
|
|
* Modification History:
|
|
|
|
|
*
|
|
|
|
|
* Date Name Description
|
|
|
|
|
* 2/5/97 aliu Added streamIn and streamOut methods. Added
|
|
|
|
|
* constructor which reads RuleBasedCollator object from
|
|
|
|
|
* a binary file. Added writeToFile method which streams
|
|
|
|
|
* RuleBasedCollator out to a binary file. The streamIn
|
|
|
|
|
* and streamOut methods use istream and ostream objects
|
|
|
|
|
* in binary mode.
|
|
|
|
|
* 2/11/97 aliu Moved declarations out of for loop initializer.
|
|
|
|
|
* Added Mac compatibility #ifdef for ios::nocreate.
|
|
|
|
|
* 2/12/97 aliu Modified to use TableCollationData sub-object to
|
|
|
|
|
* hold invariant data.
|
|
|
|
|
* 2/13/97 aliu Moved several methods into this class from Collation.
|
|
|
|
|
* Added a private RuleBasedCollator(Locale&) constructor,
|
|
|
|
|
* to be used by Collator::getInstance(). General
|
|
|
|
|
* clean up. Made use of UErrorCode variables consistent.
|
|
|
|
|
* 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
|
|
|
|
|
* constructor and getDynamicClassID.
|
|
|
|
|
* 3/5/97 aliu Changed compaction cycle to improve performance. We
|
|
|
|
|
* use the maximum allowable value which is kBlockCount.
|
|
|
|
|
* Modified getRules() to load rules dynamically. Changed
|
|
|
|
|
*                           constructFromFile() call to accommodate this (added
|
|
|
|
|
* parameter to specify whether binary loading is to
|
|
|
|
|
* take place).
|
|
|
|
|
* 05/06/97 helena Added memory allocation error check.
|
|
|
|
|
* 6/20/97 helena Java class name change.
|
|
|
|
|
* 6/23/97 helena Adding comments to make code more readable.
|
|
|
|
|
* 09/03/97 helena Added createCollationKeyValues().
|
|
|
|
|
* 06/26/98 erm Changes for CollationKeys using byte arrays.
|
|
|
|
|
* 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java
|
|
|
|
|
* 04/23/99 stephen Removed EDecompositionMode, merged with
|
|
|
|
|
* Normalizer::EMode
|
|
|
|
|
* 06/14/99 stephen Removed kResourceBundleSuffix
|
|
|
|
|
* 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
|
1999-11-23 22:49:29 +00:00
|
|
|
|
* files are no longer used.
|
|
|
|
|
* 11/02/99 helena Collator performance enhancements. Special case
|
|
|
|
|
* for NO_OP situations.
|
|
|
|
|
* 11/17/99 srl More performance enhancements. Inlined some internal functions.
|
1999-12-16 01:41:19 +00:00
|
|
|
|
* 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
|
|
|
|
|
* to implementation file.
|
1999-08-16 21:50:52 +00:00
|
|
|
|
*******************************************************************************
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "ucmp32.h"
|
|
|
|
|
#include "tcoldata.h"
|
|
|
|
|
|
1999-12-28 23:57:50 +00:00
|
|
|
|
#include "unicode/tblcoll.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
1999-12-28 23:57:50 +00:00
|
|
|
|
#include "unicode/coleitr.h"
|
|
|
|
|
#include "unicode/locid.h"
|
|
|
|
|
#include "unicode/unicode.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#include "tables.h"
|
1999-12-28 23:57:50 +00:00
|
|
|
|
#include "unicode/normlzr.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#include "mergecol.h"
|
1999-12-28 23:57:50 +00:00
|
|
|
|
#include "unicode/resbund.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#include "filestrm.h"
|
2000-05-22 19:49:10 +00:00
|
|
|
|
#include "umemstrm.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
#ifdef _DEBUG
|
|
|
|
|
#include "unistrm.h"
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#include "compitr.h"
|
|
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
2000-01-06 20:02:04 +00:00
|
|
|
|
#include "unicode/ustring.h"
|
1999-11-23 22:49:29 +00:00
|
|
|
|
|
2000-06-03 04:37:12 +00:00
|
|
|
|
#include "cmemory.h"
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
class RuleBasedCollatorStreamer
|
|
|
|
|
{
|
|
|
|
|
public:
|
2000-05-22 19:49:10 +00:00
|
|
|
|
static void streamIn(RuleBasedCollator* collator, FileStream* is);
|
|
|
|
|
static void streamOut(const RuleBasedCollator* collator, FileStream* os);
|
2000-11-09 21:58:19 +00:00
|
|
|
|
static void streamIn(RuleBasedCollator* collator, UMemoryStream* is, UErrorCode& status);
|
2000-05-22 19:49:10 +00:00
|
|
|
|
static void streamOut(const RuleBasedCollator* collator, UMemoryStream* os);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
//===========================================================================================
|
|
|
|
|
// The following diagram shows the data structure of the RuleBasedCollator object.
|
|
|
|
|
// Suppose we have the rule, where 'o-umlaut' is the unicode char 0x00F6.
|
|
|
|
|
// "a, A < b, B < c, C, ch, cH, Ch, CH < d, D ... < o, O; 'o-umlaut'/E, 'O-umlaut'/E ...".
|
|
|
|
|
// The rule sorts the 'ch' ligatures and 'c' with only a tertiary difference, and
|
|
|
|
|
// sorts 'o-umlaut' as if it's always expanded with 'e'.
|
|
|
|
|
//
|
|
|
|
|
// mapping table contracting list expanding list
|
|
|
|
|
// (contains all unicode char
|
|
|
|
|
// entries) ___ _____________ _________________________
|
|
|
|
|
// ________ |==>|_*_|-->|'c' |v('c') | |==>|v('o')|v('umlaut')|v('e')|
|
|
|
|
|
// |_\u0001_|--> v('\u0001') | |_:_| |-------------| | |-------------------------|
|
|
|
|
|
// |_\u0002_|--> v('\u0002') | |_:_| |'ch' |v('ch')| | | : |
|
|
|
|
|
// |____:___| | |_:_| |-------------| | |-------------------------|
|
|
|
|
|
// |____:___| | |'cH' |v('cH')| | | : |
|
|
|
|
|
// |__'a'___|--> v('a') | |-------------| | |-------------------------|
|
|
|
|
|
// |__'b'___|--> v('b') | |'Ch' |v('Ch')| | | : |
|
|
|
|
|
// |____:___| | |-------------| | |-------------------------|
|
|
|
|
|
// |____:___| | |'CH' |v('CH')| | | : |
|
|
|
|
|
// |___'c'__|------------------- ------------- | |-------------------------|
|
|
|
|
|
// |____:___| | | : |
|
|
|
|
|
// |o-umlaut|------------------------------------------------ |_________________________|
|
|
|
|
|
// |____:___|
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// Noted by Helena Shih on 6/23/97 with pending design changes (slimming collation).
|
|
|
|
|
//============================================================================================
|
|
|
|
|
|
|
|
|
|
const int32_t RuleBasedCollator::CHARINDEX = 0x70000000; // need look up in .commit()
|
|
|
|
|
const int32_t RuleBasedCollator::EXPANDCHARINDEX = 0x7E000000; // Expand index follows
|
|
|
|
|
const int32_t RuleBasedCollator::CONTRACTCHARINDEX = 0x7F000000; // contract indexes follows
|
|
|
|
|
const int32_t RuleBasedCollator::UNMAPPED = 0xFFFFFFFF; // unmapped character values
|
|
|
|
|
const int32_t RuleBasedCollator::PRIMARYORDERINCREMENT = 0x00010000; // primary strength increment
|
|
|
|
|
const int32_t RuleBasedCollator::SECONDARYORDERINCREMENT = 0x00000100; // secondary strength increment
|
|
|
|
|
const int32_t RuleBasedCollator::TERTIARYORDERINCREMENT = 0x00000001; // tertiary strength increment
|
|
|
|
|
const int32_t RuleBasedCollator::MAXIGNORABLE = 0x00010000; // maximum ignorable char order value
|
|
|
|
|
const int32_t RuleBasedCollator::PRIMARYORDERMASK = 0xffff0000; // mask off anything but primary order
|
|
|
|
|
const int32_t RuleBasedCollator::SECONDARYORDERMASK = 0x0000ff00; // mask off anything but secondary order
|
|
|
|
|
const int32_t RuleBasedCollator::TERTIARYORDERMASK = 0x000000ff; // mask off anything but tertiary order
|
|
|
|
|
const int32_t RuleBasedCollator::SECONDARYRESETMASK = 0x0000ffff; // mask off secondary and tertiary order
|
|
|
|
|
const int32_t RuleBasedCollator::IGNORABLEMASK = 0x0000ffff; // mask off ignorable char order
|
|
|
|
|
const int32_t RuleBasedCollator::PRIMARYDIFFERENCEONLY = 0xffff0000; // use only the primary difference
|
|
|
|
|
const int32_t RuleBasedCollator::SECONDARYDIFFERENCEONLY = 0xffffff00; // use only the primary and secondary difference
|
|
|
|
|
const int32_t RuleBasedCollator::PRIMARYORDERSHIFT = 16; // primary order shift
|
|
|
|
|
const int32_t RuleBasedCollator::SECONDARYORDERSHIFT = 8; // secondary order shift
|
|
|
|
|
const int32_t RuleBasedCollator::SORTKEYOFFSET = 1; // minimum sort key offset
|
|
|
|
|
const int32_t RuleBasedCollator::CONTRACTCHAROVERFLOW = 0x7FFFFFFF; // Indicates the char is a contract char
|
|
|
|
|
|
|
|
|
|
const int16_t RuleBasedCollator::FILEID = 0x5443; // unique file id for parity check
|
|
|
|
|
const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension
|
|
|
|
|
char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id
|
|
|
|
|
|
1999-12-16 01:41:19 +00:00
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// NormalizerIterator
|
|
|
|
|
//
|
|
|
|
|
// This class is essentially a duplicate of CollationElementIterator,
|
|
|
|
|
// stripped down for speed. It is declared here so we can incorporate
|
|
|
|
|
// internal classes as subobjects, as well as just to hide it from the
|
|
|
|
|
// public interface.
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
/* Internal class for quick iteration over the text.
|
|
|
|
|
100% pure inline code
|
|
|
|
|
*/
|
|
|
|
|
class NormalizerIterator {
|
|
|
|
|
public:
|
|
|
|
|
Normalizer *cursor;
|
|
|
|
|
VectorOfInt *bufferAlias;
|
|
|
|
|
VectorOfInt *reorderBuffer;
|
|
|
|
|
VectorOfInt ownBuffer;
|
|
|
|
|
UChar* text;
|
|
|
|
|
int32_t expIndex;
|
|
|
|
|
int32_t textLen;
|
|
|
|
|
UTextOffset currentOffset;
|
|
|
|
|
|
|
|
|
|
NormalizerIterator(void);
|
|
|
|
|
NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode);
|
|
|
|
|
~NormalizerIterator(void);
|
|
|
|
|
void setText(const UChar* source, int32_t length, UErrorCode& status);
|
|
|
|
|
void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status);
|
|
|
|
|
|
|
|
|
|
UChar current(void) const;
|
|
|
|
|
UChar next(void);
|
|
|
|
|
void reset(void);
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
NormalizerIterator::NormalizerIterator() :
|
|
|
|
|
cursor(0),
|
|
|
|
|
bufferAlias(0),
|
|
|
|
|
reorderBuffer(0),
|
|
|
|
|
ownBuffer(2),
|
|
|
|
|
text(0),
|
2000-08-14 23:23:20 +00:00
|
|
|
|
expIndex(0),
|
1999-12-16 01:41:19 +00:00
|
|
|
|
textLen(0),
|
2000-08-14 23:23:20 +00:00
|
|
|
|
currentOffset(0)
|
1999-12-16 01:41:19 +00:00
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) :
|
|
|
|
|
cursor(0),
|
|
|
|
|
bufferAlias(0),
|
|
|
|
|
reorderBuffer(0),
|
|
|
|
|
ownBuffer(2),
|
|
|
|
|
text(0),
|
2000-08-14 23:23:20 +00:00
|
|
|
|
expIndex(0),
|
1999-12-16 01:41:19 +00:00
|
|
|
|
textLen(0),
|
2000-08-14 23:23:20 +00:00
|
|
|
|
currentOffset(0)
|
1999-12-16 01:41:19 +00:00
|
|
|
|
{
|
|
|
|
|
if (mode == Normalizer::NO_OP) {
|
|
|
|
|
text = (UChar*)source;
|
|
|
|
|
textLen = length;
|
|
|
|
|
currentOffset = 0;
|
|
|
|
|
} else {
|
|
|
|
|
cursor = new Normalizer(source, length, mode);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
NormalizerIterator::~NormalizerIterator()
|
|
|
|
|
{
|
|
|
|
|
if (cursor != 0) {
|
|
|
|
|
delete cursor;
|
|
|
|
|
cursor = 0;
|
|
|
|
|
}
|
|
|
|
|
if (reorderBuffer != 0) {
|
|
|
|
|
delete reorderBuffer;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
void
|
|
|
|
|
NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status)
|
|
|
|
|
{
|
|
|
|
|
if (cursor == 0) {
|
|
|
|
|
text = (UChar*)source;
|
|
|
|
|
textLen = length;
|
|
|
|
|
currentOffset = 0;
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
text = 0;
|
|
|
|
|
cursor->setText(source, length, status);
|
|
|
|
|
}
|
|
|
|
|
bufferAlias = 0;
|
|
|
|
|
currentOffset = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* You can only set the mode after the comparison of two strings is completed.
|
|
|
|
|
Setting the mode in the middle of a comparison is not allowed.
|
|
|
|
|
*/
|
|
|
|
|
inline
|
|
|
|
|
void
|
|
|
|
|
|
|
|
|
|
NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status)
|
|
|
|
|
{
|
|
|
|
|
if(mode != Normalizer::NO_OP)
|
|
|
|
|
{
|
|
|
|
|
/* DO have a mode - will need a normalizer object */
|
|
|
|
|
if(cursor != NULL)
|
|
|
|
|
{
|
|
|
|
|
/* Just modify the existing cursor */
|
|
|
|
|
cursor->setMode(mode);
|
|
|
|
|
cursor->setText(source, length, status);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
cursor = new Normalizer(source, length, mode);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* RESET the old data */
|
|
|
|
|
text = 0;
|
|
|
|
|
textLen = 0;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
/* NO_OP mode.. */
|
|
|
|
|
if(cursor != NULL)
|
|
|
|
|
{ /* get rid of the old cursor */
|
|
|
|
|
delete cursor;
|
|
|
|
|
cursor = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
text = (UChar*)source;
|
|
|
|
|
textLen = length;
|
|
|
|
|
}
|
|
|
|
|
currentOffset = 0; /* always */
|
|
|
|
|
|
|
|
|
|
bufferAlias = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
UChar
|
|
|
|
|
NormalizerIterator::current(void) const
|
|
|
|
|
{
|
|
|
|
|
if (text != 0) {
|
|
|
|
|
if(currentOffset >= textLen)
|
|
|
|
|
{
|
|
|
|
|
return Normalizer::DONE;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
return text[currentOffset];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2000-05-24 21:01:07 +00:00
|
|
|
|
return (UChar)cursor->current();
|
1999-12-16 01:41:19 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
UChar
|
|
|
|
|
NormalizerIterator::next(void)
|
|
|
|
|
{
|
|
|
|
|
if (text != 0) {
|
2000-08-11 01:27:17 +00:00
|
|
|
|
return (UChar)((currentOffset < textLen) ? text[++currentOffset] : Normalizer::DONE);
|
1999-12-16 01:41:19 +00:00
|
|
|
|
}
|
2000-05-24 21:01:07 +00:00
|
|
|
|
return (UChar)cursor->next();
|
1999-12-16 01:41:19 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline
|
|
|
|
|
void
|
|
|
|
|
NormalizerIterator::reset(void)
|
|
|
|
|
{
|
|
|
|
|
currentOffset = 0;
|
|
|
|
|
if(cursor)
|
|
|
|
|
{
|
|
|
|
|
cursor->reset();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
1999-11-23 22:49:29 +00:00
|
|
|
|
//================ Some inline definitions of implementation functions........ ========
|
2000-01-13 21:36:39 +00:00
|
|
|
|
/**
|
|
|
|
|
* A clone of CollationElementIterator::makeReorderedBuffer, trimmed down
|
|
|
|
|
* to only handle forward.
|
|
|
|
|
*/
|
|
|
|
|
inline VectorOfInt*
|
|
|
|
|
RuleBasedCollator::makeReorderedBuffer(NormalizerIterator* cursor,
|
|
|
|
|
UChar colFirst,
|
|
|
|
|
int32_t lastValue,
|
|
|
|
|
VectorOfInt* lastExpansion) const {
|
|
|
|
|
VectorOfInt* result;
|
|
|
|
|
|
|
|
|
|
int32_t firstValue = ucmp32_get(data->mapping, colFirst);
|
|
|
|
|
if (firstValue >= CONTRACTCHARINDEX) {
|
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
|
firstValue = nextContractChar(cursor, colFirst, status);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
VectorOfInt* firstExpansion = NULL;
|
|
|
|
|
if (firstValue >= EXPANDCHARINDEX) {
|
|
|
|
|
firstExpansion = getExpandValueList(firstValue);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (firstExpansion == NULL && lastExpansion == NULL) {
|
|
|
|
|
cursor->ownBuffer.at(0) = firstValue;
|
|
|
|
|
cursor->ownBuffer.at(1) = lastValue;
|
|
|
|
|
result = &cursor->ownBuffer;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
int32_t firstLength = firstExpansion==NULL? 1 : firstExpansion->size();
|
|
|
|
|
int32_t lastLength = lastExpansion==NULL? 1 : lastExpansion->size();
|
|
|
|
|
if (cursor->reorderBuffer == NULL) {
|
|
|
|
|
cursor->reorderBuffer = new VectorOfInt(firstLength+lastLength);
|
|
|
|
|
}
|
|
|
|
|
// reorderdBuffer gets reused for the life of this object.
|
|
|
|
|
// Since its internal buffer only grows, there is a danger
|
|
|
|
|
// that it will get really, really big, and never shrink. If
|
|
|
|
|
// this is actually happening, insert code here to check for
|
|
|
|
|
// the condition. Something along the lines of:
|
|
|
|
|
//! else if (reorderBuffer->size() >= 256 &&
|
|
|
|
|
//! (firstLength+lastLength) < 16) {
|
|
|
|
|
//! delete reorderBuffer;
|
|
|
|
|
//! reorderBuffer = new VectorOfInt(firstLength+lastLength);
|
|
|
|
|
//! }
|
|
|
|
|
// The specific numeric values need to be determined
|
|
|
|
|
// empirically. [aliu]
|
|
|
|
|
result = cursor->reorderBuffer;
|
|
|
|
|
|
|
|
|
|
if (firstExpansion == NULL) {
|
|
|
|
|
result->atPut(0, firstValue);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// System.arraycopy(firstExpansion, 0, result, 0, firstLength);
|
|
|
|
|
*result = *firstExpansion;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (lastExpansion == NULL) {
|
|
|
|
|
result->atPut(firstLength, lastValue);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// System.arraycopy(lastExpansion, 0, result, firstLength, lastLength);
|
|
|
|
|
for (int32_t i=0; i<lastLength; ++i) {
|
|
|
|
|
result->atPut(firstLength + i, lastExpansion->at(i));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
result->setSize(firstLength+lastLength);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1999-11-23 22:49:29 +00:00
|
|
|
|
|
|
|
|
|
// Masks a collation element down to the parts that matter at the collator's
// current strength: primary only, primary+secondary, or the whole value.
inline int32_t
RuleBasedCollator::strengthOrder(int32_t value) const
{
    switch (getStrength()) {
    case PRIMARY:
        return (value & PRIMARYDIFFERENCEONLY);
    case SECONDARY:
        return (value & SECONDARYDIFFERENCEONLY);
    default:
        return value;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Returns the next collation element from 'cursor', masked to the current
// strength, or CollationElementIterator::NULLORDER when the text is
// exhausted or 'status' already indicates failure.
//
// Note: 'status' is taken by value, so errors raised while resolving a
// contraction are not visible to the caller.
inline int32_t
RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor,
                                    UErrorCode status) const
{
    if (U_FAILURE(status)) {
        return CollationElementIterator::NULLORDER;
    }

    // bufferAlias needs a bit of an explanation.  When an expanding
    // character is hit, the orderings for all characters of the expansion are
    // fetched at once (see the end of this method).  The first one is
    // returned immediately and an alias to the list is kept so the remaining
    // orderings can be handed out on subsequent calls.
    if (cursor->bufferAlias != NULL) {
        if (cursor->expIndex < cursor->bufferAlias->size()) {
            //_L((stderr, "next from [%08X] from bufferAlias\n", this));
            return strengthOrder(cursor->bufferAlias->at(cursor->expIndex++));
        }
        // Expansion exhausted; fall through to read the next character.
        cursor->bufferAlias = NULL;
    }

    // Fetch the current character, then step past it.
    UChar ch = cursor->current();
    cursor->next();

    //_L((stderr, "Next from [%08X] = [%04X], [%c]\n", cursor, (int)ch & 0xFFFF, (char)(ch & 0xFF)));

    if (ch == Normalizer::DONE) {
        return CollationElementIterator::NULLORDER;
    }

    // Ask the collator for this character's ordering.
    int32_t value = ucmp32_get(data->mapping, ch);

    if (value != UNMAPPED) {
        if (value >= CONTRACTCHARINDEX) {
            value = nextContractChar(cursor, ch, status);
        }

        if (value >= EXPANDCHARINDEX) {
            cursor->bufferAlias = getExpandValueList(value);
        }

        // A Thai pre-vowel followed by a Thai base consonant is reordered:
        // the consonant is consumed and its orderings are placed first.
        if (CollationElementIterator::isThaiPreVowel(ch)) {
            UChar consonant = cursor->current();
            if (CollationElementIterator::isThaiBaseConsonant(consonant)) {
                cursor->next();
                cursor->bufferAlias = makeReorderedBuffer(cursor, consonant, value,
                                                          cursor->bufferAlias);
            }
        }
    } else {
        // Return an "unmapped" flag now and queue the character itself so it
        // is returned the next time this method is called.
        if (ch == 0x0000) return ch; // \u0000 is not valid in C++'s UnicodeString
        cursor->ownBuffer.at(0) = CollationElementIterator::UNMAPPEDCHARVALUE;
        cursor->ownBuffer.at(1) = ch << 16;
        cursor->bufferAlias = &cursor->ownBuffer;
    }

    // If a buffer was set up, return its first entry now and remember where
    // to resume.
    if (cursor->bufferAlias != NULL) {
        cursor->expIndex = 1;
        value = cursor->bufferAlias->at(0);
    }

    return strengthOrder(value);
}
|
|
|
|
|
|
|
|
|
|
// ==================== End inlines ============================================
|
|
|
|
|
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
//===============================================================================
|
|
|
|
|
|
|
|
|
|
// Default constructor: creates a collator with no collation data attached.
RuleBasedCollator::RuleBasedCollator()
    : Collator(),
      isOverIgnore(FALSE),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(0)
{
    // All state is established by the initializer list.
}
|
|
|
|
|
|
|
|
|
|
// Copy constructor.  The collation data is shared with 'that' rather than
// duplicated, so this object is marked as not owning it.  The pattern and
// the two cursors are not copied.
RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
    : Collator(that),
      isOverIgnore(that.isOverIgnore),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(that.data)           // alias, not a copy
{
    // All state is established by the initializer list.
}
|
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
|
// Two collators compare equal when they are the same object, or when they
// have the same dynamic class, equal Collator base state, the same
// isOverIgnore flag and share the very same collation data object.
UBool
RuleBasedCollator::operator==(const Collator& that) const
{
    if (this == &that) {
        return TRUE;
    }

    // Different class, or differing base-class state, can never be equal.
    if (this->getDynamicClassID() != that.getDynamicClassID()
        || !Collator::operator==(that)) {
        return FALSE;
    }

    // Safe: the dynamic class ID check above established the type.
    const RuleBasedCollator& other = (const RuleBasedCollator&)that;

    if (isOverIgnore == other.isOverIgnore && data == other.data) {
        return TRUE;
    }
    return FALSE;
}
|
|
|
|
|
|
|
|
|
|
// Assignment.  Releases any data this object owns, drops its pattern, and
// then aliases the data of 'that' (without taking ownership).
RuleBasedCollator&
RuleBasedCollator::operator=(const RuleBasedCollator& that)
{
    if (this == &that) {
        return *this;
    }

    Collator::operator=(that);
    isOverIgnore = that.isOverIgnore;

    // Only free the old data if it belonged to us.
    if (dataIsOwned) {
        delete data;
    }

    delete mPattern;
    mPattern = 0;

    // Share the other collator's data.
    dataIsOwned = FALSE;
    data = that.data;

    return *this;
}
|
|
|
|
|
|
|
|
|
|
// Builds a collator from a rule string using the default Collator settings.
// Does nothing if 'status' already indicates failure.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     UErrorCode& status)
    : Collator(),
      isOverIgnore(FALSE),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(0)
{
    if (U_SUCCESS(status)) {
        constructFromRules(rules, status);
    }
}
|
|
|
|
|
|
|
|
|
|
// Builds a collator from a rule string with the requested strength and no
// decomposition (Normalizer::NO_OP).  Does nothing if 'status' already
// indicates failure.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     ECollationStrength collationStrength,
                                     UErrorCode& status)
    : Collator(collationStrength, Normalizer::NO_OP),
      isOverIgnore(FALSE),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(0)
{
    if (U_FAILURE(status))
    {
        return;
    }

    constructFromRules(rules, status);

    // constructFromRules() resets the strength to Collator::TERTIARY;
    // re-apply the strength the caller asked for.
    setStrength(collationStrength);
}
|
|
|
|
|
|
|
|
|
|
// Builds a collator from a rule string with TERTIARY strength and the
// requested decomposition mode.  Does nothing if 'status' already indicates
// failure.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     Normalizer::EMode decompositionMode,
                                     UErrorCode& status)
    : Collator(TERTIARY, decompositionMode),
      isOverIgnore(FALSE),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(0)
{
    if (U_SUCCESS(status)) {
        constructFromRules(rules, status);
    }
}
|
|
|
|
|
|
|
|
|
|
// Builds a collator from a rule string with the requested strength and
// decomposition mode.  Does nothing if 'status' already indicates failure.
RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
                                     ECollationStrength collationStrength,
                                     Normalizer::EMode decompositionMode,
                                     UErrorCode& status)
    : Collator(collationStrength, decompositionMode),
      isOverIgnore(FALSE),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(0)
{
    if (U_FAILURE(status))
    {
        return;
    }

    constructFromRules(rules, status);

    // constructFromRules() resets the strength to Collator::TERTIARY;
    // re-apply the strength the caller asked for.
    setStrength(collationStrength);
}
|
|
|
|
|
|
|
|
|
|
void RuleBasedCollator::constructFromRules(const UnicodeString& rules,
|
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
|
|
|
|
// Construct this collator's ruleset from its string representation
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (rules.isBogus())
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (dataIsOwned)
|
|
|
|
|
{
|
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
isOverIgnore = FALSE;
|
|
|
|
|
setStrength(Collator::TERTIARY);
|
|
|
|
|
|
|
|
|
|
data = new TableCollationData;
|
|
|
|
|
if (data->isBogus())
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// We constructed the data using the build method, so we own it.
|
|
|
|
|
dataIsOwned = TRUE;
|
|
|
|
|
|
|
|
|
|
// Now that we've got all the buffers allocated, do the actual work
|
|
|
|
|
mPattern = 0;
|
|
|
|
|
build(rules, status);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::constructFromFile(const char* fileName,
|
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
|
|
|
|
// This method tries to read in a flattened RuleBasedCollator that
|
|
|
|
|
// has been previously streamed out using the streamOut() method.
|
|
|
|
|
// The 'fileName' parameter should contain a full pathname valid on
|
|
|
|
|
// the local environment.
|
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (dataIsOwned)
|
|
|
|
|
{
|
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
mPattern = 0;
|
|
|
|
|
isOverIgnore = FALSE;
|
|
|
|
|
setStrength(Collator::TERTIARY); // This is the default strength
|
|
|
|
|
|
|
|
|
|
FileStream* ifs = T_FileStream_open(fileName, "rb");
|
|
|
|
|
if (ifs == 0) {
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_FILE_ACCESS_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The streamIn function does the actual work here...
|
|
|
|
|
RuleBasedCollatorStreamer::streamIn(this, ifs);
|
|
|
|
|
|
|
|
|
|
if (!T_FileStream_error(ifs))
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
else if (data && data->isBogus())
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MISSING_RESOURCE_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifdef COLLDEBUG
|
1999-12-08 23:31:17 +00:00
|
|
|
|
fprintf(stderr, "binary read %s size %d, %s\n", fileName, T_FileStream_size(ifs), u_errorName(status));
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// We constructed the data when streaming it in, so we own it
|
|
|
|
|
dataIsOwned = TRUE;
|
|
|
|
|
|
|
|
|
|
T_FileStream_close(ifs);
|
|
|
|
|
}
|
|
|
|
|
|
2000-05-24 19:58:40 +00:00
|
|
|
|
const char *
|
2000-05-24 22:03:54 +00:00
|
|
|
|
RuleBasedCollator::constructFromBundle(const Locale & name,
|
2000-05-22 19:49:10 +00:00
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
2000-05-24 19:58:40 +00:00
|
|
|
|
// This method tries to locate binary collation data which has been
|
|
|
|
|
// previously streamed to a binary object "%%Collation" in a
|
|
|
|
|
// resource bundle. If the data is found, it is cached.
|
|
|
|
|
// cache is checked before actually streaming in data
|
|
|
|
|
// resource bundle fallback mechanism is used.
|
2000-05-22 19:49:10 +00:00
|
|
|
|
|
|
|
|
|
if (U_FAILURE(status))
|
|
|
|
|
{
|
2000-05-24 19:58:40 +00:00
|
|
|
|
return 0;
|
2000-05-22 19:49:10 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (dataIsOwned)
|
|
|
|
|
{
|
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
2000-05-24 19:58:40 +00:00
|
|
|
|
const char* realName = 0;
|
2000-05-22 19:49:10 +00:00
|
|
|
|
|
|
|
|
|
mPattern = 0;
|
|
|
|
|
isOverIgnore = FALSE;
|
|
|
|
|
setStrength(Collator::TERTIARY); // This is the default strength
|
|
|
|
|
|
2000-06-30 20:31:39 +00:00
|
|
|
|
ResourceBundle rb((char *)0, name, status);
|
2000-05-24 19:58:40 +00:00
|
|
|
|
if(U_SUCCESS(status)) {
|
|
|
|
|
ResourceBundle binary = rb.get("%%Collation", status); //This is the bundle that actually contains the collation data
|
|
|
|
|
realName = binary.getName();
|
|
|
|
|
if(U_SUCCESS(status)) {
|
2000-08-11 01:27:17 +00:00
|
|
|
|
UErrorCode intStatus = U_ZERO_ERROR;
|
|
|
|
|
constructFromCache(realName, intStatus); // check whether we already have this data in cache
|
|
|
|
|
if(U_SUCCESS(intStatus)) {
|
|
|
|
|
return realName;
|
|
|
|
|
}
|
|
|
|
|
int32_t inDataLen = 0;
|
|
|
|
|
const uint8_t *inData = binary.getBinary(inDataLen, status); //This got us the real binary data
|
|
|
|
|
|
|
|
|
|
UMemoryStream *ifs = uprv_mstrm_openBuffer(inData, inDataLen);
|
|
|
|
|
|
|
|
|
|
if (ifs == 0) {
|
|
|
|
|
status = U_FILE_ACCESS_ERROR;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The streamIn function does the actual work here...
|
2000-11-09 21:58:19 +00:00
|
|
|
|
RuleBasedCollatorStreamer::streamIn(this, ifs, status);
|
2000-08-11 01:27:17 +00:00
|
|
|
|
|
|
|
|
|
if (!uprv_mstrm_error(ifs)) {
|
|
|
|
|
}
|
|
|
|
|
else if (data && data->isBogus()) {
|
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
} else {
|
|
|
|
|
status = U_MISSING_RESOURCE_ERROR;
|
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// We constructed the data when streaming it in, so we own it
|
|
|
|
|
dataIsOwned = TRUE;
|
|
|
|
|
|
|
|
|
|
uprv_mstrm_close(ifs);
|
|
|
|
|
addToCache(realName); // add the newly constructed data to cache
|
|
|
|
|
return realName;
|
2000-05-24 19:58:40 +00:00
|
|
|
|
} else {
|
2000-08-11 01:27:17 +00:00
|
|
|
|
status = U_MISSING_RESOURCE_ERROR;
|
|
|
|
|
return 0;
|
2000-05-24 19:58:40 +00:00
|
|
|
|
}
|
2000-05-24 20:33:09 +00:00
|
|
|
|
} else {
|
|
|
|
|
return 0;
|
2000-05-22 19:49:10 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
/**
 * Constructs a collator for the given locale.
 *
 * The collation data is looked up through constructFromBundle(). When that
 * fails, the collator falls back to the default table: first from the cache
 * (under ResourceBundle::kDefaultFilename) and, if it is not cached yet, by
 * building it from the hard-coded DEFAULTRULES.
 *
 * @param desiredLocale  the locale whose collation is requested
 * @param status         in/out error code. Nothing is done if it already
 *                       indicates a failure on entry. On return it is
 *                       U_USING_DEFAULT_ERROR whenever the default table had
 *                       to be used, or a failure code if even the default
 *                       table could not be built.
 */
RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
                                     UErrorCode&   status)
    : Collator(),
      isOverIgnore(FALSE),
      mPattern(0),
      cursor1(0),
      cursor2(0),
      dataIsOwned(FALSE),
      data(0)
{
    if (U_FAILURE(status)) {
        return;
    }

    // Try to load, in order:
    // 1. The desired locale's collation.
    // 2. A fallback of the desired locale.
    // 3. The default locale's collation.
    // 4. A fallback of the default locale.
    // 5. The default collation rules, which contain en_US collation rules.
    //
    // To reiterate, we try:
    //   Specific:  language+country+variant, language+country, language
    //   Default:   language+country+variant, language+country, language
    //   Root:      (aka DEFAULTRULES)
    //
    // Steps 1-5 are handled by the resource bundle fallback mechanism.
    // However, in the very unlikely situation that no resource bundle data
    // exists at all, step 5 is repeated with the hard-coded default rules.
    const char *locName = constructFromBundle(desiredLocale, status);

    if (U_SUCCESS(status)) {
        data->desiredLocale  = desiredLocale;
        data->realLocaleName = locName;
        if (status != U_USING_DEFAULT_ERROR) {
            setDecomposition(Normalizer::NO_OP);
        }
        return;
    }

    // No bundle data: fall back to the default table, preferring a cached copy.
    UErrorCode intStatus = U_ZERO_ERROR;
    constructFromCache(ResourceBundle::kDefaultFilename, intStatus);
    if (U_SUCCESS(intStatus)) {
        // The fallback succeeded, so replace the stale bundle failure with
        // the same warning that the rule-built fallback below reports.
        // Otherwise the first construction would succeed with a warning
        // while every later (cache-hit) construction reported an error.
        status = U_USING_DEFAULT_ERROR;
    } else {
        intStatus = U_ZERO_ERROR;
        constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus);
        if (intStatus == U_ZERO_ERROR) {
            status = U_USING_DEFAULT_ERROR;
        } else {
            status = intStatus; // bubble back
        }

        // Any failure here (not only out-of-memory) leaves no usable table:
        // do not touch `data` and do not publish it to the cache.
        if (U_FAILURE(status)) {
            return;
        }
    }

    data->realLocaleName = ResourceBundle::kDefaultFilename;
    setDecomposition(Normalizer::NO_OP);
    addToCache(ResourceBundle::kDefaultFilename);
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::constructFromFile( const Locale& locale,
|
|
|
|
|
const UnicodeString& localeFileName,
|
2000-05-18 22:08:39 +00:00
|
|
|
|
UBool tryBinaryFile,
|
1999-08-16 21:50:52 +00:00
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
|
|
|
|
// constructFromFile creates a collation object by reading from a
|
|
|
|
|
// file. It does not employ the usual FILE search mechanism with
|
|
|
|
|
// locales, default locales, and base locales. Instead, it tries to
|
|
|
|
|
// look only in files with the given localFileName. It does,
|
|
|
|
|
// however, employ the LOCALE search mechanism.
|
|
|
|
|
|
|
|
|
|
// This method maintains the binary collation files. If a collation
|
|
|
|
|
// is not present in binary form, but is present in text form (in a
|
|
|
|
|
// resource bundle file), it will be loaded in text form, and then
|
|
|
|
|
// written to disk.
|
|
|
|
|
|
|
|
|
|
// If tryBinaryFile is true, then try to load from the binary file first.
|
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if(U_FAILURE(status)) {
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if(dataIsOwned) {
|
|
|
|
|
delete data;
|
|
|
|
|
data = 0;
|
|
|
|
|
}
|
2000-08-11 01:27:17 +00:00
|
|
|
|
|
2000-05-15 18:39:17 +00:00
|
|
|
|
if(tryBinaryFile) {
|
2000-08-11 01:27:17 +00:00
|
|
|
|
char *binaryFilePath = createPathName(UnicodeString(u_getDataDirectory(),""),
|
|
|
|
|
localeFileName,
|
|
|
|
|
UnicodeString(kFilenameSuffix,""));
|
|
|
|
|
|
2000-05-15 18:39:17 +00:00
|
|
|
|
// Try to load up the collation from a binary file first
|
|
|
|
|
constructFromFile(binaryFilePath, status);
|
|
|
|
|
#ifdef COLLDEBUG
|
|
|
|
|
cerr << localeFileName << kFilenameSuffix << " binary load " << u_errorName(status) << endl;
|
|
|
|
|
#endif
|
|
|
|
|
if(U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
|
|
|
|
|
delete [] binaryFilePath;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if(status == U_FILE_ACCESS_ERROR) {
|
|
|
|
|
status = U_ZERO_ERROR;
|
|
|
|
|
}
|
2000-06-26 22:46:15 +00:00
|
|
|
|
delete [] binaryFilePath;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Now try to load it up from a resource bundle text source file
|
2000-04-15 21:23:28 +00:00
|
|
|
|
UnicodeString dataDir = UnicodeString(u_getDataDirectory(),"");
|
1999-12-04 02:31:40 +00:00
|
|
|
|
|
2000-04-15 21:23:28 +00:00
|
|
|
|
char *ch;
|
|
|
|
|
ch = new char[localeFileName.size() + 1];
|
|
|
|
|
ch[localeFileName.extract(0, 0x7fffffff, ch, "")] = 0;
|
|
|
|
|
ResourceBundle bundle(dataDir, ch, status);
|
|
|
|
|
|
|
|
|
|
delete [] ch;
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
// if there is no resource bundle file for the give locale, break out
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if(U_FAILURE(status))
|
1999-12-03 02:51:54 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
2000-05-15 18:39:17 +00:00
|
|
|
|
#ifdef COLLDEBUG
|
|
|
|
|
cerr << localeFileName << " ascii load " << u_errorName(status) << endl;
|
|
|
|
|
#endif
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
2000-05-15 18:39:17 +00:00
|
|
|
|
// check and see if this resource bundle contains collation data
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
2000-05-15 18:39:17 +00:00
|
|
|
|
UnicodeString colString;
|
|
|
|
|
UErrorCode intStatus = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
2000-07-14 22:31:35 +00:00
|
|
|
|
ResourceBundle colElems = bundle.get("CollationElements", intStatus);
|
|
|
|
|
if (U_FAILURE(intStatus))
|
|
|
|
|
{
|
|
|
|
|
status = U_MISSING_RESOURCE_ERROR;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
colString = colElems.getStringEx("Sequence", intStatus);
|
|
|
|
|
|
|
|
|
|
if(U_FAILURE(intStatus)) {
|
|
|
|
|
status = U_MISSING_RESOURCE_ERROR;
|
|
|
|
|
return;
|
|
|
|
|
}
|
2000-05-15 18:39:17 +00:00
|
|
|
|
|
|
|
|
|
if(colString.isBogus()) {
|
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
|
return;
|
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
// Having loaded the collation from the resource bundle text file,
|
|
|
|
|
// now retrieve the CollationElements tagged data, merged with the
|
|
|
|
|
// default rules. If that fails, use the default rules alone.
|
|
|
|
|
|
|
|
|
|
colString.insert(0, DEFAULTRULES);
|
|
|
|
|
if(colString.isBogus()) {
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constructFromRules(colString, intStatus);
|
1999-10-07 00:07:53 +00:00
|
|
|
|
if(intStatus == U_MEMORY_ALLOCATION_ERROR) {
|
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
1999-10-07 00:07:53 +00:00
|
|
|
|
if(intStatus != U_ZERO_ERROR) {
|
|
|
|
|
status = U_USING_DEFAULT_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
// predefined tables should contain correct grammar
|
1999-10-07 00:07:53 +00:00
|
|
|
|
intStatus = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
constructFromRules(DEFAULTRULES, intStatus);
|
1999-10-07 00:07:53 +00:00
|
|
|
|
if(intStatus != U_ZERO_ERROR) {
|
1999-08-16 21:50:52 +00:00
|
|
|
|
status = intStatus;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifdef COLLDEBUG
|
1999-11-23 22:49:29 +00:00
|
|
|
|
cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << " - try= " << (tryBinaryFile?"true":"false") << endl;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#endif
|
2000-06-26 22:46:15 +00:00
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Destructor. Releases the collation table (only when this object owns it),
 * both comparison cursors and the pending rule pattern.
 */
RuleBasedCollator::~RuleBasedCollator()
{
    // The table is deleted only if this collator owns it.
    if (dataIsOwned) {
        delete data;
    }
    data = 0;

    // Deleting a null pointer is a no-op, so no explicit checks are needed.
    delete cursor1;
    cursor1 = 0;

    delete cursor2;
    cursor2 = 0;

    delete mPattern;
    mPattern = 0;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Returns a newly allocated copy of this collator, made with the copy
 * constructor. The caller receives the pointer produced by `new`.
 */
Collator*
RuleBasedCollator::clone() const
{
    RuleBasedCollator *copy = new RuleBasedCollator(*this);
    return copy;
}
|
|
|
|
|
|
|
|
|
|
// Create a CollationElementIterator object that will iterator over the elements
|
|
|
|
|
// in a string, using the collation rules defined in this RuleBasedCollator
|
|
|
|
|
CollationElementIterator*
|
|
|
|
|
RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
CollationElementIterator *newCursor = 0;
|
|
|
|
|
|
|
|
|
|
newCursor = new CollationElementIterator(source, this, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return newCursor;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Create a CollationElementIterator object that will iterator over the elements
|
|
|
|
|
// in a string, using the collation rules defined in this RuleBasedCollator
|
|
|
|
|
CollationElementIterator*
|
|
|
|
|
RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
CollationElementIterator *newCursor = 0;
|
|
|
|
|
|
|
|
|
|
newCursor = new CollationElementIterator(source, this, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return newCursor;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Return a string representation of this collator's rules.
|
|
|
|
|
// The string can later be passed to the constructor that takes a
|
|
|
|
|
// UnicodeString argument, which will construct a collator that's
|
|
|
|
|
// functionally identical to this one.
|
|
|
|
|
// You can also allow users to edit the string in order to change
|
|
|
|
|
// the collation data, or you can print it out for inspection, or whatever.
|
|
|
|
|
|
|
|
|
|
const UnicodeString&
|
|
|
|
|
RuleBasedCollator::getRules() const
|
|
|
|
|
{
|
|
|
|
|
if (mPattern != 0)
|
|
|
|
|
{
|
|
|
|
|
MergeCollation*& nonConstMPattern = *(MergeCollation**)&mPattern;
|
|
|
|
|
mPattern->emitPattern(data->ruleTable);
|
|
|
|
|
data->isRuleTableLoaded = TRUE;
|
|
|
|
|
delete nonConstMPattern;
|
|
|
|
|
nonConstMPattern = 0;
|
|
|
|
|
}
|
|
|
|
|
else if (!data->isRuleTableLoaded)
|
|
|
|
|
{
|
|
|
|
|
// At this point the caller wants the rules, but the rule table data
|
|
|
|
|
// is not loaded. Furthermore, there is no mPattern object to load
|
|
|
|
|
// the rules from. Therefore, we fetch the rules off the disk.
|
|
|
|
|
// Notice that we pass in a tryBinaryFile value of FALSE, since
|
|
|
|
|
// by design the binary file has NO rules in it!
|
2000-06-26 22:46:15 +00:00
|
|
|
|
//UErrorCode status = U_ZERO_ERROR;
|
|
|
|
|
//RuleBasedCollator temp(data->realLocaleName, status);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
RuleBasedCollator temp;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
temp.constructFromFile(data->desiredLocale, data->realLocaleName, FALSE, status);
|
|
|
|
|
|
|
|
|
|
// We must check that mPattern is nonzero here, or we run the risk
|
|
|
|
|
// of an infinite loop.
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_SUCCESS(status) && temp.mPattern != 0)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
data->ruleTable = temp.getRules();
|
|
|
|
|
data->isRuleTableLoaded = TRUE;
|
|
|
|
|
#ifdef _DEBUG
|
1999-12-01 17:50:12 +00:00
|
|
|
|
// // the following is useful for specific debugging purposes
|
|
|
|
|
// UnicodeString name;
|
|
|
|
|
// cerr << "Table collation rules loaded dynamically for "
|
|
|
|
|
// << data->desiredLocale.getName(name)
|
|
|
|
|
// << " at "
|
|
|
|
|
// << data->realLocaleName
|
|
|
|
|
// << ", " << dec << data->ruleTable.size() << " characters"
|
|
|
|
|
// << endl;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
#ifdef _DEBUG
|
1999-12-01 17:50:12 +00:00
|
|
|
|
// UnicodeString name;
|
|
|
|
|
// cerr << "Unable to load table collation rules dynamically for "
|
|
|
|
|
// << data->desiredLocale.getName(name)
|
|
|
|
|
// << " at "
|
|
|
|
|
// << data->realLocaleName
|
|
|
|
|
// << endl;
|
1999-12-08 23:31:17 +00:00
|
|
|
|
// cerr << "Status " << u_errorName(status) << ", mPattern " << temp.mPattern << endl;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
#endif
|
2000-08-11 01:27:17 +00:00
|
|
|
|
/* SRL have to add this because we now have the situation where
|
|
|
|
|
DEFAULT is loaded from a binary file w/ no rules. */
|
|
|
|
|
UErrorCode intStatus = U_ZERO_ERROR;
|
|
|
|
|
temp.constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus);
|
|
|
|
|
|
|
|
|
|
if(U_SUCCESS(intStatus) && (temp.mPattern != 0))
|
|
|
|
|
{
|
|
|
|
|
data->ruleTable = temp.getRules();
|
|
|
|
|
data->isRuleTableLoaded = TRUE;
|
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return data->ruleTable;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Compares the leading parts of two strings.
 *
 * From each string, the characters starting at offset 0 and numbering
 * min(length, string length) are extracted, and the two extracts are
 * compared with compare(const UnicodeString&, const UnicodeString&).
 *
 * @param source  the source string
 * @param target  the target string
 * @param length  the number of leading characters to take from each string
 * @return the result of comparing the two extracts
 */
Collator::EComparisonResult
RuleBasedCollator::compare(const UnicodeString& source,
                           const UnicodeString& target,
                           int32_t              length) const
{
    UTextOffset start = 0;

    UnicodeString sourcePrefix;
    source.extract(start, uprv_min(length, source.length()), sourcePrefix);

    UnicodeString targetPrefix;
    target.extract(start, uprv_min(length, target.length()), targetPrefix);

    return RuleBasedCollator::compare(sourcePrefix, targetPrefix);
}
|
|
|
|
|
|
1999-11-23 22:49:29 +00:00
|
|
|
|
/**
 * Compares two UChar strings using this collator's rules and strength.
 *
 * The collation elements of both strings are walked in parallel. A primary
 * difference decides the result immediately; the first secondary or tertiary
 * difference is only remembered and is returned if no primary difference
 * follows. With IDENTICAL strength, strings that are otherwise equal are
 * finally ordered by comparing their normalized text.
 *
 * @param source        the source text
 * @param sourceLength  the number of UChars in source
 * @param target        the target text
 * @param targetLength  the number of UChars in target
 * @return LESS, EQUAL or GREATER. EQUAL is also returned when both pointers
 *         are null, when both lengths are zero, or when an internal step
 *         reports an error.
 */
Collator::EComparisonResult
RuleBasedCollator::compare(const UChar* source,
                           int32_t      sourceLength,
                           const UChar* target,
                           int32_t      targetLength) const
{
    // check if source and target are valid strings
    if (((source == 0) && (target == 0)) ||
        ((sourceLength == 0) && (targetLength == 0)))
    {
        return Collator::EQUAL;
    }

    Collator::EComparisonResult result = Collator::EQUAL;
    UErrorCode status = U_ZERO_ERROR;

    // The two cursors are created on first use and re-targeted afterwards.
    // compare() is const, hence the cast when they are first stored.
    if (cursor1 == NULL)
    {
        ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLength, getDecomposition());
    }
    else
    {
        cursor1->setModeAndText(getDecomposition(), source, sourceLength, status);
    }

    if (U_FAILURE(status))
    {
        return Collator::EQUAL;
    }

    if (cursor2 == NULL)
    {
        ((RuleBasedCollator *)this)->cursor2 = new NormalizerIterator(target, targetLength, getDecomposition());
    }
    else
    {
        cursor2->setModeAndText(getDecomposition(), target, targetLength, status);
    }

    if (U_FAILURE(status))
    {
        return Collator::EQUAL;
    }

    int32_t sOrder, tOrder;
    UBool gets = TRUE, gett = TRUE;     // fetch the next source / target element?
    UBool initialCheckSecTer = getStrength() >= Collator::SECONDARY;
    UBool checkSecTer = initialCheckSecTer;
    UBool checkTertiary = getStrength() >= Collator::TERTIARY;
    UBool isFrenchSec = data->isFrenchSec;
    uint32_t pSOrder, pTOrder;

    for (;;)
    {
        // Get the next collation element in each of the strings, unless
        // we've been requested to skip it.
        if (gets)
        {
            sOrder = getStrengthOrder((NormalizerIterator*)cursor1, status);

            if (U_FAILURE(status))
            {
                return Collator::EQUAL;
            }
        }

        gets = TRUE;

        if (gett)
        {
            tOrder = getStrengthOrder((NormalizerIterator*)cursor2, status);

            if (U_FAILURE(status))
            {
                return Collator::EQUAL;
            }
        }

        gett = TRUE;

        // If we've hit the end of one of the strings, jump out of the loop
        if ((sOrder == CollationElementIterator::NULLORDER) ||
            (tOrder == CollationElementIterator::NULLORDER))
        {
            break;
        }

        // If there's no difference at this position, we can skip to the
        // next one.
        pSOrder = CollationElementIterator::primaryOrder(sOrder);
        pTOrder = CollationElementIterator::primaryOrder(tOrder);
        if (sOrder == tOrder)
        {
            if (isFrenchSec && pSOrder != 0)
            {
                if (!checkSecTer)
                {
                    // in french, a secondary difference more to the right is stronger,
                    // so accents have to be checked with each base element
                    checkSecTer = initialCheckSecTer;

                    // but tertiary differences are less important than the first
                    // secondary difference, so checking tertiary remains disabled
                    checkTertiary = FALSE;
                }
            }

            continue;
        }

        // Compare primary differences first.
        if (pSOrder != pTOrder)
        {
            if (sOrder == 0)
            {
                // The entire source element is ignorable.
                // Skip to the next source element, but don't fetch another target element.
                gett = FALSE;
                continue;
            }

            if (tOrder == 0)
            {
                // The entire target element is ignorable.
                // Skip to the next target element, but don't fetch another source element.
                gets = FALSE;
                continue;
            }

            // The source and target elements aren't ignorable, but it's still possible
            // for the primary component of one of the elements to be ignorable....
            if (pSOrder == 0)  // primary order in source is ignorable
            {
                // The source's primary is ignorable, but the target's isn't.  We treat ignorables
                // as a secondary difference, so remember that we found one.
                if (checkSecTer)
                {
                    result = Collator::GREATER;  // (strength is SECONDARY)
                    checkSecTer = FALSE;
                }

                // Skip to the next source element, but don't fetch another target element.
                gett = FALSE;
            }
            else if (pTOrder == 0)
            {
                // record differences - see the comment above.
                if (checkSecTer)
                {
                    result = Collator::LESS;  // (strength is SECONDARY)
                    checkSecTer = FALSE;
                }

                // Skip to the next target element, but don't fetch another source element.
                gets = FALSE;
            }
            else
            {
                // Neither of the orders is ignorable, and we already know that the primary
                // orders are different because of the (pSOrder != pTOrder) test above.
                // Record the difference and stop the comparison.
                if (pSOrder < pTOrder)
                {
                    return Collator::LESS;  // (strength is PRIMARY)
                }

                return Collator::GREATER;  // (strength is PRIMARY)
            }
        }
        else
        { // else of if ( pSOrder != pTOrder )
            // primary order is the same, but complete order is different. So there
            // are no base elements at this point, only ignorables (Since the strings are
            // normalized)

            if (checkSecTer)
            {
                // a secondary or tertiary difference may still matter
                uint32_t secSOrder = CollationElementIterator::secondaryOrder(sOrder);
                uint32_t secTOrder = CollationElementIterator::secondaryOrder(tOrder);

                if (secSOrder != secTOrder)
                {
                    // there is a secondary difference
                    result = (secSOrder < secTOrder) ? Collator::LESS : Collator::GREATER;
                                                // (strength is SECONDARY)
                    checkSecTer = FALSE;
                    // (even in french, only the first secondary difference within
                    //  a base character matters)
                }
                else
                {
                    if (checkTertiary)
                    {
                        // a tertiary difference may still matter
                        uint32_t terSOrder = CollationElementIterator::tertiaryOrder(sOrder);
                        uint32_t terTOrder = CollationElementIterator::tertiaryOrder(tOrder);

                        if (terSOrder != terTOrder)
                        {
                            // there is a tertiary difference
                            result = (terSOrder < terTOrder) ? Collator::LESS : Collator::GREATER;
                                                // (strength is TERTIARY)
                            checkTertiary = FALSE;
                        }
                    }
                }
            } // if (checkSecTer)

        }  // if ( pSOrder != pTOrder )
    } // for(;;)

    if (sOrder != CollationElementIterator::NULLORDER)
    {
        // (tOrder must be CollationElementIterator::NULLORDER,
        //  since this point is only reached when sOrder or tOrder is NULLORDER.)
        // The source string has more elements, but the target string hasn't.
        do
        {
            if (CollationElementIterator::primaryOrder(sOrder) != 0)
            {
                // We found an additional non-ignorable base character in the source string.
                // This is a primary difference, so the source is greater
                return Collator::GREATER; // (strength is PRIMARY)
            }

            if (CollationElementIterator::secondaryOrder(sOrder) != 0)
            {
                // Additional secondary elements mean the source string is greater
                if (checkSecTer)
                {
                    result = Collator::GREATER;  // (strength is SECONDARY)
                    checkSecTer = FALSE;
                }
            }
        }
        while ((sOrder = getStrengthOrder(cursor1, status)) != CollationElementIterator::NULLORDER);
    }
    else if (tOrder != CollationElementIterator::NULLORDER)
    {
        // The target string has more elements, but the source string hasn't.
        do
        {
            if (CollationElementIterator::primaryOrder(tOrder) != 0)
            {
                // We found an additional non-ignorable base character in the target string.
                // This is a primary difference, so the source is less
                return Collator::LESS; // (strength is PRIMARY)
            }

            if (CollationElementIterator::secondaryOrder(tOrder) != 0)
            {
                // Additional secondary elements in the target mean the source string is less
                if (checkSecTer)
                {
                    result = Collator::LESS;  // (strength is SECONDARY)
                    checkSecTer = FALSE;
                }
            }
        }
        while ((tOrder = getStrengthOrder(cursor2, status)) != CollationElementIterator::NULLORDER);
    }

    // For IDENTICAL comparisons, we use a bitwise character comparison
    // as a tiebreaker if all else is equal
    // NOTE: The java code compares result with 0, and
    // puts the result of the string comparison directly into result
    if (result == Collator::EQUAL && getStrength() == IDENTICAL)
    {
        // Wrap exactly the caller-specified ranges. Handing the bare pointers
        // to Normalizer::normalize() would convert them as NUL-terminated
        // strings and ignore sourceLength/targetLength, so the tie-break
        // could look at different text than the element comparison above.
        // A negative length keeps the NUL-terminated interpretation.
        UnicodeString sourceText = (sourceLength >= 0) ? UnicodeString(source, sourceLength)
                                                       : UnicodeString(source);
        UnicodeString targetText = (targetLength >= 0) ? UnicodeString(target, targetLength)
                                                       : UnicodeString(target);

        UnicodeString sourceDecomp, targetDecomp;
        int8_t comparison;

        Normalizer::normalize(sourceText, getDecomposition(),
                              0, sourceDecomp, status);

        Normalizer::normalize(targetText, getDecomposition(),
                              0, targetDecomp, status);

        comparison = sourceDecomp.compare(targetDecomp);

        if (comparison < 0)
        {
            result = Collator::LESS;
        }
        else if (comparison == 0)
        {
            result = Collator::EQUAL;
        }
        else
        {
            result = Collator::GREATER;
        }
    }

    return result;
}
|
|
|
|
|
|
1999-11-23 22:49:29 +00:00
|
|
|
|
|
|
|
|
|
/**
 * Finds the collation order of the longest contracting sequence that starts
 * with ch and continues with the characters at the cursor.
 *
 * @param cursor  supplies the characters following ch; it is advanced past
 *                every character that extends a match
 * @param ch      the first character of the candidate sequence
 * @param status  not used by this implementation
 * @return the order of the longest match found; the order of the first
 *         entry in ch's contraction list if no following character matches
 */
int32_t
RuleBasedCollator::nextContractChar(NormalizerIterator *cursor,
                                    UChar               ch,
                                    UErrorCode&         status) const
{
    // Start with the ordering of the single character: the first entry of
    // its contraction list.
    VectorOfPToContractElement *list = getContractValues(ch);
    int32_t order = ((EntryPair *)list->at(0))->value;

    // `key` is the scratch buffer for the candidate sequence. This method is
    // const, so the buffer is reached through a cast.
    UnicodeString& candidate = (UnicodeString&)key;
    candidate.remove();
    candidate += ch;

    // Extend the candidate one character at a time, keeping the order of the
    // longest sequence that is still in the list.
    for (;;)
    {
        ch = cursor->current();
        if (ch == Normalizer::DONE)
        {
            break;
        }

        candidate += ch;

        int32_t n = getEntry(list, key, TRUE);
        if (n == UNMAPPED)
        {
            // The longer sequence is unknown; the cursor is left on this
            // character.
            break;
        }

        cursor->next();
        order = ((EntryPair *)list->at(n))->value;
    }

    return order;
}
|
|
|
|
|
|
|
|
|
|
// Compare two strings using this collator
|
|
|
|
|
Collator::EComparisonResult
|
|
|
|
|
RuleBasedCollator::compare(const UnicodeString& source,
|
|
|
|
|
const UnicodeString& target) const
|
|
|
|
|
{
|
|
|
|
|
return compare(source.getUChars(), source.length(), target.getUChars(), target.length());
|
|
|
|
|
}
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
// Retrieve a collation key for the specified string
|
|
|
|
|
// The key can be compared with other collation keys using a bitwise comparison
|
|
|
|
|
// (e.g. memcmp) to find the ordering of their respective source strings.
|
|
|
|
|
// This is handy when doing a sort, where each sort key must be compared
|
|
|
|
|
// many times.
|
|
|
|
|
//
|
|
|
|
|
// The basic algorithm here is to find all of the collation elements for each
|
|
|
|
|
// character in the source string, convert them to an ASCII representation,
|
|
|
|
|
// and put them into the collation key. But it's trickier than that.
|
|
|
|
|
// Each collation element in a string has three components: primary ('A' vs 'B'),
|
|
|
|
|
// secondary ('u' vs u-umlaut, U+00FC), and tertiary ('A' vs 'a'), and a primary difference
|
|
|
|
|
// at the end of a string takes precedence over a secondary or tertiary
|
|
|
|
|
// difference earlier in the string.
|
|
|
|
|
//
|
|
|
|
|
// To account for this, we put all of the primary orders at the beginning of the
|
|
|
|
|
// string, followed by the secondary and tertiary orders. Each set of orders is
|
|
|
|
|
// terminated by nulls so that a key for a string which is an initial substring of
|
|
|
|
|
// another key will compare less without any special case.
|
|
|
|
|
//
|
|
|
|
|
// Here's a hypothetical example, with the collation element represented as
|
|
|
|
|
// a three-digit number, one digit for primary, one for secondary, etc.
|
|
|
|
|
//
|
|
|
|
|
// String:              A     a     B    E-acute (U+00C9)
|
|
|
|
|
// Collation Elements: 101 100 201 511
|
|
|
|
|
// Collation Key: 1125<null>0001<null>1011<null>
|
|
|
|
|
//
|
|
|
|
|
// To make things even trickier, secondary differences (accent marks) are compared
|
|
|
|
|
// starting at the *end* of the string in languages with French secondary ordering.
|
|
|
|
|
// But when comparing the accent marks on a single base character, they are compared
|
|
|
|
|
// from the beginning. To handle this, we reverse all of the accents that belong
|
|
|
|
|
// to each base character, then we reverse the entire string of secondary orderings
|
|
|
|
|
// at the end.
|
|
|
|
|
//
|
|
|
|
|
CollationKey&
|
|
|
|
|
RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
|
|
|
|
CollationKey& sortkey,
|
|
|
|
|
UErrorCode& status) const
|
1999-11-23 22:49:29 +00:00
|
|
|
|
{
|
1999-12-08 02:11:04 +00:00
|
|
|
|
return RuleBasedCollator::getCollationKey(source.getUChars(), source.length(), sortkey, status);
|
1999-11-23 22:49:29 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Build the collation key for a UChar buffer of a given length.
//
// source     the text to build a key for; NULL yields an empty (reset) key
// sourceLen  number of UChars in source; 0 yields an empty (reset) key
// sortkey    receives the key; it is also the return value
// status     error code; on a failure the key is set to bogus (or reset, on
//            the paths marked below) and status holds the error
//
// The key is produced in two passes over the collation elements of the text:
// the first pass counts how many primary, secondary and tertiary entries are
// needed, the second pass stores them.  Every entry takes two bytes, and each
// section is followed by a two-byte zero terminator.
CollationKey&
RuleBasedCollator::getCollationKey( const   UChar*  source,
                                    int32_t         sourceLen,
                                    CollationKey&   sortkey,
                                    UErrorCode&     status) const
{
    if (U_FAILURE(status))
    {
        // NOTE(review): this replaces whatever error the caller passed in.
        // Kept as-is because callers may test for this specific code.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return sortkey.setToBogus();
    }

    if ((!source) || (sourceLen == 0))
    {
        return sortkey.reset();
    }

    // cursor1 is a lazily created iterator that is reused between calls.
    if (cursor1 == NULL)
    {
        ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLen, getDecomposition());

        // The iterator is dereferenced below, so a failed allocation must
        // be reported here rather than crash later.
        if (cursor1 == NULL)
        {
            status = U_MEMORY_ALLOCATION_ERROR;
            return sortkey.setToBogus();
        }
    }
    else
    {
        cursor1->setModeAndText(getDecomposition(), source, sourceLen, status);
    }

    if (U_FAILURE(status))
    {
        return sortkey.setToBogus();
    }

    UBool   compareSec   = (getStrength() >= Collator::SECONDARY);
    UBool   compareTer   = (getStrength() >= Collator::TERTIARY);
    UBool   compareIdent = (getStrength() == Collator::IDENTICAL);
    int32_t order        = 0;
    int32_t totalPrimary = 0;
    int32_t totalSec     = 0;
    int32_t totalTer     = 0;
    int32_t totalIdent   = 0;
    UnicodeString decomp;

    // Pass 1: iterate over the source, counting primary, secondary, and
    // tertiary entries.
    while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) !=
                                    CollationElementIterator::NULLORDER)
    {
        int32_t secOrder = CollationElementIterator::secondaryOrder(order);
        int32_t terOrder = CollationElementIterator::tertiaryOrder(order);

        if (U_FAILURE(status))
        {
            return sortkey.setToBogus();
        }

        if (! CollationElementIterator::isIgnorable(order))
        {
            totalPrimary += 1;

            if (compareSec)
            {
                totalSec += 1;
            }

            if (compareTer)
            {
                totalTer += 1;
            }
        }
        else
        {
            // Ignorable elements contribute only their non-zero
            // secondary / tertiary weights.
            if (compareSec && secOrder != 0)
            {
                totalSec += 1;
            }

            if (compareTer && terOrder != 0)
            {
                totalTer += 1;
            }
        }
    }

    // The iterator may have reported a failure together with NULLORDER,
    // which the in-loop check never sees.
    if (U_FAILURE(status))
    {
        return sortkey.setToBogus();
    }

    // count the null entries that terminate each section
    totalPrimary += 1;

    if (compareSec)
    {
        totalSec += 1;
    }

    if (compareTer)
    {
        totalTer += 1;
    }

    if (compareIdent)
    {
        // Normalize exactly sourceLen UChars.  Passing the bare pointer would
        // go through an implicit UChar* -> UnicodeString conversion that
        // ignores sourceLen.
        UnicodeString sourceText(source, sourceLen);

        Normalizer::normalize(sourceText, getDecomposition(),
                              0, decomp, status);

        // Without a valid decomposition no space is reserved for it, so it
        // must not be stored further down.
        if (U_FAILURE(status))
        {
            return sortkey.setToBogus();
        }

        totalIdent = decomp.length() + 1;
    }

    // Compute total number of bytes to hold the entries
    // and make sure the key can hold them
    uint32_t size = 2 * (totalPrimary + totalSec + totalTer + totalIdent);

    sortkey.ensureCapacity(size);

    if (sortkey.isBogus())
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        return sortkey;
    }

    // Byte offsets of the write position in each section of the key.
    int32_t primaryCursor = 0;
    int32_t secCursor     = 2 * totalPrimary;
    int32_t secBase       = secCursor;
    // Start of the secondary weights written since the last non-ignorable
    // element (used for French secondary ordering).
    int32_t preSecIgnore  = secBase;
    int32_t terCursor     = secCursor + (2 * totalSec);
    int32_t identCursor   = terCursor + (2 * totalTer);

    // reset source to the beginning
    cursor1->reset();

    // Pass 2: iterate over the source again, storing the actual entries.
    while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != CollationElementIterator::NULLORDER)
    {
        if (U_FAILURE(status))
        {
            return sortkey.reset();
        }

        int32_t primaryOrder = CollationElementIterator::primaryOrder(order);
        int32_t secOrder     = CollationElementIterator::secondaryOrder(order);
        int32_t terOrder     = CollationElementIterator::tertiaryOrder(order);

        if (! CollationElementIterator::isIgnorable(order))
        {
            primaryCursor = sortkey.storeBytes(primaryCursor, primaryOrder + SORTKEYOFFSET);

            if (compareSec)
            {
                // French secondary: reverse the accents collected for the
                // previous base character before starting the next one.
                if (data->isFrenchSec && (preSecIgnore < secCursor))
                {
                    sortkey.reverseBytes(preSecIgnore, secCursor);
                }

                secCursor = sortkey.storeBytes(secCursor, secOrder + SORTKEYOFFSET);

                preSecIgnore = secCursor;
            }

            if (compareTer)
            {
                terCursor = sortkey.storeBytes(terCursor, terOrder + SORTKEYOFFSET);
            }
        }
        else
        {
            // Ignorable element: its weights are offset by the maximum
            // ignorable secondary / tertiary order.
            if (compareSec && secOrder != 0)
            {
                secCursor = sortkey.storeBytes(secCursor, secOrder + data->maxSecOrder + SORTKEYOFFSET);
            }

            if (compareTer && terOrder != 0)
            {
                terCursor = sortkey.storeBytes(terCursor, terOrder + data->maxTerOrder + SORTKEYOFFSET);
            }
        }
    }

    // Same as after pass 1: catch a failure reported with NULLORDER.
    if (U_FAILURE(status))
    {
        return sortkey.reset();
    }

    // append 0 at the end of each portion.
    sortkey.storeBytes(primaryCursor, 0);

    if (compareSec)
    {
        if (data->isFrenchSec)
        {
            if (preSecIgnore < secCursor)
            {
                sortkey.reverseBytes(preSecIgnore, secCursor);
            }

            // Reverse the whole secondary section so that it is compared
            // from the end of the string.
            sortkey.reverseBytes(secBase, secCursor);
        }

        sortkey.storeBytes(secCursor, 0);
    }

    if (compareTer)
    {
        sortkey.storeBytes(terCursor, 0);
    }

    if (compareIdent)
    {
        sortkey.storeUnicodeString(identCursor, decomp);
    }

    // Debugging - print out the sortkey [--srl]
    //    {
    //      const uint8_t *bytes;
    //      int32_t xcount;
    //      bytes = sortkey.getByteArray(xcount);
    //      // fprintf(stderr, "\n\n- [%02X] [%02X]\n\n", (int)(bytes[0]&0xFF), (int)(bytes[1]&0xFF) );
    //    }

    return sortkey;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Build this collator's rule tables based on a string representation of the rules
|
|
|
|
|
// See the big diagram at the top of this file for an overview of how the tables
|
|
|
|
|
// are organized.
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::build(const UnicodeString& pattern,
|
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// This array maps Unicode characters to their collation ordering
|
|
|
|
|
data->mapping = ucmp32_open(UNMAPPED);
|
|
|
|
|
|
|
|
|
|
if (data->mapping->fBogus)
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t i = 0;
|
|
|
|
|
UnicodeString lastGroupChars;
|
|
|
|
|
UnicodeString expChars;
|
|
|
|
|
UnicodeString groupChars;
|
|
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
|
if (pattern.length() == 0)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_INVALID_FORMAT_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Build the merged collation entries
|
|
|
|
|
// Since rules can be specified in any order in the string
|
|
|
|
|
// (e.g. "c , C < d , D < e , E .... C < CH")
|
|
|
|
|
// this splits all of the rules in the string out into separate
|
|
|
|
|
// objects and then sorts them. In the above example, it merges the
|
|
|
|
|
// "C < CH" rule in just before the "C < D" rule.
|
|
|
|
|
|
|
|
|
|
mPattern = new MergeCollation(pattern, getDecomposition(), status);
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
ucmp32_close(data->mapping);
|
|
|
|
|
data->mapping = 0;
|
|
|
|
|
delete mPattern;
|
|
|
|
|
mPattern = 0;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t order = 0;
|
|
|
|
|
|
|
|
|
|
// Walk through each entry
|
|
|
|
|
for (i = 0; i < mPattern->getCount(); ++i)
|
|
|
|
|
{
|
|
|
|
|
const PatternEntry* entry = mPattern->getItemAt(i);
|
|
|
|
|
groupChars.remove();
|
|
|
|
|
expChars.remove();
|
|
|
|
|
|
|
|
|
|
// if entry is valid
|
|
|
|
|
if (entry != NULL)
|
|
|
|
|
{
|
|
|
|
|
entry->getChars(groupChars);
|
|
|
|
|
|
|
|
|
|
// check if french secondary needs to be turned on
|
1999-12-08 02:11:04 +00:00
|
|
|
|
if ((groupChars.length() > 1) &&
|
2000-06-29 18:42:34 +00:00
|
|
|
|
(groupChars[groupChars.length()-1] == 0x0040))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
data->isFrenchSec = TRUE;
|
2000-06-29 18:42:34 +00:00
|
|
|
|
groupChars.remove(groupChars.length()-1);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
order = increment((Collator::ECollationStrength)entry->getStrength(), order);
|
|
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
|
if (entry->getExtension(expChars).length() != 0)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
// encountered an expanding character, where one character on input
|
|
|
|
|
// expands to several sort elements (e.g. '<27>' --> 'o' 'e')
|
|
|
|
|
addExpandOrder(groupChars, expChars, order, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
1999-12-08 02:11:04 +00:00
|
|
|
|
else if (groupChars.length() > 1)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
// encountered a contracting character, where several characters on input
|
|
|
|
|
// contract into one sort order. For example, "ch" is treated as a single
|
|
|
|
|
// character in traditional Spanish sorting.
|
|
|
|
|
addContractOrder(groupChars, order, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// Nothing out of the ordinary -- one character maps to one sort order
|
|
|
|
|
addOrder(groupChars[0], order, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// add expanding entries for pre-composed characters
|
|
|
|
|
addComposedChars();
|
|
|
|
|
|
|
|
|
|
// Fill in all the expanding chars values
|
|
|
|
|
commit();
|
|
|
|
|
|
|
|
|
|
// Compact the data mapping table
|
|
|
|
|
ucmp32_compact(data->mapping, 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Add expanding entries for pre-composed unicode characters so that this
|
|
|
|
|
* collator can be used reasonably well with decomposition turned off.
|
|
|
|
|
*/
|
|
|
|
|
void RuleBasedCollator::addComposedChars()
|
|
|
|
|
{
|
|
|
|
|
UnicodeString buf;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
// Iterate through all of the pre-composed characters in Unicode
|
|
|
|
|
ComposedCharIter iter;
|
|
|
|
|
UnicodeString decomp;
|
|
|
|
|
|
|
|
|
|
while (iter.hasNext())
|
|
|
|
|
{
|
|
|
|
|
UChar c = iter.next();
|
|
|
|
|
|
|
|
|
|
if (getCharOrder(c) == UNMAPPED)
|
|
|
|
|
{
|
|
|
|
|
//
|
|
|
|
|
// We don't already have an ordering for this pre-composed character.
|
|
|
|
|
//
|
|
|
|
|
// First, see if the decomposed string is already in our
|
|
|
|
|
// tables as a single contracting-string ordering.
|
|
|
|
|
// If so, just map the precomposed character to that order.
|
|
|
|
|
//
|
|
|
|
|
// TODO: What we should really be doing here is trying to find the
|
|
|
|
|
// longest initial substring of the decomposition that is present
|
|
|
|
|
// in the tables as a contracting character sequence, and find its
|
|
|
|
|
// ordering. Then do this recursively with the remaining chars
|
|
|
|
|
// so that we build a list of orderings, and add that list to
|
|
|
|
|
// the expansion table.
|
|
|
|
|
// That would be more correct but also significantly slower, so
|
|
|
|
|
// I'm not totally sure it's worth doing.
|
|
|
|
|
//
|
|
|
|
|
iter.getDecomposition(decomp);
|
|
|
|
|
int contractOrder = getContractOrder(decomp);
|
|
|
|
|
|
|
|
|
|
if (contractOrder != UNMAPPED)
|
|
|
|
|
{
|
|
|
|
|
addOrder(c, contractOrder, status);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
//
|
|
|
|
|
// We don't have a contracting ordering for the entire string
|
|
|
|
|
// that results from the decomposition, but if we have orders
|
|
|
|
|
// for each individual character, we can add an expanding
|
|
|
|
|
// table entry for the pre-composed character
|
|
|
|
|
//
|
2000-05-18 22:08:39 +00:00
|
|
|
|
UBool allThere = TRUE;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
int32_t i;
|
|
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
|
for (i = 0; i < decomp.length(); i += 1)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
if (getCharOrder(decomp[i]) == UNMAPPED)
|
|
|
|
|
{
|
|
|
|
|
allThere = FALSE;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (allThere)
|
|
|
|
|
{
|
|
|
|
|
buf.remove();
|
|
|
|
|
buf += c;
|
|
|
|
|
addExpandOrder(buf, decomp, UNMAPPED, status);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// When the expanding character tables are built by addExpandOrder,
|
|
|
|
|
// it doesn't know what the final ordering of each character
|
|
|
|
|
// in the expansion will be. Instead, it just puts the raw character
|
|
|
|
|
// code into the table, adding CHARINDEX as a flag. Now that we've
|
|
|
|
|
// finished building the mapping table, we can go back and look up
|
|
|
|
|
// that character to see what its real collation order is and
|
|
|
|
|
// stick that into the expansion table. That lets us avoid doing
|
|
|
|
|
// a two-stage lookup later.
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::commit()
|
|
|
|
|
{
|
|
|
|
|
// if there are any expanding characters
|
|
|
|
|
if (data->expandTable != NULL)
|
|
|
|
|
{
|
|
|
|
|
int32_t i;
|
|
|
|
|
for (i = 0; i < data->expandTable->size(); i += 1)
|
|
|
|
|
{
|
|
|
|
|
VectorOfInt* valueList = data->expandTable->at(i);
|
|
|
|
|
int32_t j;
|
|
|
|
|
for (j = 0; j < valueList->size(); j++)
|
|
|
|
|
{
|
|
|
|
|
// found a expanding character
|
|
|
|
|
// the expanding char value is not filled in yet
|
|
|
|
|
if ((valueList->at(j) < EXPANDCHARINDEX) &&
|
|
|
|
|
(valueList->at(j) > CHARINDEX))
|
|
|
|
|
{
|
|
|
|
|
// Get the real values for the non-filled entry
|
|
|
|
|
UChar ch = (UChar)(valueList->at(j) - CHARINDEX);
|
|
|
|
|
int32_t realValue = ucmp32_get(data->mapping, ch);
|
|
|
|
|
|
|
|
|
|
if (realValue == UNMAPPED)
|
|
|
|
|
{
|
|
|
|
|
// The real value is still unmapped, maybe it'signorable
|
|
|
|
|
valueList->atPut(j, IGNORABLEMASK & ch);
|
|
|
|
|
}
|
|
|
|
|
// fill in the value
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
valueList->atPut(j, realValue);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Increment of the last order based on the comparison level.
|
|
|
|
|
*/
|
|
|
|
|
int32_t
|
|
|
|
|
RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t lastValue)
|
|
|
|
|
{
|
|
|
|
|
switch(aStrength)
|
|
|
|
|
{
|
|
|
|
|
case Collator::PRIMARY:
|
|
|
|
|
// increment priamry order and mask off secondary and tertiary difference
|
|
|
|
|
lastValue += PRIMARYORDERINCREMENT;
|
|
|
|
|
lastValue &= PRIMARYORDERMASK;
|
|
|
|
|
isOverIgnore = TRUE;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case Collator::SECONDARY:
|
|
|
|
|
// increment secondary order and mask off tertiary difference
|
|
|
|
|
lastValue += SECONDARYORDERINCREMENT;
|
|
|
|
|
lastValue &= SECONDARYDIFFERENCEONLY;
|
|
|
|
|
|
|
|
|
|
// record max # of ignorable chars with secondary difference
|
|
|
|
|
if (isOverIgnore == FALSE)
|
|
|
|
|
{
|
|
|
|
|
data->maxSecOrder += 1;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case Collator::TERTIARY:
|
|
|
|
|
// increment tertiary order
|
|
|
|
|
lastValue += TERTIARYORDERINCREMENT;
|
|
|
|
|
|
|
|
|
|
// record max # of ignorable chars with tertiary difference
|
|
|
|
|
if (isOverIgnore == FALSE)
|
|
|
|
|
{
|
|
|
|
|
data->maxTerOrder += 1;
|
|
|
|
|
}
|
|
|
|
|
break;
|
1999-11-23 22:49:29 +00:00
|
|
|
|
|
|
|
|
|
// case IDENTICAL?
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return lastValue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Adds a character and its designated order into the collation table.
|
|
|
|
|
// This is the simple case, with no expansion or contraction
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::addOrder(UChar ch,
|
|
|
|
|
int32_t anOrder,
|
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// try to find the order of the char in the mapping table
|
|
|
|
|
int32_t order = ucmp32_get(data->mapping, ch);
|
|
|
|
|
|
|
|
|
|
if (order >= CONTRACTCHARINDEX)
|
|
|
|
|
{
|
|
|
|
|
// There's already an entry for this character that points to a contracting
|
|
|
|
|
// character table. Instead of adding the character directly to the mapping
|
|
|
|
|
// table, we must add it to the contract table instead.
|
|
|
|
|
key.remove();
|
|
|
|
|
key += ch;
|
|
|
|
|
if (key.isBogus())
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
addContractOrder(key, anOrder, status);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// add the entry to the mapping table, the same later entry replaces the previous one
|
|
|
|
|
ucmp32_set(data->mapping, ch, anOrder);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add an expanding-character entry to the table.
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::addExpandOrder( const UnicodeString& contractChars,
|
|
|
|
|
const UnicodeString& expandChars,
|
|
|
|
|
int32_t anOrder,
|
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Create an expansion table entry
|
|
|
|
|
int32_t tableIndex = addExpansion(anOrder, expandChars);
|
|
|
|
|
|
|
|
|
|
// And add its index into the main mapping table
|
1999-12-08 02:11:04 +00:00
|
|
|
|
if (contractChars.length() > 1)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
addContractOrder(contractChars, tableIndex, status);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
addOrder(contractChars[0], tableIndex, status);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Create a new list of orders in the expansion table.
//
// anOrder      order to store first in the list; skipped when UNMAPPED
// expandChars  characters whose orders follow.  A character that has no
//              order yet is stored as CHARINDEX + character and is
//              resolved later by commit().
//
// Returns EXPANDCHARINDEX plus the position of the new list in the
// expansion table, or 0 if memory could not be allocated.
int32_t RuleBasedCollator::addExpansion(int32_t anOrder, const UnicodeString &expandChars)
{
    if (data->expandTable == NULL)
    {
        data->expandTable = new VectorOfPToExpandTable();

        if (data->expandTable == NULL)
        {
            return 0;
        }
    }

    // If anOrder is valid, we want to add it at the beginning of the list
    int32_t offset = (anOrder == UNMAPPED) ? 0 : 1;

    VectorOfInt *valueList = new VectorOfInt(expandChars.length() + offset);

    // The list is written to immediately below, so report a failed
    // allocation the same way as a missing table.
    if (valueList == NULL)
    {
        return 0;
    }

    if (offset == 1)
    {
        valueList->atPut(0, anOrder);
    }

    int32_t i;
    for (i = 0; i < expandChars.length(); i += 1)
    {
        UChar ch = expandChars[i];
        int32_t mapValue = getCharOrder(ch);

        if (mapValue != UNMAPPED)
        {
            valueList->atPut(i + offset, mapValue);
        }
        else
        {
            // can't find it in the table, will be filled in by commit().
            valueList->atPut(i + offset, CHARINDEX + (int32_t)ch);
        }
    }

    // Add the expanding char list into the expansion table.
    int32_t tableIndex = EXPANDCHARINDEX + data->expandTable->size();
    data->expandTable->atPut(data->expandTable->size(), valueList);

    return tableIndex;
}
|
|
|
|
|
|
|
|
|
|
// Add a string of characters that contracts into a single ordering.
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::addContractOrder(const UnicodeString& groupChars,
|
|
|
|
|
int32_t anOrder,
|
2000-05-18 22:08:39 +00:00
|
|
|
|
UBool fwd,
|
1999-08-16 21:50:52 +00:00
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status))
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (data->contractTable == NULL)
|
|
|
|
|
{
|
|
|
|
|
data->contractTable = new VectorOfPToContractTable();
|
|
|
|
|
if (data->contractTable->isBogus())
|
|
|
|
|
{
|
|
|
|
|
delete data->contractTable;
|
|
|
|
|
data->contractTable = NULL;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// See if the initial character of the string already has a contract table.
|
|
|
|
|
// e.g. for "ch", look for 'c'.
|
|
|
|
|
int32_t entry = ucmp32_get(data->mapping, groupChars[0]);
|
|
|
|
|
VectorOfPToContractElement *entryTable = getContractValues(entry - CONTRACTCHARINDEX);
|
|
|
|
|
|
|
|
|
|
if (entryTable == NULL)
|
|
|
|
|
{
|
|
|
|
|
// We need to create a new table of contract entries for this base char
|
|
|
|
|
int32_t tableIndex = CONTRACTCHARINDEX + data->contractTable->size();
|
|
|
|
|
EntryPair *pair = NULL;
|
|
|
|
|
UnicodeString substring;
|
|
|
|
|
|
|
|
|
|
entryTable = new VectorOfPToContractElement();
|
|
|
|
|
if (entryTable->isBogus())
|
|
|
|
|
{
|
|
|
|
|
delete entryTable;
|
|
|
|
|
delete data->contractTable;
|
|
|
|
|
data->contractTable = NULL;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
data->contractTable->atPut(data->contractTable->size(), entryTable);
|
|
|
|
|
if (data->contractTable->isBogus())
|
|
|
|
|
{
|
|
|
|
|
delete entryTable;
|
|
|
|
|
delete data->contractTable;
|
|
|
|
|
data->contractTable = NULL;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Add the initial character's current ordering first. then
|
|
|
|
|
// update its mapping to point to this contract table
|
|
|
|
|
groupChars.extract(0, 1, substring);
|
|
|
|
|
if (substring.isBogus())
|
|
|
|
|
{
|
|
|
|
|
delete entryTable;
|
|
|
|
|
delete data->contractTable;
|
|
|
|
|
data->contractTable = NULL;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pair = new EntryPair(substring, entry);
|
|
|
|
|
|
|
|
|
|
entryTable->atPut(0, pair);
|
|
|
|
|
if (entryTable->isBogus())
|
|
|
|
|
{
|
|
|
|
|
delete entryTable;
|
|
|
|
|
delete data->contractTable;
|
|
|
|
|
data->contractTable = NULL;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ucmp32_set(data->mapping, groupChars[0], tableIndex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Now add (or replace) this string in the table
|
|
|
|
|
int32_t index = getEntry(entryTable, groupChars, fwd);
|
|
|
|
|
|
|
|
|
|
if (index != UNMAPPED)
|
|
|
|
|
{
|
|
|
|
|
EntryPair *pair = (EntryPair *) entryTable->at(index);
|
|
|
|
|
pair->value = anOrder;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
EntryPair *pair = new EntryPair(groupChars, anOrder, fwd);
|
|
|
|
|
|
|
|
|
|
entryTable->atPut(entryTable->size(), pair);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// If this was a forward mapping for a contracting string, also add a
|
|
|
|
|
// reverse mapping for it, so that CollationElementIterator::previous
|
|
|
|
|
// can work right
|
|
|
|
|
if (fwd)
|
|
|
|
|
{
|
|
|
|
|
UnicodeString reverse(groupChars);
|
|
|
|
|
|
|
|
|
|
if (reverse.isBogus())
|
|
|
|
|
{
|
|
|
|
|
delete entryTable;
|
|
|
|
|
delete data->contractTable;
|
|
|
|
|
data->contractTable = NULL;
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
addContractOrder(reverse.reverse(), anOrder, FALSE, status);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* If the given string has been specified as a contracting string
|
|
|
|
|
* in this collation table, return its ordering.
|
|
|
|
|
* Otherwise return UNMAPPED.
|
|
|
|
|
*/
|
|
|
|
|
int32_t RuleBasedCollator::getContractOrder(const UnicodeString &groupChars) const
|
|
|
|
|
{
|
|
|
|
|
int32_t result = UNMAPPED;
|
|
|
|
|
|
|
|
|
|
if (data->contractTable != NULL)
|
|
|
|
|
{
|
|
|
|
|
VectorOfPToContractElement *entryTable = getContractValues(groupChars[0]);
|
|
|
|
|
|
|
|
|
|
if (entryTable != NULL)
|
|
|
|
|
{
|
|
|
|
|
int32_t index = getEntry(entryTable, groupChars, TRUE);
|
|
|
|
|
|
|
|
|
|
if (index != UNMAPPED)
|
|
|
|
|
{
|
|
|
|
|
EntryPair *pair = entryTable->at(index);
|
|
|
|
|
|
|
|
|
|
result = pair->value;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Return the collation order of a single character.
//
// If the mapping table entry for ch points to a contracting character
// table, the order stored in the first entry of that table is returned;
// otherwise the mapping table value itself is returned.
int32_t RuleBasedCollator::getCharOrder(UChar ch) const
{
    int32_t mapped = ucmp32_get(data->mapping, ch);

    if (mapped < CONTRACTCHARINDEX)
    {
        return mapped;
    }

    // NOTE(review): neither the table nor its first entry is checked for
    // NULL here; presumably both always exist for such an index -- confirm.
    VectorOfPToContractElement *contractions = getContractValues(mapped - CONTRACTCHARINDEX);
    EntryPair *first = contractions->at(0);

    return first->value;
}
|
|
|
|
|
|
|
|
|
|
// Create a hash code for this collation. Just hash the main rule table --
|
|
|
|
|
// that should be good enough for almost any use.
|
|
|
|
|
int32_t
|
|
|
|
|
RuleBasedCollator::hashCode() const
|
|
|
|
|
{
|
|
|
|
|
int32_t value = 0;
|
|
|
|
|
int32_t c;
|
1999-12-08 02:11:04 +00:00
|
|
|
|
int32_t count = getRules().length();
|
1999-08-16 21:50:52 +00:00
|
|
|
|
UTextOffset pos = count - 1;
|
|
|
|
|
|
|
|
|
|
if (count > 64)
|
|
|
|
|
{
|
|
|
|
|
count = 64; // only hash upto limit
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int16_t i = 0;
|
|
|
|
|
|
|
|
|
|
while (i < count)
|
|
|
|
|
{
|
|
|
|
|
c = data->ruleTable[pos];
|
|
|
|
|
value = ((value << (c & 0x0f)) ^ (c << 8)) + (c ^ value);
|
|
|
|
|
i += 1;
|
|
|
|
|
pos -= 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (value == 0)
|
|
|
|
|
{
|
|
|
|
|
value = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return value;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// find the contracting char entry in the list
|
|
|
|
|
int32_t
|
|
|
|
|
RuleBasedCollator::getEntry(VectorOfPToContractElement* list,
|
|
|
|
|
const UnicodeString& name,
|
2000-05-18 22:08:39 +00:00
|
|
|
|
UBool fwd)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
int32_t i;
|
|
|
|
|
|
|
|
|
|
if (list != NULL)
|
|
|
|
|
{
|
|
|
|
|
for (i = 0; i < list->size(); i += 1)
|
|
|
|
|
{
|
|
|
|
|
EntryPair *pair = list->at(i);
|
|
|
|
|
|
|
|
|
|
if ((pair != NULL) && (pair->fwd == fwd) && (pair->entryName == name))
|
|
|
|
|
{
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return RuleBasedCollator::UNMAPPED;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// look for the contracting list entry with the beginning char
|
|
|
|
|
VectorOfPToContractElement*
|
|
|
|
|
RuleBasedCollator::getContractValues(UChar ch) const
|
|
|
|
|
{
|
|
|
|
|
int32_t index = ucmp32_get(data->mapping, ch);
|
|
|
|
|
return getContractValues(index - CONTRACTCHARINDEX);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// look for the contracting list entry with the index
|
|
|
|
|
VectorOfPToContractElement*
|
|
|
|
|
RuleBasedCollator::getContractValues(int32_t index) const
|
|
|
|
|
{
|
|
|
|
|
if (data->contractTable != NULL)
|
|
|
|
|
{
|
|
|
|
|
if (index >= 0)
|
|
|
|
|
{
|
|
|
|
|
return data->contractTable->at(index);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return the maximum length of any expansion sequences that end
|
|
|
|
|
* with the specified comparison order.
|
|
|
|
|
*
|
|
|
|
|
* @param order a collation order returned by previous or next.
|
|
|
|
|
* @return the maximum length of any expansion seuences ending
|
|
|
|
|
* with the specified order.
|
|
|
|
|
*
|
|
|
|
|
* @see CollationElementIterator#getMaxExpansion
|
|
|
|
|
*/
|
|
|
|
|
int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
|
|
|
|
|
{
|
|
|
|
|
int32_t result = 1;
|
|
|
|
|
|
|
|
|
|
if (data->expandTable != NULL)
|
|
|
|
|
{
|
|
|
|
|
// Right now this does a linear search through the entire
|
|
|
|
|
// expandsion table. If a collator had a large number of expansions,
|
|
|
|
|
// this could cause a performance problem, but in practice that
|
|
|
|
|
// rarely happens
|
|
|
|
|
int32_t i;
|
|
|
|
|
for (i = 0; i < data->expandTable->size(); i += 1)
|
|
|
|
|
{
|
|
|
|
|
VectorOfInt *valueList = data->expandTable->at(i);
|
|
|
|
|
int32_t length = valueList->size();
|
|
|
|
|
|
|
|
|
|
if (length > result && valueList->at(length-1) == order)
|
|
|
|
|
{
|
|
|
|
|
result = length;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Get the entry of hash table of the expanding string in the collation
|
|
|
|
|
* table.
|
2000-04-14 05:22:29 +00:00
|
|
|
|
* @param offset the index of the expanding string value list
|
1999-08-16 21:50:52 +00:00
|
|
|
|
*/
|
|
|
|
|
VectorOfInt *RuleBasedCollator::getExpandValueList(int32_t order) const
|
|
|
|
|
{
|
|
|
|
|
return data->expandTable->at(order - EXPANDCHARINDEX);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2000-11-09 21:58:19 +00:00
|
|
|
|
// Read a collator's table data from a memory stream.
//
// The stream is expected to hold: a 16-bit file ID, a one-byte flag telling
// whether the table data is absent, the table data itself (when present),
// and the 16-bit file ID again as an end marker.
//
// collator  the collator to read into
// is        the stream to read from; nothing is done if it is already in
//           the error state
// status    error code; nothing is done if it already indicates a failure.
//           A wrong ID, missing or bogus data is reported as
//           U_MISSING_RESOURCE_ERROR and also marks the stream as failed.
void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, UMemoryStream* is, UErrorCode& status)
{
    if (uprv_mstrm_error(is) || U_FAILURE(status))
    {
        return;
    }

    // Check that this is the correct file type.  The local starts out
    // zeroed so that it is never read uninitialized.
    int16_t id = 0;

    uprv_mstrm_read(is, &id, sizeof(id));
    if (id != collator->FILEID)
    {
        // This isn't the right type of file.  Mark the stream as failing
        // and report the error through status as well, as is done for the
        // end marker below.
        uprv_mstrm_setError(is);
        status = U_MISSING_RESOURCE_ERROR;
        return;
    }

    // Stream in large objects
    char isNull = 0;

    uprv_mstrm_read(is, &isNull, sizeof(isNull));
    if (isNull)
    {
        delete collator->data;
        collator->data = NULL;
        status = U_MISSING_RESOURCE_ERROR;
    }
    else
    {
        if (collator->data == NULL)
        {
            collator->data = new TableCollationData;

            // The data object is used right away; do not dereference a
            // failed allocation.
            if (collator->data == NULL)
            {
                uprv_mstrm_setError(is);
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
        }

        collator->data->streamIn(is, status);
        if (collator->data->isBogus()) {
            uprv_mstrm_setError(is); // force the stream to set its error flag
            status = U_MISSING_RESOURCE_ERROR;
            return;
        }
    }

    // Verify that the end marker is present
    uprv_mstrm_read(is, &id, sizeof(id));
    if (id != collator->FILEID)
    {
        // This isn't the right type of file.  Mark the stream
        // as failing and return.
        uprv_mstrm_setError(is); // force the stream to set its error flag
        status = U_MISSING_RESOURCE_ERROR;
        return;
    }

    // Reset other data members
    collator->isOverIgnore = FALSE;
    collator->lastChar = 0;
    delete collator->mPattern;
    collator->mPattern = 0;
    collator->key.remove();
    collator->dataIsOwned = TRUE;
}
|
|
|
|
|
|
|
|
|
|
void RuleBasedCollatorStreamer::streamOut(const RuleBasedCollator* collator, UMemoryStream* os)
|
|
|
|
|
{
|
|
|
|
|
if (!uprv_mstrm_error(os))
|
|
|
|
|
{
|
|
|
|
|
// We use a 16-bit ID code to identify this file.
|
|
|
|
|
int16_t id = collator->FILEID;
|
|
|
|
|
uprv_mstrm_write(os, (uint8_t *)&id, sizeof(id));
|
|
|
|
|
|
|
|
|
|
// Stream out the data
|
|
|
|
|
char isNull;
|
|
|
|
|
isNull = (collator->data == 0);
|
|
|
|
|
uprv_mstrm_write(os, (uint8_t*)&isNull, sizeof(isNull));
|
|
|
|
|
|
|
|
|
|
if (!isNull)
|
|
|
|
|
{
|
|
|
|
|
collator->data->streamOut(os);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Write out the ID to indicate the end
|
|
|
|
|
uprv_mstrm_write(os, (uint8_t *)&id, sizeof(id));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
/**
 * Read a collator image from a file stream into the given collator.
 *
 * The image is consumed in this order: a 16-bit file ID, a one-byte
 * "data is null" flag, the TableCollationData payload (only when the flag
 * is zero), and finally the 16-bit file ID again as an end marker.
 * Failures are reported solely by setting the stream's error flag.
 * Nothing is read when the stream is already in error.
 *
 * @param collator the collator to fill in
 * @param is       the file stream to read from
 */
void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, FileStream* is)
{
    if (T_FileStream_error(is))
    {
        return;
    }

    // The image must open with the expected file ID.
    int16_t marker;
    T_FileStream_read(is, &marker, sizeof(marker));
    if (marker != collator->FILEID)
    {
        // Wrong kind of file: flag the stream and give up.
        T_FileStream_setError(is); // force the stream to set its error flag
        return;
    }

    // Next comes the flag telling whether table data follows.
    char dataMissing;
    T_FileStream_read(is, &dataMissing, sizeof(dataMissing));
    if (!dataMissing)
    {
        if (collator->data == NULL)
        {
            collator->data = new TableCollationData;
        }

        collator->data->streamIn(is);
        if (collator->data->isBogus())
        {
            T_FileStream_setError(is); // force the stream to set its error flag
            return;
        }
    }
    else
    {
        // No table data in the image: discard what the collator held.
        delete collator->data;
        collator->data = NULL;
    }

    // The image must close with the same file ID.
    T_FileStream_read(is, &marker, sizeof(marker));
    if (marker != collator->FILEID)
    {
        // Wrong kind of file: flag the stream and give up.
        T_FileStream_setError(is); // force the stream to set its error flag
        return;
    }

    // Bring the remaining members back to their initial state.
    collator->isOverIgnore = FALSE;
    collator->lastChar = 0;
    delete collator->mPattern;
    collator->mPattern = 0;
    collator->key.remove();
    collator->dataIsOwned = TRUE;
}
|
|
|
|
|
|
|
|
|
|
void RuleBasedCollatorStreamer::streamOut(const RuleBasedCollator* collator, FileStream* os)
|
|
|
|
|
{
|
|
|
|
|
if (!T_FileStream_error(os))
|
|
|
|
|
{
|
|
|
|
|
// We use a 16-bit ID code to identify this file.
|
|
|
|
|
int16_t id = collator->FILEID;
|
|
|
|
|
T_FileStream_write(os, &id, sizeof(id));
|
|
|
|
|
|
|
|
|
|
// Stream out the data
|
|
|
|
|
char isNull;
|
|
|
|
|
isNull = (collator->data == 0);
|
|
|
|
|
T_FileStream_write(os, &isNull, sizeof(isNull));
|
|
|
|
|
|
|
|
|
|
if (!isNull)
|
|
|
|
|
{
|
|
|
|
|
collator->data->streamOut(os);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Write out the ID to indicate the end
|
|
|
|
|
T_FileStream_write(os, &id, sizeof(id));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
|
/**
 * Stream this collator out to a binary file.
 *
 * @param fileName path of the file to create; it is opened in "wb" mode
 * @return TRUE when the file was opened and the stream reports no error
 *         after writing, FALSE otherwise
 */
UBool RuleBasedCollator::writeToFile(const char* fileName) const
{
    FileStream* ofs = T_FileStream_open(fileName, "wb");
    if (ofs == 0)
    {
        // The file could not be opened. Bail out here instead of handing a
        // null stream to the size/error/close helpers below.
        return FALSE;
    }

    RuleBasedCollatorStreamer::streamOut(this, ofs);

#ifdef COLLDEBUG
    fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs),
        (!T_FileStream_error(ofs) ? ", OK" : ", FAIL"));
#endif

    // Capture the outcome before the stream is closed.
    UBool succeeded = T_FileStream_error(ofs) == 0;

    T_FileStream_close(ofs);
    return succeeded;
}
|
2000-05-22 19:49:10 +00:00
|
|
|
|
/*
|
|
|
|
|
UBool RuleBasedCollator::prepareForBundle() const
|
|
|
|
|
{
|
|
|
|
|
UMemoryStream* ofs = uprv_mstrm_openNew(0);
|
|
|
|
|
if (ofs != 0)
|
|
|
|
|
{
|
|
|
|
|
RuleBasedCollatorStreamer::streamOut(this, ofs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifdef COLLDEBUG
|
|
|
|
|
fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs),
|
|
|
|
|
(!T_FileStream_error(ofs) ? ", OK" : ", FAIL"));
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
UBool err = uprv_mstrm_error(ofs) == 0;
|
|
|
|
|
|
|
|
|
|
uprv_mstrm_close(ofs);
|
|
|
|
|
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
*/
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
void RuleBasedCollator::addToCache(const UnicodeString& key)
|
|
|
|
|
{
|
|
|
|
|
// This method doesn't add the RuleBasedCollator itself to the cache. Instead,
|
|
|
|
|
// it adds the given RuleBasedCollator's data object to the TableCollationData
|
|
|
|
|
// cache, and marks it as non-owned in the given RuleBasedCollator object.
|
|
|
|
|
TableCollationData::addToCache(key, data);
|
|
|
|
|
dataIsOwned = FALSE;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::constructFromCache(const UnicodeString& key,
|
|
|
|
|
UErrorCode& status)
|
|
|
|
|
{
|
|
|
|
|
// Attempt to construct this RuleBasedCollator object from cached TableCollationData.
|
|
|
|
|
// If no such data is in the cache, return false.
|
1999-10-18 22:48:32 +00:00
|
|
|
|
if (U_FAILURE(status)) return;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
if (dataIsOwned)
|
|
|
|
|
{
|
|
|
|
|
delete data;
|
|
|
|
|
data = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
isOverIgnore = FALSE;
|
|
|
|
|
lastChar = 0;
|
|
|
|
|
mPattern = 0;
|
|
|
|
|
setStrength(Collator::TERTIARY);
|
|
|
|
|
|
|
|
|
|
dataIsOwned = FALSE;
|
|
|
|
|
data = TableCollationData::findInCache(key);
|
|
|
|
|
if (data == NULL)
|
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
|
status = U_MISSING_RESOURCE_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
char*
|
|
|
|
|
RuleBasedCollator::createPathName( const UnicodeString& prefix,
|
|
|
|
|
const UnicodeString& name,
|
|
|
|
|
const UnicodeString& suffix)
|
|
|
|
|
{
|
|
|
|
|
// Concatenate three elements to form a file name, and return it.
|
|
|
|
|
|
|
|
|
|
UnicodeString workingName(prefix);
|
|
|
|
|
int32_t size;
|
|
|
|
|
char* returnVal;
|
|
|
|
|
|
|
|
|
|
workingName += name;
|
|
|
|
|
workingName += suffix;
|
|
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
|
size = workingName.length();
|
1999-08-16 21:50:52 +00:00
|
|
|
|
returnVal = new char[size + 1];
|
1999-12-02 23:26:44 +00:00
|
|
|
|
workingName.extract(0, size, returnVal, "");
|
1999-08-16 21:50:52 +00:00
|
|
|
|
returnVal[size] = 0;
|
|
|
|
|
|
|
|
|
|
return returnVal;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
RuleBasedCollator::chopLocale(UnicodeString& localeName)
|
|
|
|
|
{
|
|
|
|
|
// chopLocale removes the final element from a locale string.
|
|
|
|
|
// For instance, "de_CH" becomes "de", and "de" becomes "".
|
|
|
|
|
// "" remains "".
|
|
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
|
int32_t size = localeName.length();
|
1999-08-16 21:50:52 +00:00
|
|
|
|
int32_t i;
|
|
|
|
|
|
|
|
|
|
for (i = size - 1; i > 0; i--)
|
|
|
|
|
{
|
1999-09-30 23:47:52 +00:00
|
|
|
|
if (localeName[i] == 0x005F)
|
1999-08-16 21:50:52 +00:00
|
|
|
|
{
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (i < 0)
|
|
|
|
|
{
|
2000-05-18 21:25:51 +00:00
|
|
|
|
i = 0;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
localeName.remove(i, size - i);
|
|
|
|
|
}
|
|
|
|
|
|
2000-05-18 21:25:51 +00:00
|
|
|
|
|
|
|
|
|
/**
 * Serialize this collator into a freshly allocated byte buffer.
 *
 * The collator is streamed into a temporary memory stream and the bytes
 * written are copied into a buffer obtained from uprv_malloc.
 *
 * @param length receives the number of bytes in the returned buffer, or 0
 *               when an allocation fails; it is left untouched when status
 *               already holds a failure on entry
 * @param status set to U_MEMORY_ALLOCATION_ERROR when the memory stream or
 *               the result buffer cannot be allocated
 * @return the buffer holding the image, or NULL/0 on failure
 */
uint8_t *
RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &status)
{
    if (U_FAILURE(status))
    {
        return NULL;
    }

    UMemoryStream *memdata = uprv_mstrm_openNew(0);
    if (memdata == 0)
    {
        // Without a stream there is nothing to read fPos/fStart from;
        // report the failure instead of dereferencing a null pointer.
        status = U_MEMORY_ALLOCATION_ERROR;
        length = 0;
        return 0;
    }

    RuleBasedCollatorStreamer::streamOut(this, memdata);

    // NOTE(review): the stream's error flag is not consulted here (the
    // previous code computed it but never used it) -- confirm whether a
    // stream error should be surfaced through status.

    // Named 'image' so that it does not shadow the collator's 'data' member.
    uint8_t *image = (uint8_t *)uprv_malloc(memdata->fPos);
    if (image == 0)
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        length = 0;
    }
    else
    {
        uprv_memcpy(image, memdata->fStart, memdata->fPos);
        length = memdata->fPos;
    }

    uprv_mstrm_close(memdata);
    return image;
}
|
|
|
|
|
|
2000-11-20 06:40:54 +00:00
|
|
|
|
/**
 * Attribute setter; not supported by this implementation.
 *
 * @param attr   ignored
 * @param value  ignored
 * @param status always set to U_UNSUPPORTED_ERROR
 */
void RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value, UErrorCode &status)
{
    status = U_UNSUPPORTED_ERROR;
}
|
|
|
|
|
|
2000-11-20 06:40:54 +00:00
|
|
|
|
/**
 * Attribute getter; not supported by this implementation.
 *
 * @param attr   ignored
 * @param status always set to U_UNSUPPORTED_ERROR
 * @return always UCOL_ATTR_DEFAULT
 */
UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &status)
{
    const UColAttributeValue placeholder = UCOL_ATTR_DEFAULT;
    status = U_UNSUPPORTED_ERROR;
    return placeholder;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Placeholder implementation: no clone is produced.
 *
 * @return always a null pointer
 */
Collator* RuleBasedCollator::safeClone(void)
{
    return NULL;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Placeholder comparison of two character iterators: neither iterator is
 * read.
 *
 * @param source ignored
 * @param target ignored
 * @return always EQUAL
 */
Collator::EComparisonResult RuleBasedCollator::compare(ForwardCharacterIterator &source,
                                                       ForwardCharacterIterator &target)
{
    return Collator::EQUAL;
}
|
|
|
|
|
|
2000-11-20 06:40:54 +00:00
|
|
|
|
/**
 * Placeholder sort-key generation for a UnicodeString source: the result
 * buffer is not written.
 *
 * @param source       ignored
 * @param result       ignored
 * @param resultLength ignored
 * @return always 0
 */
int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
                                      uint8_t *result,
                                      int32_t resultLength) const
{
    const int32_t bytesWritten = 0;
    return bytesWritten;
}
|
|
|
|
|
|
2000-11-20 06:40:54 +00:00
|
|
|
|
/**
 * Placeholder sort-key generation for a UChar buffer source: the result
 * buffer is not written.
 *
 * @param source       ignored
 * @param sourceLength ignored
 * @param result       ignored
 * @param resultLength ignored
 * @return always 0
 */
int32_t RuleBasedCollator::getSortKey(const UChar *source,
                                      int32_t sourceLength,
                                      uint8_t *result,
                                      int32_t resultLength) const
{
    const int32_t bytesWritten = 0;
    return bytesWritten;
}
|
2000-05-18 21:25:51 +00:00
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
//eof
|