ICU-96 performance improvements
X-SVN-Rev: 247
This commit is contained in:
parent
c372fee921
commit
c07aed7913
@ -35,6 +35,7 @@
|
||||
// 6/20/97 helena Java class name change.
|
||||
// 04/23/99 stephen Removed EDecompositionMode, merged with
|
||||
// Normalizer::EMode
|
||||
// 11/23/99 srl Inlining of some critical functions
|
||||
//=============================================================================
|
||||
|
||||
#include "colcache.h"
|
||||
@ -140,11 +141,6 @@ Collator::greater(const UnicodeString& source,
|
||||
return (compare(source, target) == Collator::GREATER);
|
||||
}
|
||||
|
||||
// Returns the collation strength currently stored in this collator's
// `strength` member.
Collator::ECollationStrength
|
||||
Collator::getStrength() const
|
||||
{
|
||||
return strength;
|
||||
}
|
||||
|
||||
void
|
||||
Collator::setStrength(Collator::ECollationStrength newStrength)
|
||||
@ -152,11 +148,6 @@ Collator::setStrength(Collator::ECollationStrength newStrength)
|
||||
strength = newStrength;
|
||||
}
|
||||
|
||||
// Returns the decomposition (normalization) mode currently stored in this
// collator's `decmp` member.
Normalizer::EMode
|
||||
Collator::getDecomposition() const
|
||||
{
|
||||
return decmp;
|
||||
}
|
||||
void
|
||||
Collator::setDecomposition(Normalizer::EMode decompositionMode)
|
||||
{
|
||||
|
@ -34,6 +34,10 @@
|
||||
// 02/10/98 damiba Added compare() with length as parameter.
|
||||
// 04/23/99 stephen Removed EDecompositionMode, merged with
|
||||
// Normalizer::EMode.
|
||||
// 11/02/99 helena Collator performance enhancements. Eliminates the
|
||||
// UnicodeString construction and special case for NO_OP.
|
||||
// 11/23/99 srl More performance enhancements. Inlining of
|
||||
// critical accessors.
|
||||
//=============================================================================
|
||||
|
||||
#ifndef COLL_H
|
||||
@ -299,6 +303,38 @@ public:
|
||||
int32_t length) const = 0;
|
||||
|
||||
|
||||
/**
|
||||
* The comparison function compares the character data stored in two
|
||||
* different string arrays. Returns information about whether a string
|
||||
* array is less than, greater than or equal to another string array.
|
||||
* <p>Example of use:
|
||||
* <pre>
|
||||
* . UErrorCode status = U_ZERO_ERROR;
|
||||
* . Collator *myCollation = Collator::createInstance(Locale::US, status);
|
||||
* . if (U_FAILURE(status)) return;
|
||||
* . myCollation->setStrength(Collator::PRIMARY);
|
||||
* . // result would be Collator::EQUAL ("abc" == "ABC")
|
||||
* . // (no primary difference between "abc" and "ABC")
|
||||
* . Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
|
||||
* . myCollation->setStrength(Collator::TERTIARY);
|
||||
* . // result would be Collator::LESS ("abc" <<< "ABC")
|
||||
* . // (with tertiary difference between "abc" and "ABC")
|
||||
* . Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
|
||||
* </pre>
|
||||
* @param source the source string array to be compared with.
|
||||
* @param sourceLength the length of the source string array. If this value
|
||||
* is equal to -1, the string array is null-terminated.
|
||||
* @param target the string that is to be compared with the source string.
|
||||
* @param targetLength the length of the target string array. If this value
|
||||
* is equal to -1, the string array is null-terminated.
|
||||
* @return Returns a byte value. GREATER if source is greater
|
||||
* than target; EQUAL if source is equal to target; LESS if source is less
|
||||
* than target
|
||||
**/
|
||||
virtual EComparisonResult compare( const UChar* source,
|
||||
int32_t sourceLength,
|
||||
const UChar* target,
|
||||
int32_t targetLength) const = 0;
|
||||
|
||||
/** Transforms the string into a series of characters that can be compared
|
||||
* with CollationKey::compareTo. It is not possible to restore the original
|
||||
@ -339,6 +375,24 @@ public:
|
||||
virtual CollationKey& getCollationKey(const UnicodeString& source,
|
||||
CollationKey& key,
|
||||
UErrorCode& status) const = 0;
|
||||
|
||||
/** Transforms the string into a series of characters that can be compared
|
||||
* with CollationKey::compareTo. It is not possible to restore the original
|
||||
* string from the chars in the sort key. The generated sort key handles
|
||||
* only a limited number of ignorable characters.
|
||||
* <p>Use CollationKey::equals or CollationKey::compare to compare the
|
||||
* generated sort keys.
|
||||
* <p>If the source string is null, a null collation key will be returned.
|
||||
* @param source the source string to be transformed into a sort key.
|
||||
* @param sourceLength the length of the source string
|
||||
* @param key the collation key to be filled in
|
||||
* @return the collation key of the string based on the collation rules.
|
||||
* @see CollationKey#compare
|
||||
*/
|
||||
virtual CollationKey& getCollationKey(const UChar *source,
|
||||
int32_t sourceLength,
|
||||
CollationKey& key,
|
||||
UErrorCode& status) const = 0;
|
||||
/**
|
||||
* Generates the hash code for the collation object
|
||||
*/
|
||||
@ -503,4 +557,17 @@ Collator::operator!=(const Collator& other) const
|
||||
return result;
|
||||
}
|
||||
|
||||
// Inline accessor: returns the collation strength stored in the `strength`
// member. Defined in the header so calls on the comparison path can be inlined.
inline Collator::ECollationStrength
|
||||
Collator::getStrength() const
|
||||
{
|
||||
return strength;
|
||||
}
|
||||
|
||||
// Inline accessor: returns the decomposition (normalization) mode stored in
// the `decmp` member.
inline Normalizer::EMode
|
||||
Collator::getDecomposition() const
|
||||
{
|
||||
return decmp;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -43,7 +43,10 @@
|
||||
* Normalizer::EMode
|
||||
* 06/14/99 stephen Removed kResourceBundleSuffix
|
||||
* 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
|
||||
* files are no longer used.
|
||||
* files are no longer used.
|
||||
* 11/02/99 helena Collator performance enhancements. Special case
|
||||
* for NO_OP situations.
|
||||
* 11/17/99 srl More performance enhancements. Inlined some internal functions.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
@ -69,6 +72,8 @@
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <ustring.h>
|
||||
|
||||
|
||||
class RuleBasedCollatorStreamer
|
||||
{
|
||||
@ -129,14 +134,124 @@ const int16_t RuleBasedCollator::FILEID = 0x5443; // unique f
|
||||
const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension
|
||||
char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id
|
||||
|
||||
//================ Some inline definitions of implementation functions........ ========
|
||||
|
||||
// Looks up the collation order of a single UTF-16 code unit `ch` in the
// collator's mapping table (data->mapping) and returns it unmodified.
// Callers in this file compare the result against UNMAPPED,
// CONTRACTCHARINDEX and EXPANDCHARINDEX to decide how to interpret it.
|
||||
inline int32_t
|
||||
RuleBasedCollator::getUnicodeOrder(UChar ch) const
|
||||
{
|
||||
return ucmp32_get(data->mapping, ch);
|
||||
}
|
||||
|
||||
// Reduces a raw collation order to the part that is significant at this
// collator's strength:
//   PRIMARY   -> only the bits selected by PRIMARYDIFFERENCEONLY
//   SECONDARY -> only the bits selected by SECONDARYDIFFERENCEONLY
//   otherwise -> the order is returned unchanged
inline int32_t
RuleBasedCollator::strengthOrder(int32_t value) const
{
    switch (getStrength())
    {
    case PRIMARY:
        return (value & PRIMARYDIFFERENCEONLY);

    case SECONDARY:
        return (value & SECONDARYDIFFERENCEONLY);

    default:
        return value;
    }
}
|
||||
|
||||
|
||||
// Returns the next collation order for the text behind `cursor`, already
// masked to this collator's strength (see strengthOrder()).
//
// @param cursor  iteration state; its bufferAlias / expIndex / swapOrder
//                fields are updated as expansions and unmapped characters
//                are consumed.
// @param status  incoming error code. NOTE: it is taken BY VALUE, so a
//                failure recorded by nextContractChar() is not visible to
//                the caller; only the incoming value is honoured.
// @return the next order, CollationElementIterator::NULLORDER when `status`
//         is already a failure or the text is exhausted, or
//         CollationElementIterator::UNMAPPEDCHARVALUE followed (on the next
//         call) by the character shifted left 16 bits for unmapped characters.
inline int32_t
RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor,
                                    UErrorCode status) const
{
    if (U_FAILURE(status))
    {
        return CollationElementIterator::NULLORDER;
    }

    if (cursor->bufferAlias != NULL)
    {
        // bufferAlias needs a bit of an explanation.
        // When we hit an expanding character in the text, we retrieve the
        // list of orderings for all of the characters in the expansion (see
        // the end of this method). The first ordering is returned, and an
        // alias to the list is saved so that the remaining orderings can be
        // returned on subsequent calls. So, if the expansion buffer is not
        // exhausted, all we have to do here is return its next ordering.
        if (cursor->expIndex < cursor->bufferAlias->size())
        {
            return strengthOrder(cursor->bufferAlias->at(cursor->expIndex++));
        }
        else
        {
            // Expansion fully consumed: fall through and read the next
            // character from the text.
            cursor->bufferAlias = NULL;
            cursor->expIndex = 0;
        }
    }
    else if (cursor->swapOrder != 0)
    {
        // If we find a character with no order, we return the marking
        // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character
        // itself shifted left 16 bits as orders. At this point, the
        // UNMAPPEDCHARVALUE flag has already been returned by the code
        // below, so just return the shifted character here.
        int32_t order = cursor->swapOrder << 16;

        cursor->swapOrder = 0;

        return order;
    }

    UChar ch = cursor->current();
    cursor->next();

    if (ch == Normalizer::DONE) {
        return CollationElementIterator::NULLORDER;
    }

    // Ask the collator for this character's ordering.
    int32_t value = getUnicodeOrder(ch);

    if (value == UNMAPPED)
    {
        // \u0000 is not valid in C++'s UnicodeString, and swapOrder uses 0
        // to mean "nothing saved", so an unmapped NUL is returned directly.
        if (ch == 0x0000) return ch;

        // Return the "unmapped" flag now and save the character so it can be
        // returned the next time this method is called.
        cursor->swapOrder = ch;
        return CollationElementIterator::UNMAPPEDCHARVALUE;
    }

    if (value >= CONTRACTCHARINDEX)
    {
        value = nextContractChar(cursor, ch, status);
    }

    if (value >= EXPANDCHARINDEX)
    {
        // Start of an expansion: remember the list and hand out its first
        // ordering; the rest are served by the bufferAlias branch above.
        cursor->bufferAlias = getExpandValueList(value);
        cursor->expIndex = 0;
        value = cursor->bufferAlias->at(cursor->expIndex++);
    }

    // Compute the strength-masked order exactly once (the previous code
    // computed it twice and discarded the first result).
    return strengthOrder(value);
}
|
||||
|
||||
// ==================== End inlines ============================================
|
||||
|
||||
|
||||
//===============================================================================
|
||||
|
||||
RuleBasedCollator::RuleBasedCollator()
|
||||
: Collator(),
|
||||
isOverIgnore(FALSE),
|
||||
mPattern(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
//targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
data(0),
|
||||
dataIsOwned(FALSE)
|
||||
{
|
||||
@ -146,8 +261,10 @@ RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
|
||||
: Collator(that),
|
||||
isOverIgnore(that.isOverIgnore),
|
||||
mPattern(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
//targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
dataIsOwned(FALSE),
|
||||
data(that.data) // Alias the data pointer
|
||||
{
|
||||
@ -214,8 +331,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
|
||||
: Collator(),
|
||||
isOverIgnore(FALSE),
|
||||
mPattern(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
/// targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
data(0),
|
||||
dataIsOwned(FALSE)
|
||||
{
|
||||
@ -233,8 +352,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
|
||||
: Collator(collationStrength, Normalizer::NO_OP),
|
||||
isOverIgnore(FALSE),
|
||||
mPattern(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
// targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
data(0),
|
||||
dataIsOwned(FALSE)
|
||||
{
|
||||
@ -242,7 +363,6 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
constructFromRules(rules, status);
|
||||
}
|
||||
|
||||
@ -252,8 +372,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
|
||||
: Collator(TERTIARY, decompositionMode),
|
||||
isOverIgnore(FALSE),
|
||||
mPattern(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
// targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
data(0),
|
||||
dataIsOwned(FALSE)
|
||||
{
|
||||
@ -272,8 +394,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
|
||||
: Collator(collationStrength, decompositionMode),
|
||||
isOverIgnore(FALSE),
|
||||
mPattern(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
//targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
data(0),
|
||||
dataIsOwned(FALSE)
|
||||
{
|
||||
@ -392,10 +516,14 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale,
|
||||
isOverIgnore(FALSE),
|
||||
dataIsOwned(FALSE),
|
||||
data(0),
|
||||
sourceCursor(0),
|
||||
targetCursor(0),
|
||||
// sourceCursor(0),
|
||||
//targetCursor(0),
|
||||
cursor1(0),
|
||||
cursor2(0),
|
||||
mPattern(0)
|
||||
{
|
||||
|
||||
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
return;
|
||||
@ -447,6 +575,18 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale,
|
||||
return;
|
||||
}
|
||||
|
||||
// srl write out default.col
|
||||
{
|
||||
UnicodeString defLocaleName = ResourceBundle::kDefaultFilename;
|
||||
char *binaryFilePath = createPathName(Locale::getDataDirectory(),
|
||||
defLocaleName, kFilenameSuffix);
|
||||
bool_t ok = writeToFile(binaryFilePath);
|
||||
delete [] binaryFilePath;
|
||||
#ifdef COLLDEBUG
|
||||
cerr << defLocaleName << " [default] binary write " << (ok? "OK" : "Failed") << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
data->desiredLocale = desiredLocale;
|
||||
desiredLocale.getName(localeName);
|
||||
data->realLocaleName = localeName;
|
||||
@ -567,7 +707,7 @@ RuleBasedCollator::constructFromFile( const Locale& locale,
|
||||
// Try to load up the collation from a binary file first
|
||||
constructFromFile(binaryFilePath, status);
|
||||
#ifdef COLLDEBUG
|
||||
cerr << localeFileName << " binary load " << errorName(status) << endl;
|
||||
cerr << localeFileName << kFilenameSuffix << " binary load " << errorName(status) << endl;
|
||||
#endif
|
||||
if(U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR)
|
||||
return;
|
||||
@ -629,7 +769,7 @@ RuleBasedCollator::constructFromFile( const Locale& locale,
|
||||
}
|
||||
|
||||
#ifdef COLLDEBUG
|
||||
cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << endl;
|
||||
cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << " - try= " << (tryBinaryFile?"true":"false") << endl;
|
||||
#endif
|
||||
|
||||
if(U_SUCCESS(status) && tryBinaryFile) {
|
||||
@ -655,11 +795,20 @@ RuleBasedCollator::~RuleBasedCollator()
|
||||
|
||||
data = 0;
|
||||
|
||||
delete sourceCursor;
|
||||
sourceCursor = 0;
|
||||
// delete sourceCursor;
|
||||
// sourceCursor = 0;
|
||||
|
||||
delete targetCursor;
|
||||
targetCursor = 0;
|
||||
// delete targetCursor;
|
||||
// targetCursor = 0;
|
||||
|
||||
if (cursor1 != NULL) {
|
||||
delete cursor1;
|
||||
cursor1 = 0;
|
||||
}
|
||||
if (cursor2 != NULL) {
|
||||
delete cursor2;
|
||||
cursor2 = 0;
|
||||
}
|
||||
|
||||
delete mPattern;
|
||||
mPattern = 0;
|
||||
@ -742,13 +891,13 @@ RuleBasedCollator::getRules() const
|
||||
data->isRuleTableLoaded = TRUE;
|
||||
#ifdef _DEBUG
|
||||
// the following is useful for specific debugging purposes
|
||||
// UnicodeString name;
|
||||
// cerr << "Table collation rules loaded dynamically for "
|
||||
// << data->desiredLocale.getName(name)
|
||||
// << " at "
|
||||
// << data->realLocaleName
|
||||
// << ", " << dec << data->ruleTable.size() << " characters"
|
||||
// << endl;
|
||||
UnicodeString name;
|
||||
cerr << "Table collation rules loaded dynamically for "
|
||||
<< data->desiredLocale.getName(name)
|
||||
<< " at "
|
||||
<< data->realLocaleName
|
||||
<< ", " << dec << data->ruleTable.size() << " characters"
|
||||
<< endl;
|
||||
#endif
|
||||
}
|
||||
else
|
||||
@ -762,6 +911,16 @@ RuleBasedCollator::getRules() const
|
||||
<< endl;
|
||||
cerr << "Status " << errorName(status) << ", mPattern " << temp.mPattern << endl;
|
||||
#endif
|
||||
/* SRL have to add this because we now have the situation where
|
||||
DEFAULT is loaded from a binary file w/ no rules. */
|
||||
UErrorCode intStatus = U_ZERO_ERROR;
|
||||
temp.constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus);
|
||||
|
||||
if(U_SUCCESS(intStatus) && (temp.mPattern != 0))
|
||||
{
|
||||
data->ruleTable = temp.getRules();
|
||||
data->isRuleTableLoaded = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -783,14 +942,15 @@ RuleBasedCollator::compare( const UnicodeString& source,
|
||||
return (RuleBasedCollator::compare(source_togo, target_togo));
|
||||
}
|
||||
|
||||
|
||||
// Compare two strings using this collator
|
||||
Collator::EComparisonResult
|
||||
RuleBasedCollator::compare(const UnicodeString& source,
|
||||
const UnicodeString& target) const
|
||||
Collator::EComparisonResult
|
||||
RuleBasedCollator::compare(const UChar* source,
|
||||
int32_t sourceLength,
|
||||
const UChar* target,
|
||||
int32_t targetLength) const
|
||||
{
|
||||
// check if source and target are valid strings
|
||||
if (source.isBogus() || target.isBogus())
|
||||
if (((source == 0) && (target == 0)) ||
|
||||
((sourceLength == 0) && (targetLength == 0)))
|
||||
{
|
||||
return Collator::EQUAL;
|
||||
}
|
||||
@ -798,55 +958,36 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
Collator::EComparisonResult result = Collator::EQUAL;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// The basic algorithm here is that we use CollationElementIterators
|
||||
// to step through both the source and target strings. We compare each
|
||||
// collation element in the source string against the corresponding one
|
||||
// in the target, checking for differences.
|
||||
//
|
||||
// If a difference is found, we set <result> to LESS or GREATER to
|
||||
// indicate whether the source string is less or greater than the target.
|
||||
//
|
||||
// However, it's not that simple. If we find a tertiary difference
|
||||
// (e.g. 'A' vs. 'a') near the beginning of a string, it can be
|
||||
// overridden by a primary difference (e.g. "A" vs. "B") later in
|
||||
// the string. For example, "AA" < "aB", even though 'A' > 'a'.
|
||||
//
|
||||
// To keep track of this, we use checkSecTer and checkTertiary to keep
|
||||
// track of the strength of the most significant difference that has been
|
||||
// found so far. When we find a difference whose strength is greater than
|
||||
// the previous ones, it overrides the last difference (if any) that
|
||||
// was found.
|
||||
//
|
||||
|
||||
if (sourceCursor == NULL)
|
||||
if (cursor1 == NULL)
|
||||
{
|
||||
((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source);
|
||||
((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLength, getDecomposition());
|
||||
}
|
||||
else
|
||||
{
|
||||
sourceCursor->setText(source, status);
|
||||
cursor1->setModeAndText(getDecomposition(), source, sourceLength, status);
|
||||
}
|
||||
|
||||
if (sourceCursor == NULL || U_FAILURE(status))
|
||||
if ( /*cursor1->cursor == NULL ||*/ U_FAILURE(status))
|
||||
{
|
||||
return Collator::EQUAL;
|
||||
}
|
||||
|
||||
if (targetCursor == NULL)
|
||||
if (cursor2 == NULL)
|
||||
{
|
||||
((RuleBasedCollator *)this)->targetCursor = createCollationElementIterator(target);
|
||||
((RuleBasedCollator *)this)->cursor2 = new NormalizerIterator(target, targetLength, getDecomposition());
|
||||
}
|
||||
else
|
||||
{
|
||||
targetCursor->setText(target, status);
|
||||
cursor2->setModeAndText(getDecomposition(), target, targetLength, status);
|
||||
}
|
||||
|
||||
if (targetCursor == NULL || U_FAILURE(status))
|
||||
if (/*cursor2 == NULL ||*/ U_FAILURE(status))
|
||||
{
|
||||
return Collator::EQUAL;
|
||||
}
|
||||
|
||||
int32_t sOrder, tOrder;
|
||||
// int32_t sOrder = CollationElementIterator::NULLORDER, tOrder = CollationElementIterator::NULLORDER;
|
||||
bool_t gets = TRUE, gett = TRUE;
|
||||
bool_t initialCheckSecTer = getStrength() >= Collator::SECONDARY;
|
||||
bool_t checkSecTer = initialCheckSecTer;
|
||||
@ -860,7 +1001,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
// we've been requested to skip it.
|
||||
if (gets)
|
||||
{
|
||||
sOrder = sourceCursor->next(status);
|
||||
sOrder = getStrengthOrder((NormalizerIterator*)cursor1, status);
|
||||
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
@ -872,7 +1013,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
|
||||
if (gett)
|
||||
{
|
||||
tOrder = targetCursor->next(status);
|
||||
tOrder = getStrengthOrder((NormalizerIterator*)cursor2, status);
|
||||
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
@ -1036,7 +1177,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
}
|
||||
}
|
||||
}
|
||||
while ((sOrder = sourceCursor->next(status)) != CollationElementIterator::NULLORDER);
|
||||
while ((sOrder = getStrengthOrder(cursor1, status)) != CollationElementIterator::NULLORDER);
|
||||
}
|
||||
else if (tOrder != CollationElementIterator::NULLORDER)
|
||||
{
|
||||
@ -1060,7 +1201,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
}
|
||||
}
|
||||
}
|
||||
while ((tOrder = targetCursor->next(status)) != CollationElementIterator::NULLORDER);
|
||||
while ((tOrder = getStrengthOrder(cursor2, status)) != CollationElementIterator::NULLORDER);
|
||||
}
|
||||
|
||||
|
||||
@ -1070,15 +1211,46 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
// puts the result of the string comparison directly into result
|
||||
if (result == Collator::EQUAL && getStrength() == IDENTICAL)
|
||||
{
|
||||
UnicodeString sourceDecomp, targetDecomp;
|
||||
#if 0
|
||||
// ******** for the UChar normalization interface.
|
||||
// It doesn't work much faster, and the code was broken
|
||||
// so it's commented out. --srl
|
||||
// UChar sourceDecomp[1024], targetDecomp[1024];
|
||||
// int32_t sourceDecompLength = 1024;
|
||||
// int32_t targetDecompLength = 1024;
|
||||
|
||||
// int8_t comparison;
|
||||
// Normalizer::EMode decompMode = getDecomposition();
|
||||
|
||||
// if (decompMode != Normalizer::NO_OP)
|
||||
// {
|
||||
// Normalizer::normalize(source, sourceLength, decompMode,
|
||||
// 0, sourceDecomp, sourceDecompLength, status);
|
||||
|
||||
// Normalizer::normalize(target, targetLength, decompMode,
|
||||
// 0, targetDecomp, targetDecompLength, status);
|
||||
|
||||
// comparison = u_strcmp(sourceDecomp,targetDecomp);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// comparison = u_strcmp(source, target); /* ! */
|
||||
// }
|
||||
|
||||
#else
|
||||
|
||||
UnicodeString sourceDecomp, targetDecomp;
|
||||
|
||||
int8_t comparison;
|
||||
|
||||
Normalizer::normalize(source, getDecomposition(),
|
||||
0, sourceDecomp, status);
|
||||
0, sourceDecomp, status);
|
||||
|
||||
Normalizer::normalize(target, getDecomposition(),
|
||||
0, targetDecomp, status);
|
||||
0, targetDecomp, status);
|
||||
|
||||
comparison = sourceDecomp.compare(targetDecomp);
|
||||
#endif
|
||||
|
||||
if (comparison < 0)
|
||||
{
|
||||
@ -1097,6 +1269,49 @@ RuleBasedCollator::compare(const UnicodeString& source,
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Resolves a contracting character sequence starting at `ch`.
//
// Starts from the ordering of `ch` alone (entry 0 of its contraction list),
// then keeps appending the cursor's following characters to the scratch
// `key` string for as long as the grown string is still found in the list,
// consuming each matched character from the cursor.
//
// @param cursor  text iterator positioned just after `ch`
// @param ch      first character of the candidate contraction
// @param status  not read or written by this implementation
// @return the ordering of the longest matching sequence
int32_t
RuleBasedCollator::nextContractChar(NormalizerIterator *cursor,
                                    UChar ch,
                                    UErrorCode& status) const
{
    // Ordering of the single character on its own.
    VectorOfPToContractElement *list = getContractValues(ch);
    int32_t order = ((EntryPair *)list->at(0))->value;

    // `key` is scratch storage that this const method mutates; strip the
    // constness once instead of at every use.
    UnicodeString& lookup = (UnicodeString&)key;
    lookup.remove();
    lookup += ch;

    // Extend the match one character at a time, looking for the longest one.
    for (;;)
    {
        ch = cursor->current();
        if (ch == Normalizer::DONE)
        {
            break;
        }

        lookup += ch;

        int32_t n = getEntry(list, key, TRUE);
        if (n == UNMAPPED)
        {
            // No longer a known contraction: leave the character unconsumed.
            break;
        }

        cursor->next();
        order = ((EntryPair *)list->at(n))->value;
    }

    return order;
}
|
||||
|
||||
// Compare two strings using this collator
|
||||
// UnicodeString convenience overload: forwards the strings' character
// buffers and lengths to the UChar* based compare().
Collator::EComparisonResult
RuleBasedCollator::compare(const UnicodeString& source,
                           const UnicodeString& target) const
{
    const UChar* sourceChars = source.getUChars();
    const UChar* targetChars = target.getUChars();

    return compare(sourceChars, source.length(),
                   targetChars, target.length());
}
|
||||
|
||||
// Retrieve a collation key for the specified string
|
||||
// The key can be compared with other collation keys using a bitwise comparison
|
||||
// (e.g. memcmp) to find the ordering of their respective source strings.
|
||||
@ -1134,6 +1349,15 @@ CollationKey&
|
||||
RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
CollationKey& sortkey,
|
||||
UErrorCode& status) const
|
||||
{
|
||||
return RuleBasedCollator::getCollationKey(source.getUChars(), source.size(), sortkey, status);
|
||||
}
|
||||
|
||||
CollationKey&
|
||||
RuleBasedCollator::getCollationKey( const UChar* source,
|
||||
int32_t sourceLen,
|
||||
CollationKey& sortkey,
|
||||
UErrorCode& status) const
|
||||
{
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
@ -1141,27 +1365,21 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
return sortkey.setToBogus();
|
||||
}
|
||||
|
||||
if (source.isBogus())
|
||||
{
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return sortkey.setToBogus();
|
||||
}
|
||||
|
||||
if (source.size() == 0)
|
||||
if ((!source) || (sourceLen == 0))
|
||||
{
|
||||
return sortkey.reset();
|
||||
}
|
||||
|
||||
if (sourceCursor == NULL)
|
||||
if (cursor1 == NULL)
|
||||
{
|
||||
((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source);
|
||||
((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLen, getDecomposition());
|
||||
}
|
||||
else
|
||||
{
|
||||
sourceCursor->setText(source, status);
|
||||
cursor1->setModeAndText(getDecomposition(), source,sourceLen, status);
|
||||
}
|
||||
|
||||
if (sourceCursor == NULL || U_FAILURE(status))
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
return sortkey.setToBogus();
|
||||
}
|
||||
@ -1177,7 +1395,8 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
UnicodeString decomp;
|
||||
|
||||
// iterate over the source, counting primary, secondary, and tertiary entries
|
||||
while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER)
|
||||
while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) !=
|
||||
CollationElementIterator::NULLORDER)
|
||||
{
|
||||
int32_t secOrder = CollationElementIterator::secondaryOrder(order);
|
||||
int32_t terOrder = CollationElementIterator::tertiaryOrder(order);
|
||||
@ -1230,7 +1449,7 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
|
||||
if (compareIdent)
|
||||
{
|
||||
Normalizer::normalize(source, getDecomposition(),
|
||||
Normalizer::normalize(source, getDecomposition(), // SRL: ??
|
||||
0, decomp, status);
|
||||
|
||||
if (U_SUCCESS(status))
|
||||
@ -1259,10 +1478,10 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
int32_t identCursor = terCursor + (2 * totalTer);
|
||||
|
||||
// reset source to the beginning
|
||||
sourceCursor->reset();
|
||||
cursor1->reset();
|
||||
|
||||
// now iterate over the source computing the actual entries
|
||||
while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER)
|
||||
while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != CollationElementIterator::NULLORDER)
|
||||
{
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
@ -1336,6 +1555,14 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
|
||||
sortkey.storeUnicodeString(identCursor, decomp);
|
||||
}
|
||||
|
||||
// Debugging - print out the sortkey [--srl]
|
||||
// {
|
||||
// const uint8_t *bytes;
|
||||
// int32_t xcount;
|
||||
// bytes = sortkey.getByteArray(xcount);
|
||||
// // fprintf(stderr, "\n\n- [%02X] [%02X]\n\n", (int)(bytes[0]&0xFF), (int)(bytes[1]&0xFF) );
|
||||
// }
|
||||
|
||||
return sortkey;
|
||||
}
|
||||
|
||||
@ -1615,6 +1842,8 @@ RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t las
|
||||
data->maxTerOrder += 1;
|
||||
}
|
||||
break;
|
||||
|
||||
// case IDENTICAL?
|
||||
}
|
||||
|
||||
return lastValue;
|
||||
@ -2017,12 +2246,6 @@ VectorOfInt *RuleBasedCollator::getExpandValueList(int32_t order) const
|
||||
return data->expandTable->at(order - EXPANDCHARINDEX);
|
||||
}
|
||||
|
||||
// Looks up the collation order of a single UTF-16 code unit `ch` in the
// collator's mapping table (data->mapping) and returns it unmodified.
|
||||
int32_t
|
||||
RuleBasedCollator::getUnicodeOrder(UChar ch) const
|
||||
{
|
||||
return ucmp32_get(data->mapping, ch);
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, FileStream* is)
|
||||
@ -2117,7 +2340,7 @@ bool_t RuleBasedCollator::writeToFile(const char* fileName) const
|
||||
|
||||
#ifdef COLLDEBUG
|
||||
fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs),
|
||||
(!T_FileStream_error(ofs) ? ", OK" : ", FAIL");
|
||||
(!T_FileStream_error(ofs) ? ", OK" : ", FAIL"));
|
||||
#endif
|
||||
|
||||
bool_t err = T_FileStream_error(ofs) == 0;
|
||||
|
@ -37,7 +37,10 @@
|
||||
* 04/23/99 stephen Removed EDecompositionMode, merged with
|
||||
* Normalizer::EMode
|
||||
* 06/14/99 stephen Removed kResourceBundleSuffix
|
||||
*
|
||||
* 11/02/99 helena Collator performance enhancements. Eliminates the
|
||||
* UnicodeString construction and special case for NO_OP.
|
||||
* 11/23/99 srl More performance enhancements. Updates to NormalizerIterator
|
||||
* internal state management.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
@ -420,6 +423,39 @@ public:
|
||||
const UnicodeString& target,
|
||||
int32_t length) const;
|
||||
|
||||
/**
|
||||
* The comparison function compares the character data stored in two
|
||||
* different string arrays. Returns information about whether a string
|
||||
* array is less than, greater than or equal to another string array.
|
||||
* <p>Example of use:
|
||||
* <pre>
|
||||
* . UErrorCode status = U_ZERO_ERROR;
|
||||
* . Collator *myCollation = Collator::createInstance(Locale::US, status);
|
||||
* . if (U_FAILURE(status)) return;
|
||||
* . myCollation->setStrength(Collator::PRIMARY);
|
||||
* . // result would be Collator::EQUAL ("abc" == "ABC")
|
||||
* . // (no primary difference between "abc" and "ABC")
|
||||
* . Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
|
||||
* . myCollation->setStrength(Collator::TERTIARY);
|
||||
* . // result would be Collator::LESS ("abc" <<< "ABC")
|
||||
* . // (with tertiary difference between "abc" and "ABC")
|
||||
* . Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
|
||||
* </pre>
|
||||
* @param source the source string array to be compared with.
|
||||
* @param sourceLength the length of the source string array. If this value
|
||||
* is equal to -1, the string array is null-terminated.
|
||||
* @param target the string that is to be compared with the source string.
|
||||
* @param targetLength the length of the target string array. If this value
|
||||
* is equal to -1, the string array is null-terminated.
|
||||
* @return Returns a byte value. GREATER if source is greater
|
||||
* than target; EQUAL if source is equal to target; LESS if source is less
|
||||
* than target
|
||||
**/
|
||||
virtual EComparisonResult compare( const UChar* source,
|
||||
int32_t sourceLength,
|
||||
const UChar* target,
|
||||
int32_t targetLength) const ;
|
||||
|
||||
/** Transforms a specified region of the string into a series of characters
|
||||
* that can be compared with CollationKey.compare. Use a CollationKey when
|
||||
* you need to do repeated comparisons on the same string. For a single comparison
|
||||
@ -433,6 +469,13 @@ public:
|
||||
virtual CollationKey& getCollationKey( const UnicodeString& source,
|
||||
CollationKey& key,
|
||||
UErrorCode& status) const;
|
||||
|
||||
virtual CollationKey& getCollationKey(const UChar *source,
|
||||
int32_t sourceLength,
|
||||
CollationKey& key,
|
||||
UErrorCode& status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Generates the hash code for the rule-based collation object.
|
||||
* @return the hash code.
|
||||
@ -705,11 +748,41 @@ private:
|
||||
const UnicodeString& name,
|
||||
const UnicodeString& suffix);
|
||||
|
||||
/**
|
||||
* Chops off the last portion of the locale name. For example, from "en_US_CA"
|
||||
* to "en_US" and "en_US" to "en".
|
||||
* @param localeName the locale name.
|
||||
/* Internal class for quick iteration over the text.
|
||||
100% pure inline code
|
||||
*/
|
||||
class NormalizerIterator {
|
||||
public:
|
||||
Normalizer *cursor;
|
||||
VectorOfInt *bufferAlias;
|
||||
int32_t swapOrder;
|
||||
UChar* text;
|
||||
int32_t expIndex;
|
||||
int32_t textLen;
|
||||
UTextOffset currentOffset;
|
||||
|
||||
NormalizerIterator(void);
|
||||
NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode);
|
||||
~NormalizerIterator(void);
|
||||
void setText(const UChar* source, int32_t length, UErrorCode& status);
|
||||
void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status);
|
||||
|
||||
UChar current(void) const;
|
||||
UChar next(void);
|
||||
void reset(void);
|
||||
};
|
||||
|
||||
int32_t getStrengthOrder(NormalizerIterator* cursor,
|
||||
UErrorCode status) const;
|
||||
int32_t strengthOrder(int32_t value) const ;
|
||||
int32_t nextContractChar(NormalizerIterator *cursor,
|
||||
UChar ch,
|
||||
UErrorCode& status) const;
|
||||
/**
|
||||
* Chops off the last portion of the locale name. For example, from "en_US_CA"
|
||||
* to "en_US" and "en_US" to "en".
|
||||
* @param localeName the locale name.
|
||||
*/
|
||||
static void chopLocale(UnicodeString& localeName);
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
@ -751,12 +824,151 @@ private:
|
||||
UnicodeString sbuffer;
|
||||
UnicodeString tbuffer;
|
||||
UnicodeString key;
|
||||
CollationElementIterator *sourceCursor;
|
||||
CollationElementIterator *targetCursor;
|
||||
NormalizerIterator *cursor1;
|
||||
NormalizerIterator *cursor2;
|
||||
bool_t dataIsOwned;
|
||||
TableCollationData* data;
|
||||
};
|
||||
|
||||
inline
|
||||
RuleBasedCollator::NormalizerIterator::NormalizerIterator() :
|
||||
cursor(0),
|
||||
bufferAlias(0),
|
||||
swapOrder(0),
|
||||
text(0),
|
||||
textLen(0),
|
||||
currentOffset(0),
|
||||
expIndex(0)
|
||||
{
|
||||
}
|
||||
|
||||
inline
|
||||
RuleBasedCollator::NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) :
|
||||
cursor(0),
|
||||
bufferAlias(0),
|
||||
swapOrder(0),
|
||||
text(0),
|
||||
textLen(0),
|
||||
currentOffset(0),
|
||||
expIndex(0)
|
||||
{
|
||||
if (mode == Normalizer::NO_OP) {
|
||||
text = (UChar*)source;
|
||||
textLen = length;
|
||||
currentOffset = 0;
|
||||
} else {
|
||||
cursor = new Normalizer(source, length, mode);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
RuleBasedCollator::NormalizerIterator::~NormalizerIterator()
|
||||
{
|
||||
if (cursor != 0) {
|
||||
delete cursor;
|
||||
cursor = 0;
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void
|
||||
RuleBasedCollator::NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status)
|
||||
{
|
||||
if (cursor == 0) {
|
||||
text = (UChar*)source;
|
||||
textLen = length;
|
||||
currentOffset = 0;
|
||||
|
||||
} else {
|
||||
text = 0;
|
||||
cursor->setText(source, length, status);
|
||||
}
|
||||
bufferAlias = 0;
|
||||
swapOrder = 0;
|
||||
expIndex = 0;
|
||||
currentOffset = 0;
|
||||
}
|
||||
|
||||
/* You can only set mode after the comparision of two strings is completed.
|
||||
Setting the mode in the middle of a comparison is not allowed.
|
||||
*/
|
||||
inline
|
||||
void
|
||||
|
||||
|
||||
RuleBasedCollator::NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status)
|
||||
{
|
||||
if (cursor != NULL) {
|
||||
if (mode != Normalizer::NO_OP) {
|
||||
cursor->setMode(mode);
|
||||
cursor->setText(source, length, status);
|
||||
} else {
|
||||
delete cursor;
|
||||
cursor = 0;
|
||||
|
||||
text = (UChar*)source;
|
||||
textLen = length;
|
||||
currentOffset = 0;
|
||||
}
|
||||
} else {
|
||||
if(mode == Normalizer::NO_OP)
|
||||
{
|
||||
text = (UChar*)source;
|
||||
textLen = length;
|
||||
currentOffset = 0;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
cursor = new Normalizer(source, length, mode);
|
||||
}
|
||||
}
|
||||
|
||||
bufferAlias = 0;
|
||||
swapOrder = 0;
|
||||
expIndex = 0;
|
||||
}
|
||||
|
||||
inline
|
||||
UChar
|
||||
RuleBasedCollator::NormalizerIterator::current(void) const
|
||||
{
|
||||
if (text != 0) {
|
||||
if(currentOffset >= textLen)
|
||||
{
|
||||
return Normalizer::DONE;
|
||||
}
|
||||
else
|
||||
{
|
||||
return text[currentOffset];
|
||||
}
|
||||
}
|
||||
|
||||
return cursor->current();
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
UChar
|
||||
RuleBasedCollator::NormalizerIterator::next(void)
|
||||
{
|
||||
if (text != 0) {
|
||||
return ((currentOffset < textLen) ? text[++currentOffset] : Normalizer::DONE);
|
||||
}
|
||||
return cursor->next();
|
||||
}
|
||||
|
||||
inline
|
||||
void
|
||||
RuleBasedCollator::NormalizerIterator::reset(void)
|
||||
{
|
||||
currentOffset = 0;
|
||||
if(cursor)
|
||||
{
|
||||
cursor->reset();
|
||||
}
|
||||
}
|
||||
|
||||
inline bool_t
|
||||
RuleBasedCollator::operator!=(const Collator& other) const
|
||||
@ -772,4 +984,7 @@ RuleBasedCollator::addContractOrder(const UnicodeString &groupChars,
|
||||
addContractOrder(groupChars, anOrder, TRUE, status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -138,11 +138,7 @@ ucol_strcoll( const UCollator *coll,
|
||||
const UChar *target,
|
||||
int32_t targetLength)
|
||||
{
|
||||
int32_t srcLen = (sourceLength == -1 ? u_strlen(source) : sourceLength);
|
||||
const UnicodeString tempSource((UChar*)source, sourceLength, sourceLength);
|
||||
int32_t targLen = (targetLength == -1 ? u_strlen(target) : targetLength);
|
||||
const UnicodeString tempTarget((UChar*)target, targLen, targLen);
|
||||
return (UCollationResult) ((Collator*)coll)->compare(tempSource, tempTarget);
|
||||
return (UCollationResult) ((Collator*)coll)->compare(source,sourceLength,target,targetLength);
|
||||
}
|
||||
|
||||
U_CAPI bool_t
|
||||
@ -290,12 +286,12 @@ ucol_getSortKey(const UCollator *coll,
|
||||
const uint8_t* bytes = NULL;
|
||||
CollationKey key;
|
||||
int32_t copyLen;
|
||||
int32_t len = (sourceLength == -1 ? u_strlen(source)
|
||||
int32_t len = (sourceLength == -1 ? u_strlen(source)
|
||||
: sourceLength);
|
||||
UnicodeString string((UChar*)source, len, len);
|
||||
// UnicodeString string((UChar*)source, len, len);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
((Collator*)coll)->getCollationKey(string, key, status);
|
||||
((Collator*)coll)->getCollationKey(source, len, key, status);
|
||||
if(U_FAILURE(status))
|
||||
return 0;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user