scuffed-code/icu4c/source/i18n/collationcompare.cpp
2014-02-25 21:21:49 +00:00

364 lines
15 KiB
C++

/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationcompare.cpp
*
* created on: 2012feb14 with new and old collation code
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
#include "cmemory.h"
#include "collation.h"
#include "collationcompare.h"
#include "collationiterator.h"
#include "collationsettings.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
UCollationResult
CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
const CollationSettings &settings,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
int32_t options = settings.options;
uint32_t variableTop;
if((options & CollationSettings::ALTERNATE_MASK) == 0) {
variableTop = 0;
} else {
// +1 so that we can use "<" and primary ignorables test out early.
variableTop = settings.variableTop + 1;
}
UBool anyVariable = FALSE;
// Fetch CEs, compare primaries, store secondary & tertiary weights.
U_ALIGN_CODE(16);
for(;;) {
// We fetch CEs until we get a non-ignorable primary or reach the end.
uint32_t leftPrimary;
do {
int64_t ce = left.nextCE(errorCode);
leftPrimary = (uint32_t)(ce >> 32);
if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
// Variable CE, shift it to quaternary level.
// Ignore all following primary ignorables, and shift further variable CEs.
anyVariable = TRUE;
do {
// Store only the primary of the variable CE.
left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
for(;;) {
ce = left.nextCE(errorCode);
leftPrimary = (uint32_t)(ce >> 32);
if(leftPrimary == 0) {
left.setCurrentCE(0);
} else {
break;
}
}
} while(leftPrimary < variableTop &&
leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
}
} while(leftPrimary == 0);
uint32_t rightPrimary;
do {
int64_t ce = right.nextCE(errorCode);
rightPrimary = (uint32_t)(ce >> 32);
if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
// Variable CE, shift it to quaternary level.
// Ignore all following primary ignorables, and shift further variable CEs.
anyVariable = TRUE;
do {
// Store only the primary of the variable CE.
right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
for(;;) {
ce = right.nextCE(errorCode);
rightPrimary = (uint32_t)(ce >> 32);
if(rightPrimary == 0) {
right.setCurrentCE(0);
} else {
break;
}
}
} while(rightPrimary < variableTop &&
rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
}
} while(rightPrimary == 0);
if(leftPrimary != rightPrimary) {
// Return the primary difference, with script reordering.
const uint8_t *reorderTable = settings.reorderTable;
if (reorderTable != NULL) {
leftPrimary = Collation::reorder(reorderTable, leftPrimary);
rightPrimary = Collation::reorder(reorderTable, rightPrimary);
}
return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
}
if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
}
if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
// Compare the buffered secondary & tertiary weights.
// We might skip the secondary level but continue with the case level
// which is turned on separately.
if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
int32_t leftIndex = 0;
int32_t rightIndex = 0;
for(;;) {
uint32_t leftSecondary;
do {
leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
} while(leftSecondary == 0);
uint32_t rightSecondary;
do {
rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
} while(rightSecondary == 0);
if(leftSecondary != rightSecondary) {
return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
}
if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
}
} else {
// The backwards secondary level compares secondary weights backwards
// within segments separated by the merge separator (U+FFFE, weight 02).
int32_t leftStart = 0;
int32_t rightStart = 0;
for(;;) {
// Find the merge separator or the NO_CE terminator.
int32_t leftLimit = leftStart;
uint32_t leftLower32;
while((leftLower32 = (uint32_t)left.getCE(leftLimit)) >
Collation::MERGE_SEPARATOR_LOWER32 ||
leftLower32 == 0) {
++leftLimit;
}
int32_t rightLimit = rightStart;
uint32_t rightLower32;
while((rightLower32 = (uint32_t)right.getCE(rightLimit)) >
Collation::MERGE_SEPARATOR_LOWER32 ||
rightLower32 == 0) {
++rightLimit;
}
// Compare the segments.
int32_t leftIndex = leftLimit;
int32_t rightIndex = rightLimit;
for(;;) {
int32_t leftSecondary = 0;
while(leftSecondary == 0 && leftIndex > leftStart) {
leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
}
int32_t rightSecondary = 0;
while(rightSecondary == 0 && rightIndex > rightStart) {
rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
}
if(leftSecondary != rightSecondary) {
return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
}
if(leftSecondary == 0) { break; }
}
// Did we reach the end of either string?
// Both strings have the same number of merge separators,
// or else there would have been a primary-level difference.
U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
if(left.getCE(leftLimit) == Collation::NO_CE) { break; }
// Skip both merge separators and continue.
leftStart = leftLimit + 1;
rightStart = rightLimit + 1;
}
}
}
if((options & CollationSettings::CASE_LEVEL) != 0) {
int32_t strength = CollationSettings::getStrength(options);
int32_t leftIndex = 0;
int32_t rightIndex = 0;
for(;;) {
uint32_t leftCase, leftLower32, rightCase;
if(strength == UCOL_PRIMARY) {
// Primary+caseLevel: Ignore case level weights of primary ignorables.
// Otherwise we would get a-umlaut > a
// which is not desirable for accent-insensitive sorting.
// Check for (lower 32 bits) == 0 as well because variable CEs are stored
// with only primary weights.
int64_t ce;
do {
ce = left.getCE(leftIndex++);
leftCase = (uint32_t)ce;
} while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
leftLower32 = leftCase;
leftCase &= 0xc000;
do {
ce = right.getCE(rightIndex++);
rightCase = (uint32_t)ce;
} while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
rightCase &= 0xc000;
} else {
// Secondary+caseLevel: By analogy with the above,
// ignore case level weights of secondary ignorables.
//
// Note: A tertiary CE has uppercase case bits (0.0.ut)
// to keep tertiary+caseFirst well-formed.
//
// Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
// Otherwise a tertiary CE's uppercase would be no greater than
// a primary/secondary CE's uppercase.
// (See UCA well-formedness condition 2.)
// We could construct a special case weight higher than uppercase,
// but it's simpler to always ignore case weights of secondary ignorables,
// turning 0.0.ut into 0.0.0.t.
// (See LDML Collation, Case Parameters.)
do {
leftCase = (uint32_t)left.getCE(leftIndex++);
} while(leftCase <= 0xffff);
leftLower32 = leftCase;
leftCase &= 0xc000;
do {
rightCase = (uint32_t)right.getCE(rightIndex++);
} while(rightCase <= 0xffff);
rightCase &= 0xc000;
}
// No need to handle NO_CE and MERGE_SEPARATOR specially:
// There is one case weight for each previous-level weight,
// so level length differences were handled there.
if(leftCase != rightCase) {
if((options & CollationSettings::UPPER_FIRST) == 0) {
return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
} else {
return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
}
}
if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
}
}
if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
int32_t leftIndex = 0;
int32_t rightIndex = 0;
uint32_t anyQuaternaries = 0;
for(;;) {
uint32_t leftLower32, leftTertiary;
do {
leftLower32 = (uint32_t)left.getCE(leftIndex++);
anyQuaternaries |= leftLower32;
U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
(leftLower32 & 0xc0c0) == 0);
leftTertiary = leftLower32 & tertiaryMask;
} while(leftTertiary == 0);
uint32_t rightLower32, rightTertiary;
do {
rightLower32 = (uint32_t)right.getCE(rightIndex++);
anyQuaternaries |= rightLower32;
U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
(rightLower32 & 0xc0c0) == 0);
rightTertiary = rightLower32 & tertiaryMask;
} while(rightTertiary == 0);
if(leftTertiary != rightTertiary) {
if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
// Pass through NO_CE and MERGE_SEPARATOR
// and keep real tertiary weights larger than the MERGE_SEPARATOR.
// Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
// to keep tertiary CEs well-formed.
// Their case+tertiary weights must be greater than those of
// primary and secondary CEs.
if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
if(leftLower32 > 0xffff) {
leftTertiary ^= 0xc000;
} else {
leftTertiary += 0x4000;
}
}
if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
if(rightLower32 > 0xffff) {
rightTertiary ^= 0xc000;
} else {
rightTertiary += 0x4000;
}
}
}
return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
}
if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
}
if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
// If there are no "variable" CEs and no non-zero quaternary weights,
// then there are no quaternary differences.
return UCOL_EQUAL;
}
leftIndex = 0;
rightIndex = 0;
for(;;) {
uint32_t leftQuaternary;
do {
int64_t ce = left.getCE(leftIndex++);
leftQuaternary = (uint32_t)ce & 0xffff;
if(leftQuaternary == 0) {
// Variable primary or completely ignorable.
leftQuaternary = (uint32_t)(ce >> 32);
} else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
// Leave NO_CE or MERGE_SEPARATOR as is.
} else {
// Regular CE, not tertiary ignorable.
// Preserve the quaternary weight in bits 7..6.
leftQuaternary |= 0xffffff3f;
}
} while(leftQuaternary == 0);
uint32_t rightQuaternary;
do {
int64_t ce = right.getCE(rightIndex++);
rightQuaternary = (uint32_t)ce & 0xffff;
if(rightQuaternary == 0) {
// Variable primary or completely ignorable.
rightQuaternary = (uint32_t)(ce >> 32);
} else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
// Leave NO_CE or MERGE_SEPARATOR as is.
} else {
// Regular CE, not tertiary ignorable.
// Preserve the quaternary weight in bits 7..6.
rightQuaternary |= 0xffffff3f;
}
} while(rightQuaternary == 0);
if(leftQuaternary != rightQuaternary) {
// Return the difference, with script reordering.
const uint8_t *reorderTable = settings.reorderTable;
if (reorderTable != NULL) {
leftQuaternary = Collation::reorder(reorderTable, leftQuaternary);
rightQuaternary = Collation::reorder(reorderTable, rightQuaternary);
}
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
}
if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; }
}
return UCOL_EQUAL;
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION