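ICU-10829 simplify U+FFFE collation: U+FFFE now uses the common secondary/tertiary weights (only its primary weight is special); adjust the code for that; test that strings containing U+FFFE sort in the same order as the ucol_mergeSortkeys() results for their segments, without requiring identical sort keys; omit the case level if lowerFirst and it contains only common weights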

X-SVN-Rev: 36856
This commit is contained in:
Markus Scherer 2014-12-11 17:04:32 +00:00
parent 7ce10f73cf
commit 030eff56d3
12 changed files with 193 additions and 183 deletions

View File

@ -146,7 +146,7 @@
* This value may change in subsequent releases of ICU. * This value may change in subsequent releases of ICU.
* @stable ICU 2.4 * @stable ICU 2.4
*/ */
#define UCOL_RUNTIME_VERSION 8 #define UCOL_RUNTIME_VERSION 9
/** /**
* Collation builder code version. * Collation builder code version.

View File

@ -47051,7 +47051,7 @@ FDD1 FDD0; [E4, 05, 05] # unassigned first primary
# SPECIAL MAX/MIN COLLATION ELEMENTS # SPECIAL MAX/MIN COLLATION ELEMENTS
FFFE; [02, 02, 02] # Special LOWEST primary, for merge/interleaving FFFE; [02, 05, 05] # Special LOWEST primary, for merge/interleaving
FFFF; [EF FF, 05, 05] # Special HIGHEST primary, for ranges FFFF; [EF FF, 05, 05] # Special HIGHEST primary, for ranges

View File

@ -29,17 +29,19 @@ public:
// Special sort key bytes for all levels. // Special sort key bytes for all levels.
static const uint8_t TERMINATOR_BYTE = 0; static const uint8_t TERMINATOR_BYTE = 0;
static const uint8_t LEVEL_SEPARATOR_BYTE = 1; static const uint8_t LEVEL_SEPARATOR_BYTE = 1;
/** The secondary/tertiary lower limit for tailoring before any root elements. */
static const uint32_t BEFORE_WEIGHT16 = 0x0100;
/** /**
* Merge-sort-key separator. * Merge-sort-key separator.
* Must not be used as the lead byte of any CE weight, * Same as the unique primary and identical-level weights of U+FFFE.
* nor as primary compression low terminator. * Must not be used as primary compression low terminator.
* Otherwise usable. * Otherwise usable.
*/ */
static const uint8_t MERGE_SEPARATOR_BYTE = 2; static const uint8_t MERGE_SEPARATOR_BYTE = 2;
static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE
static const uint32_t MERGE_SEPARATOR_WEIGHT16 = 0x0200; // U+FFFE static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE
static const uint32_t MERGE_SEPARATOR_LOWER32 = 0x02000200; // U+FFFE
static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000202; // U+FFFE
/** /**
* Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE. * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.

View File

@ -450,8 +450,8 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
} }
nodes.setElementAt(node, index); nodes.setElementAt(node, index);
int32_t nextIndex = nextIndexFromNode(node); int32_t nextIndex = nextIndexFromNode(node);
// Insert default nodes with weights 02 and 05, reset to the 02 node. // Insert default nodes with weights 01 and 05, reset to the 01 node.
node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength); node = nodeFromWeight16(Collation::BEFORE_WEIGHT16) | nodeFromStrength(strength);
index = insertNodeBetween(index, nextIndex, node, errorCode); index = insertNodeBetween(index, nextIndex, node, errorCode);
node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |
nodeFromStrength(strength); nodeFromStrength(strength);
@ -961,7 +961,7 @@ CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
index = nextIndexFromNode(node); index = nextIndexFromNode(node);
node = nodes.elementAti(index); node = nodes.elementAti(index);
U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
weight16FromNode(node) == BEFORE_WEIGHT16); weight16FromNode(node) == Collation::BEFORE_WEIGHT16);
// Skip to the explicit common node. // Skip to the explicit common node.
do { do {
index = nextIndexFromNode(node); index = nextIndexFromNode(node);
@ -1398,7 +1398,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
// Gap at the beginning of the tertiary CE range. // Gap at the beginning of the tertiary CE range.
t = rootElements.getTertiaryBoundary() - 0x100; t = rootElements.getTertiaryBoundary() - 0x100;
tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
} else if(t == BEFORE_WEIGHT16) { } else if(t == Collation::BEFORE_WEIGHT16) {
tLimit = Collation::COMMON_WEIGHT16; tLimit = Collation::COMMON_WEIGHT16;
} else if(!pIsTailored && !sIsTailored) { } else if(!pIsTailored && !sIsTailored) {
// p and s are root weights. // p and s are root weights.
@ -1441,7 +1441,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
// Gap at the beginning of the secondary CE range. // Gap at the beginning of the secondary CE range.
s = rootElements.getSecondaryBoundary() - 0x100; s = rootElements.getSecondaryBoundary() - 0x100;
sLimit = rootElements.getFirstSecondaryCE() >> 16; sLimit = rootElements.getFirstSecondaryCE() >> 16;
} else if(s == BEFORE_WEIGHT16) { } else if(s == Collation::BEFORE_WEIGHT16) {
sLimit = Collation::COMMON_WEIGHT16; sLimit = Collation::COMMON_WEIGHT16;
} else if(!pIsTailored) { } else if(!pIsTailored) {
// p is a root primary. // p is a root primary.

View File

@ -215,9 +215,6 @@ private:
static int32_t ceStrength(int64_t ce); static int32_t ceStrength(int64_t ce);
/** The secondary/tertiary lower limit for tailoring before the common weight. */
static const uint32_t BEFORE_WEIGHT16 = Collation::MERGE_SEPARATOR_WEIGHT16;
/** At most 1M nodes, limited by the 20 bits in node bit fields. */ /** At most 1M nodes, limited by the 20 bits in node bit fields. */
static const int32_t MAX_INDEX = 0xfffff; static const int32_t MAX_INDEX = 0xfffff;
/** /**

View File

@ -136,18 +136,17 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
int32_t rightStart = 0; int32_t rightStart = 0;
for(;;) { for(;;) {
// Find the merge separator or the NO_CE terminator. // Find the merge separator or the NO_CE terminator.
uint32_t p;
int32_t leftLimit = leftStart; int32_t leftLimit = leftStart;
uint32_t leftLower32; while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
while((leftLower32 = (uint32_t)left.getCE(leftLimit)) > Collation::MERGE_SEPARATOR_PRIMARY ||
Collation::MERGE_SEPARATOR_LOWER32 || p == 0) {
leftLower32 == 0) {
++leftLimit; ++leftLimit;
} }
int32_t rightLimit = rightStart; int32_t rightLimit = rightStart;
uint32_t rightLower32; while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
while((rightLower32 = (uint32_t)right.getCE(rightLimit)) > Collation::MERGE_SEPARATOR_PRIMARY ||
Collation::MERGE_SEPARATOR_LOWER32 || p == 0) {
rightLower32 == 0) {
++rightLimit; ++rightLimit;
} }
@ -175,7 +174,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
// Both strings have the same number of merge separators, // Both strings have the same number of merge separators,
// or else there would have been a primary-level difference. // or else there would have been a primary-level difference.
U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
if(left.getCE(leftLimit) == Collation::NO_CE) { break; } if(p == Collation::NO_CE_PRIMARY) { break; }
// Skip both merge separators and continue. // Skip both merge separators and continue.
leftStart = leftLimit + 1; leftStart = leftLimit + 1;
rightStart = rightLimit + 1; rightStart = rightLimit + 1;
@ -276,20 +275,19 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
if(leftTertiary != rightTertiary) { if(leftTertiary != rightTertiary) {
if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
// Pass through NO_CE and MERGE_SEPARATOR // Pass through NO_CE and keep real tertiary weights larger than that.
// and keep real tertiary weights larger than the MERGE_SEPARATOR.
// Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
// to keep tertiary CEs well-formed. // to keep tertiary CEs well-formed.
// Their case+tertiary weights must be greater than those of // Their case+tertiary weights must be greater than those of
// primary and secondary CEs. // primary and secondary CEs.
if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { if(leftTertiary > Collation::NO_CE_WEIGHT16) {
if(leftLower32 > 0xffff) { if(leftLower32 > 0xffff) {
leftTertiary ^= 0xc000; leftTertiary ^= 0xc000;
} else { } else {
leftTertiary += 0x4000; leftTertiary += 0x4000;
} }
} }
if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { if(rightTertiary > Collation::NO_CE_WEIGHT16) {
if(rightLower32 > 0xffff) { if(rightLower32 > 0xffff) {
rightTertiary ^= 0xc000; rightTertiary ^= 0xc000;
} else { } else {
@ -316,11 +314,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
do { do {
int64_t ce = left.getCE(leftIndex++); int64_t ce = left.getCE(leftIndex++);
leftQuaternary = (uint32_t)ce & 0xffff; leftQuaternary = (uint32_t)ce & 0xffff;
if(leftQuaternary == 0) { if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
// Variable primary or completely ignorable. // Variable primary or completely ignorable or NO_CE.
leftQuaternary = (uint32_t)(ce >> 32); leftQuaternary = (uint32_t)(ce >> 32);
} else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
// Leave NO_CE or MERGE_SEPARATOR as is.
} else { } else {
// Regular CE, not tertiary ignorable. // Regular CE, not tertiary ignorable.
// Preserve the quaternary weight in bits 7..6. // Preserve the quaternary weight in bits 7..6.
@ -332,11 +328,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
do { do {
int64_t ce = right.getCE(rightIndex++); int64_t ce = right.getCE(rightIndex++);
rightQuaternary = (uint32_t)ce & 0xffff; rightQuaternary = (uint32_t)ce & 0xffff;
if(rightQuaternary == 0) { if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
// Variable primary or completely ignorable. // Variable primary or completely ignorable or NO_CE.
rightQuaternary = (uint32_t)(ce >> 32); rightQuaternary = (uint32_t)(ce >> 32);
} else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
// Leave NO_CE or MERGE_SEPARATOR as is.
} else { } else {
// Regular CE, not tertiary ignorable. // Regular CE, not tertiary ignorable.
// Preserve the quaternary weight in bits 7..6. // Preserve the quaternary weight in bits 7..6.
@ -353,7 +347,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
} }
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
} }
if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; } if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
} }
return UCOL_EQUAL; return UCOL_EQUAL;
} }

View File

@ -262,7 +262,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
int32_t commonQuaternaries = 0; int32_t commonQuaternaries = 0;
uint32_t prevSecondary = 0; uint32_t prevSecondary = 0;
UBool anyMergeSeparators = FALSE; int32_t secSegmentStart = 0;
for(;;) { for(;;) {
// No need to keep all CEs in the buffer when we write a sort key. // No need to keep all CEs in the buffer when we write a sort key.
@ -350,7 +350,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
uint32_t s = lower32 >> 16; uint32_t s = lower32 >> 16;
if(s == 0) { if(s == 0) {
// secondary ignorable // secondary ignorable
} else if(s == Collation::COMMON_WEIGHT16) { } else if(s == Collation::COMMON_WEIGHT16 &&
((options & CollationSettings::BACKWARD_SECONDARY) == 0 ||
p != Collation::MERGE_SEPARATOR_PRIMARY)) {
// s is a common secondary weight, and
// backwards-secondary is off or the ce is not the merge separator.
++commonSecondaries; ++commonSecondaries;
} else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
if(commonSecondaries != 0) { if(commonSecondaries != 0) {
@ -389,16 +393,28 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
} }
// commonSecondaries == 0 // commonSecondaries == 0
} }
// Reduce separators so that we can look for byte<=1 later. if(0 < p && p <= Collation::MERGE_SEPARATOR_PRIMARY) {
if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) { // The backwards secondary level compares secondary weights backwards
if(s == Collation::MERGE_SEPARATOR_WEIGHT16) { // within segments separated by the merge separator (U+FFFE).
anyMergeSeparators = TRUE; uint8_t *secs = secondaries.data();
int32_t last = secondaries.length() - 1;
if(secSegmentStart < last) {
uint8_t *p = secs + secSegmentStart;
uint8_t *q = secs + last;
do {
uint8_t b = *p;
*p++ = *q;
*q-- = b;
} while(p < q);
} }
secondaries.appendByte((s >> 8) - 1); secondaries.appendByte(p == Collation::NO_CE_PRIMARY ?
Collation::LEVEL_SEPARATOR_BYTE : Collation::MERGE_SEPARATOR_BYTE);
prevSecondary = 0;
secSegmentStart = secondaries.length();
} else { } else {
secondaries.appendReverseWeight16(s); secondaries.appendReverseWeight16(s);
prevSecondary = s;
} }
prevSecondary = s;
} }
} }
@ -411,19 +427,23 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
} else { } else {
uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lead byte uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lead byte
U_ASSERT((c & 0xc0) != 0xc0); U_ASSERT((c & 0xc0) != 0xc0);
if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) { if((c & 0xc0) == 0 && c > Collation::LEVEL_SEPARATOR_BYTE) {
++commonCases; ++commonCases;
} else { } else {
if((options & CollationSettings::UPPER_FIRST) == 0) { if((options & CollationSettings::UPPER_FIRST) == 0) {
// lowerFirst: Compress common weights to nibbles 1..7..13, mixed=14, upper=15. // lowerFirst: Compress common weights to nibbles 1..7..13, mixed=14, upper=15.
if(commonCases != 0) { // If there are only common (=lowest) weights in the whole level,
// then we need not write anything.
// Level length differences are handled already on the next-higher level.
if(commonCases != 0 &&
(c > Collation::LEVEL_SEPARATOR_BYTE || !cases.isEmpty())) {
--commonCases; --commonCases;
while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT) { while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT) {
cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4); cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4);
commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT; commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT;
} }
uint32_t b; uint32_t b;
if(c <= Collation::MERGE_SEPARATOR_BYTE) { if(c <= Collation::LEVEL_SEPARATOR_BYTE) {
b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; b = CASE_LOWER_FIRST_COMMON_LOW + commonCases;
} else { } else {
b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases;
@ -431,7 +451,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
cases.appendByte(b << 4); cases.appendByte(b << 4);
commonCases = 0; commonCases = 0;
} }
if(c > Collation::MERGE_SEPARATOR_BYTE) { if(c > Collation::LEVEL_SEPARATOR_BYTE) {
c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4; // 14 or 15 c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4; // 14 or 15
} }
} else { } else {
@ -447,11 +467,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + commonCases) << 4); cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + commonCases) << 4);
commonCases = 0; commonCases = 0;
} }
if(c > Collation::MERGE_SEPARATOR_BYTE) { if(c > Collation::LEVEL_SEPARATOR_BYTE) {
c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4; // 2 or 1 c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4; // 2 or 1
} }
} }
// c is a separator byte 01 or 02, // c is a separator byte 01,
// or a left-shifted nibble 0x10, 0x20, ... 0xf0. // or a left-shifted nibble 0x10, 0x20, ... 0xf0.
cases.appendByte(c); cases.appendByte(c);
} }
@ -510,14 +530,14 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
// Their case+tertiary weights must be greater than those of // Their case+tertiary weights must be greater than those of
// primary and secondary CEs. // primary and secondary CEs.
// //
// Separators 01..02 -> 01..02 (unchanged) // Separator 01 -> 01 (unchanged)
// Lowercase 03..04 -> 83..84 (includes uncased) // Lowercase 02..04 -> 82..84 (includes uncased)
// Common weight 05 -> 85..C5 (common-weight compression range) // Common weight 05 -> 85..C5 (common-weight compression range)
// Lowercase 06..3F -> C6..FF // Lowercase 06..3F -> C6..FF
// Mixed case 43..7F -> 43..7F // Mixed case 42..7F -> 42..7F
// Uppercase 83..BF -> 03..3F // Uppercase 82..BF -> 02..3F
// Tertiary CE 86..BF -> C6..FF // Tertiary CE 86..BF -> C6..FF
if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) { if(t <= Collation::NO_CE_WEIGHT16) {
// Keep separators unchanged. // Keep separators unchanged.
} else if(lower32 > 0xffff) { } else if(lower32 > 0xffff) {
// Invert case bits of primary & secondary CEs. // Invert case bits of primary & secondary CEs.
@ -551,24 +571,22 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) {
uint32_t q = lower32 & 0xffff; uint32_t q = lower32 & 0xffff;
if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) { if((q & 0xc0) == 0 && q > Collation::NO_CE_WEIGHT16) {
++commonQuaternaries; ++commonQuaternaries;
} else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 && } else if(q == Collation::NO_CE_WEIGHT16 &&
(options & CollationSettings::ALTERNATE_MASK) == 0 && (options & CollationSettings::ALTERNATE_MASK) == 0 &&
(quaternaries.isEmpty() || quaternaries.isEmpty()) {
quaternaries[quaternaries.length() - 1] == Collation::MERGE_SEPARATOR_BYTE)) { // If alternate=non-ignorable and there are only common quaternary weights,
// If alternate=non-ignorable and there are only // then we need not write anything.
// common quaternary weights between two separators,
// then we need not write anything between these separators.
// The only weights greater than the merge separator and less than the common weight // The only weights greater than the merge separator and less than the common weight
// are shifted primary weights, which are not generated for alternate=non-ignorable. // are shifted primary weights, which are not generated for alternate=non-ignorable.
// There are also exactly as many quaternary weights as tertiary weights, // There are also exactly as many quaternary weights as tertiary weights,
// so level length differences are handled already on tertiary level. // so level length differences are handled already on tertiary level.
// Any above-common quaternary weight will compare greater regardless. // Any above-common quaternary weight will compare greater regardless.
quaternaries.appendByte(q >> 8); quaternaries.appendByte(Collation::LEVEL_SEPARATOR_BYTE);
} else { } else {
if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) { if(q == Collation::NO_CE_WEIGHT16) {
q >>= 8; q = Collation::LEVEL_SEPARATOR_BYTE;
} else { } else {
q = 0xfc + ((q >> 6) & 3); q = 0xfc + ((q >> 6) & 3);
} }
@ -602,42 +620,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; }
ok &= secondaries.isOk(); ok &= secondaries.isOk();
sink.Append(Collation::LEVEL_SEPARATOR_BYTE); sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
uint8_t *secs = secondaries.data(); secondaries.appendTo(sink);
int32_t length = secondaries.length() - 1; // Ignore the trailing NO_CE.
if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
// The backwards secondary level compares secondary weights backwards
// within segments separated by the merge separator (U+FFFE, weight 02).
// The separator weights 01 & 02 were reduced to 00 & 01 so that
// we do not accidentally separate at a _second_ weight byte of 02.
int32_t start = 0;
for(;;) {
// Find the merge separator or the NO_CE terminator.
int32_t limit;
if(anyMergeSeparators) {
limit = start;
while(secs[limit] > 1) { ++limit; }
} else {
limit = length;
}
// Reverse this segment.
if(start < limit) {
uint8_t *p = secs + start;
uint8_t *q = secs + limit - 1;
while(p < q) {
uint8_t s = *p;
*p++ = *q;
*q-- = s;
}
}
// Did we reach the end of the string?
if(secs[limit] == 0) { break; }
// Restore the merge separator.
secs[limit] = 2;
// Skip the merge separator and continue.
start = limit + 1;
}
}
sink.Append(reinterpret_cast<char *>(secs), length);
} }
if((levels & Collation::CASE_LEVEL_FLAG) != 0) { if((levels & Collation::CASE_LEVEL_FLAG) != 0) {
@ -649,21 +632,12 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
uint8_t b = 0; uint8_t b = 0;
for(int32_t i = 0; i < length; ++i) { for(int32_t i = 0; i < length; ++i) {
uint8_t c = (uint8_t)cases[i]; uint8_t c = (uint8_t)cases[i];
if(c <= Collation::MERGE_SEPARATOR_BYTE) { U_ASSERT((c & 0xf) == 0 && c != 0);
U_ASSERT(c != 0); if(b == 0) {
if(b != 0) { b = c;
sink.Append(b);
b = 0;
}
sink.Append(c);
} else { } else {
U_ASSERT((c & 0xf) == 0); sink.Append(b | (c >> 4));
if(b == 0) { b = 0;
b = c;
} else {
sink.Append(b | (c >> 4));
b = 0;
}
} }
} }
if(b != 0) { if(b != 0) {

View File

@ -124,7 +124,7 @@ CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
sec = elements[index] >> 16; sec = elements[index] >> 16;
} else { } else {
index = findPrimary(p) + 1; index = findPrimary(p) + 1;
previousSec = Collation::MERGE_SEPARATOR_WEIGHT16; previousSec = Collation::BEFORE_WEIGHT16;
sec = Collation::COMMON_WEIGHT16; sec = Collation::COMMON_WEIGHT16;
} }
U_ASSERT(s >= sec); U_ASSERT(s >= sec);
@ -149,12 +149,12 @@ CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) con
previousTer = 0; previousTer = 0;
} else { } else {
index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
previousTer = Collation::MERGE_SEPARATOR_WEIGHT16; previousTer = Collation::BEFORE_WEIGHT16;
} }
secTer = elements[index] & ~SEC_TER_DELTA_FLAG; secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
} else { } else {
index = findPrimary(p) + 1; index = findPrimary(p) + 1;
previousTer = Collation::MERGE_SEPARATOR_WEIGHT16; previousTer = Collation::BEFORE_WEIGHT16;
secTer = Collation::COMMON_SEC_AND_TER_CE; secTer = Collation::COMMON_SEC_AND_TER_CE;
} }
uint32_t st = (s << 16) | t; uint32_t st = (s << 16) | t;

View File

@ -126,7 +126,7 @@ CollationWeights::initForSecondary() {
maxBytes[1] = 0; maxBytes[1] = 0;
minBytes[2] = 0; minBytes[2] = 0;
maxBytes[2] = 0; maxBytes[2] = 0;
minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
maxBytes[3] = 0xff; maxBytes[3] = 0xff;
minBytes[4] = 2; minBytes[4] = 2;
maxBytes[4] = 0xff; maxBytes[4] = 0xff;
@ -142,7 +142,7 @@ CollationWeights::initForTertiary() {
maxBytes[2] = 0; maxBytes[2] = 0;
// We use only 6 bits per byte. // We use only 6 bits per byte.
// The other bits are used for case & quaternary weights. // The other bits are used for case & quaternary weights.
minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
maxBytes[3] = 0x3f; maxBytes[3] = 0x3f;
minBytes[4] = 2; minBytes[4] = 2;
maxBytes[4] = 0x3f; maxBytes[4] = 0x3f;

View File

@ -114,6 +114,8 @@ private:
UBool getCollationKey(const char *norm, const UnicodeString &line, UBool getCollationKey(const char *norm, const UnicodeString &line,
const UChar *s, int32_t length, const UChar *s, int32_t length,
CollationKey &key, IcuTestErrorCode &errorCode); CollationKey &key, IcuTestErrorCode &errorCode);
UBool getMergedCollationKey(const UChar *s, int32_t length,
CollationKey &key, IcuTestErrorCode &errorCode);
UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
const UnicodeString &prevString, const UnicodeString &s, const UnicodeString &prevString, const UnicodeString &s,
UCollationResult expectedOrder, Collation::Level expectedLevel, UCollationResult expectedOrder, Collation::Level expectedLevel,
@ -172,11 +174,9 @@ void CollationTest::TestMinMax() {
return; return;
} }
int64_t ce = ces.elementAti(0); int64_t ce = ces.elementAti(0);
int64_t expected = int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
Collation::MERGE_SEPARATOR_LOWER32;
if(ce != expected) { if(ce != expected) {
errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce); errln("CE(U+fffe)=%04lx != 02..", (long)ce);
} }
ce = ces.elementAti(1); ce = ces.elementAti(1);
@ -617,11 +617,8 @@ UBool isValidCE(const CollationRootElements &re, const CollationData &data,
} }
// Minimum & maximum lead bytes. // Minimum & maximum lead bytes.
if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) || if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
(s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) || s1 == Collation::LEVEL_SEPARATOR_BYTE ||
(t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) { t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
return FALSE;
}
if(t1 != 0 && t1 > 0x3f) {
return FALSE; return FALSE;
} }
if(c > 2) { if(c > 2) {
@ -1372,7 +1369,39 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
return FALSE; return FALSE;
} }
// If s contains U+FFFE, check that merged segments make the same key. // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
static const int32_t partSizes[] = { 32, 3, 1 };
for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
int32_t partSize = partSizes[psi];
CharString parts;
if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
infoln(fileTestName);
errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
norm, (int)partSize, errorCode.errorName());
infoln(line);
return FALSE;
}
if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
infoln(fileTestName);
errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
norm, (int)partSize);
infoln(line);
infoln(printCollationKey(key));
infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
return FALSE;
}
}
return TRUE;
}
/**
* Changes the key to the merged segments of the U+FFFE-separated substrings of s.
* Leaves key unchanged if s does not contain U+FFFE.
* @return TRUE if the key was successfully changed
*/
UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
CollationKey &key, IcuTestErrorCode &errorCode) {
if(errorCode.isFailure()) { return FALSE; }
LocalMemory<uint8_t> mergedKey; LocalMemory<uint8_t> mergedKey;
int32_t mergedKeyLength = 0; int32_t mergedKeyLength = 0;
int32_t mergedKeyCapacity = 0; int32_t mergedKeyCapacity = 0;
@ -1382,7 +1411,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
if(i == sLength) { if(i == sLength) {
if(segmentStart == 0) { if(segmentStart == 0) {
// s does not contain any U+FFFE. // s does not contain any U+FFFE.
break; return FALSE;
} }
} else if(s[i] != 0xfffe) { } else if(s[i] != 0xfffe) {
++i; ++i;
@ -1423,41 +1452,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
if(i == sLength) { break; } if(i == sLength) { break; }
segmentStart = ++i; segmentStart = ++i;
} }
if(segmentStart != 0 && key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
(mergedKeyLength != keyLength ||
uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
infoln(fileTestName);
errln("Collator(%s).getCollationKey(with U+FFFE) != "
"ucol_mergeSortkeys(segments)",
norm);
infoln(line);
infoln(printCollationKey(key));
infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
return FALSE;
}
// Check that internalNextSortKeyPart() makes the same key, with several part sizes.
static const int32_t partSizes[] = { 32, 3, 1 };
for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
int32_t partSize = partSizes[psi];
CharString parts;
if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
infoln(fileTestName);
errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
norm, (int)partSize, errorCode.errorName());
infoln(line);
return FALSE;
}
if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
infoln(fileTestName);
errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
norm, (int)partSize);
infoln(line);
infoln(printCollationKey(key));
infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
return FALSE;
}
}
return TRUE; return TRUE;
} }
@ -1488,6 +1483,29 @@ const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf
return buffer; return buffer;
} }
/**
 * Determines the collation level at which two sort keys first differ.
 *
 * Walks the common byte prefix of both keys and advances the level each time
 * a LEVEL_SEPARATOR_BYTE is seen in that prefix. When the collator has no
 * case level, CASE_LEVEL is skipped so that the result names the level that
 * is actually present in the keys.
 *
 * @param prevKey the sort key of the previous string
 * @param key the sort key of the current string
 * @param order the result of comparing the two keys; when it is UCOL_EQUAL
 *              the keys are not inspected at all
 * @param collHasCaseLevel TRUE if the collator writes a case level
 * @return Collation::NO_LEVEL if order is UCOL_EQUAL, otherwise the level
 *         reached when the first differing byte (or the end of the shorter
 *         key) is encountered
 */
int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
                           UCollationResult order, UBool collHasCaseLevel) {
    if(order == UCOL_EQUAL) {
        return Collation::NO_LEVEL;
    }
    int32_t prevKeyLength;
    const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
    int32_t keyLength;
    const uint8_t *bytes = key.getByteArray(keyLength);
    // Never read past the end of either key, even if the caller passes
    // a non-EQUAL order together with keys that share their whole prefix.
    int32_t limit = prevKeyLength < keyLength ? prevKeyLength : keyLength;
    int32_t level = Collation::PRIMARY_LEVEL;
    for(int32_t i = 0; i < limit; ++i) {
        uint8_t b = prevBytes[i];
        if(b != bytes[i]) { break; }
        if(b == Collation::LEVEL_SEPARATOR_BYTE) {
            ++level;
            // Keys from a collator without a case level go straight
            // from the secondary level to the tertiary level.
            if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
                ++level;
            }
        }
    }
    return level;
}
} }
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
@ -1649,23 +1667,9 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
infoln(printCollationKey(key)); infoln(printCollationKey(key));
return FALSE; return FALSE;
} }
UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
int32_t prevKeyLength;
const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
int32_t keyLength;
const uint8_t *bytes = key.getByteArray(keyLength);
int32_t level = Collation::PRIMARY_LEVEL;
for(int32_t i = 0;; ++i) {
uint8_t b = prevBytes[i];
if(b != bytes[i]) { break; }
if(b == Collation::LEVEL_SEPARATOR_BYTE) {
++level;
if(level == Collation::CASE_LEVEL &&
coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
++level;
}
}
}
if(level != expectedLevel) { if(level != expectedLevel) {
infoln(fileTestName); infoln(fileTestName);
errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d", errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
@ -1677,6 +1681,45 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
return FALSE; return FALSE;
} }
} }
// If either string contains U+FFFE, then their sort keys must compare the same as
// the merged sort keys of each string's between-FFFE segments.
//
// It is not required that
// sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
// only that those two methods yield the same order.
//
// Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
errorCode.isFailure()) {
order = prevKey.compareTo(key, errorCode);
if(order != expectedOrder || errorCode.isFailure()) {
infoln(fileTestName);
errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
"(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
(int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
infoln(prevFileLine);
infoln(fileLine);
infoln(printCollationKey(prevKey));
infoln(printCollationKey(key));
return FALSE;
}
int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
if(mergedLevel != level) {
infoln(fileTestName);
errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
"(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
(int)fileLineNumber, norm, order, mergedLevel, level);
infoln(prevFileLine);
infoln(fileLine);
infoln(printCollationKey(prevKey));
infoln(printCollationKey(key));
return FALSE;
}
}
}
return TRUE; return TRUE;
} }