ICU-10829 simplify U+FFFE collation: U+FFFE uses common non-primary weights; adjust code for that; test that the order with U+FFFE is equivalent to ucol_mergeSortkeys() but not necessarily the same sort keys; omit the case level if lowerFirst and only common weights
X-SVN-Rev: 36856
This commit is contained in:
parent
7ce10f73cf
commit
030eff56d3
@ -146,7 +146,7 @@
|
||||
* This value may change in subsequent releases of ICU.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define UCOL_RUNTIME_VERSION 8
|
||||
#define UCOL_RUNTIME_VERSION 9
|
||||
|
||||
/**
|
||||
* Collation builder code version.
|
||||
|
Binary file not shown.
Binary file not shown.
@ -47051,7 +47051,7 @@ FDD1 FDD0; [E4, 05, 05] # unassigned first primary
|
||||
|
||||
# SPECIAL MAX/MIN COLLATION ELEMENTS
|
||||
|
||||
FFFE; [02, 02, 02] # Special LOWEST primary, for merge/interleaving
|
||||
FFFE; [02, 05, 05] # Special LOWEST primary, for merge/interleaving
|
||||
FFFF; [EF FF, 05, 05] # Special HIGHEST primary, for ranges
|
||||
|
||||
|
||||
|
@ -29,17 +29,19 @@ public:
|
||||
// Special sort key bytes for all levels.
|
||||
static const uint8_t TERMINATOR_BYTE = 0;
|
||||
static const uint8_t LEVEL_SEPARATOR_BYTE = 1;
|
||||
|
||||
/** The secondary/tertiary lower limit for tailoring before any root elements. */
|
||||
static const uint32_t BEFORE_WEIGHT16 = 0x0100;
|
||||
|
||||
/**
|
||||
* Merge-sort-key separator.
|
||||
* Must not be used as the lead byte of any CE weight,
|
||||
* nor as primary compression low terminator.
|
||||
* Same as the unique primary and identical-level weights of U+FFFE.
|
||||
* Must not be used as primary compression low terminator.
|
||||
* Otherwise usable.
|
||||
*/
|
||||
static const uint8_t MERGE_SEPARATOR_BYTE = 2;
|
||||
static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE
|
||||
static const uint32_t MERGE_SEPARATOR_WEIGHT16 = 0x0200; // U+FFFE
|
||||
static const uint32_t MERGE_SEPARATOR_LOWER32 = 0x02000200; // U+FFFE
|
||||
static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000202; // U+FFFE
|
||||
static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE
|
||||
|
||||
/**
|
||||
* Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.
|
||||
|
@ -450,8 +450,8 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
|
||||
}
|
||||
nodes.setElementAt(node, index);
|
||||
int32_t nextIndex = nextIndexFromNode(node);
|
||||
// Insert default nodes with weights 02 and 05, reset to the 02 node.
|
||||
node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength);
|
||||
// Insert default nodes with weights 01 and 05, reset to the 01 node.
|
||||
node = nodeFromWeight16(Collation::BEFORE_WEIGHT16) | nodeFromStrength(strength);
|
||||
index = insertNodeBetween(index, nextIndex, node, errorCode);
|
||||
node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |
|
||||
nodeFromStrength(strength);
|
||||
@ -961,7 +961,7 @@ CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
|
||||
index = nextIndexFromNode(node);
|
||||
node = nodes.elementAti(index);
|
||||
U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
|
||||
weight16FromNode(node) == BEFORE_WEIGHT16);
|
||||
weight16FromNode(node) == Collation::BEFORE_WEIGHT16);
|
||||
// Skip to the explicit common node.
|
||||
do {
|
||||
index = nextIndexFromNode(node);
|
||||
@ -1398,7 +1398,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
|
||||
// Gap at the beginning of the tertiary CE range.
|
||||
t = rootElements.getTertiaryBoundary() - 0x100;
|
||||
tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
|
||||
} else if(t == BEFORE_WEIGHT16) {
|
||||
} else if(t == Collation::BEFORE_WEIGHT16) {
|
||||
tLimit = Collation::COMMON_WEIGHT16;
|
||||
} else if(!pIsTailored && !sIsTailored) {
|
||||
// p and s are root weights.
|
||||
@ -1441,7 +1441,7 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
|
||||
// Gap at the beginning of the secondary CE range.
|
||||
s = rootElements.getSecondaryBoundary() - 0x100;
|
||||
sLimit = rootElements.getFirstSecondaryCE() >> 16;
|
||||
} else if(s == BEFORE_WEIGHT16) {
|
||||
} else if(s == Collation::BEFORE_WEIGHT16) {
|
||||
sLimit = Collation::COMMON_WEIGHT16;
|
||||
} else if(!pIsTailored) {
|
||||
// p is a root primary.
|
||||
|
@ -215,9 +215,6 @@ private:
|
||||
|
||||
static int32_t ceStrength(int64_t ce);
|
||||
|
||||
/** The secondary/tertiary lower limit for tailoring before the common weight. */
|
||||
static const uint32_t BEFORE_WEIGHT16 = Collation::MERGE_SEPARATOR_WEIGHT16;
|
||||
|
||||
/** At most 1M nodes, limited by the 20 bits in node bit fields. */
|
||||
static const int32_t MAX_INDEX = 0xfffff;
|
||||
/**
|
||||
|
@ -136,18 +136,17 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
||||
int32_t rightStart = 0;
|
||||
for(;;) {
|
||||
// Find the merge separator or the NO_CE terminator.
|
||||
uint32_t p;
|
||||
int32_t leftLimit = leftStart;
|
||||
uint32_t leftLower32;
|
||||
while((leftLower32 = (uint32_t)left.getCE(leftLimit)) >
|
||||
Collation::MERGE_SEPARATOR_LOWER32 ||
|
||||
leftLower32 == 0) {
|
||||
while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
|
||||
Collation::MERGE_SEPARATOR_PRIMARY ||
|
||||
p == 0) {
|
||||
++leftLimit;
|
||||
}
|
||||
int32_t rightLimit = rightStart;
|
||||
uint32_t rightLower32;
|
||||
while((rightLower32 = (uint32_t)right.getCE(rightLimit)) >
|
||||
Collation::MERGE_SEPARATOR_LOWER32 ||
|
||||
rightLower32 == 0) {
|
||||
while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
|
||||
Collation::MERGE_SEPARATOR_PRIMARY ||
|
||||
p == 0) {
|
||||
++rightLimit;
|
||||
}
|
||||
|
||||
@ -175,7 +174,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
||||
// Both strings have the same number of merge separators,
|
||||
// or else there would have been a primary-level difference.
|
||||
U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
|
||||
if(left.getCE(leftLimit) == Collation::NO_CE) { break; }
|
||||
if(p == Collation::NO_CE_PRIMARY) { break; }
|
||||
// Skip both merge separators and continue.
|
||||
leftStart = leftLimit + 1;
|
||||
rightStart = rightLimit + 1;
|
||||
@ -276,20 +275,19 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
||||
|
||||
if(leftTertiary != rightTertiary) {
|
||||
if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
|
||||
// Pass through NO_CE and MERGE_SEPARATOR
|
||||
// and keep real tertiary weights larger than the MERGE_SEPARATOR.
|
||||
// Pass through NO_CE and keep real tertiary weights larger than that.
|
||||
// Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
|
||||
// to keep tertiary CEs well-formed.
|
||||
// Their case+tertiary weights must be greater than those of
|
||||
// primary and secondary CEs.
|
||||
if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
if(leftTertiary > Collation::NO_CE_WEIGHT16) {
|
||||
if(leftLower32 > 0xffff) {
|
||||
leftTertiary ^= 0xc000;
|
||||
} else {
|
||||
leftTertiary += 0x4000;
|
||||
}
|
||||
}
|
||||
if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
if(rightTertiary > Collation::NO_CE_WEIGHT16) {
|
||||
if(rightLower32 > 0xffff) {
|
||||
rightTertiary ^= 0xc000;
|
||||
} else {
|
||||
@ -316,11 +314,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
||||
do {
|
||||
int64_t ce = left.getCE(leftIndex++);
|
||||
leftQuaternary = (uint32_t)ce & 0xffff;
|
||||
if(leftQuaternary == 0) {
|
||||
// Variable primary or completely ignorable.
|
||||
if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
|
||||
// Variable primary or completely ignorable or NO_CE.
|
||||
leftQuaternary = (uint32_t)(ce >> 32);
|
||||
} else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
// Leave NO_CE or MERGE_SEPARATOR as is.
|
||||
} else {
|
||||
// Regular CE, not tertiary ignorable.
|
||||
// Preserve the quaternary weight in bits 7..6.
|
||||
@ -332,11 +328,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
||||
do {
|
||||
int64_t ce = right.getCE(rightIndex++);
|
||||
rightQuaternary = (uint32_t)ce & 0xffff;
|
||||
if(rightQuaternary == 0) {
|
||||
// Variable primary or completely ignorable.
|
||||
if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
|
||||
// Variable primary or completely ignorable or NO_CE.
|
||||
rightQuaternary = (uint32_t)(ce >> 32);
|
||||
} else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
// Leave NO_CE or MERGE_SEPARATOR as is.
|
||||
} else {
|
||||
// Regular CE, not tertiary ignorable.
|
||||
// Preserve the quaternary weight in bits 7..6.
|
||||
@ -353,7 +347,7 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
||||
}
|
||||
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
|
||||
}
|
||||
if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; }
|
||||
if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
|
||||
}
|
||||
return UCOL_EQUAL;
|
||||
}
|
||||
|
@ -262,7 +262,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
int32_t commonQuaternaries = 0;
|
||||
|
||||
uint32_t prevSecondary = 0;
|
||||
UBool anyMergeSeparators = FALSE;
|
||||
int32_t secSegmentStart = 0;
|
||||
|
||||
for(;;) {
|
||||
// No need to keep all CEs in the buffer when we write a sort key.
|
||||
@ -350,7 +350,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
uint32_t s = lower32 >> 16;
|
||||
if(s == 0) {
|
||||
// secondary ignorable
|
||||
} else if(s == Collation::COMMON_WEIGHT16) {
|
||||
} else if(s == Collation::COMMON_WEIGHT16 &&
|
||||
((options & CollationSettings::BACKWARD_SECONDARY) == 0 ||
|
||||
p != Collation::MERGE_SEPARATOR_PRIMARY)) {
|
||||
// s is a common secondary weight, and
|
||||
// backwards-secondary is off or the ce is not the merge separator.
|
||||
++commonSecondaries;
|
||||
} else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
|
||||
if(commonSecondaries != 0) {
|
||||
@ -389,16 +393,28 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
}
|
||||
// commonSecondaries == 0
|
||||
}
|
||||
// Reduce separators so that we can look for byte<=1 later.
|
||||
if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
if(s == Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
anyMergeSeparators = TRUE;
|
||||
if(0 < p && p <= Collation::MERGE_SEPARATOR_PRIMARY) {
|
||||
// The backwards secondary level compares secondary weights backwards
|
||||
// within segments separated by the merge separator (U+FFFE).
|
||||
uint8_t *secs = secondaries.data();
|
||||
int32_t last = secondaries.length() - 1;
|
||||
if(secSegmentStart < last) {
|
||||
uint8_t *p = secs + secSegmentStart;
|
||||
uint8_t *q = secs + last;
|
||||
do {
|
||||
uint8_t b = *p;
|
||||
*p++ = *q;
|
||||
*q-- = b;
|
||||
} while(p < q);
|
||||
}
|
||||
secondaries.appendByte((s >> 8) - 1);
|
||||
secondaries.appendByte(p == Collation::NO_CE_PRIMARY ?
|
||||
Collation::LEVEL_SEPARATOR_BYTE : Collation::MERGE_SEPARATOR_BYTE);
|
||||
prevSecondary = 0;
|
||||
secSegmentStart = secondaries.length();
|
||||
} else {
|
||||
secondaries.appendReverseWeight16(s);
|
||||
prevSecondary = s;
|
||||
}
|
||||
prevSecondary = s;
|
||||
}
|
||||
}
|
||||
|
||||
@ -411,19 +427,23 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
} else {
|
||||
uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lead byte
|
||||
U_ASSERT((c & 0xc0) != 0xc0);
|
||||
if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) {
|
||||
if((c & 0xc0) == 0 && c > Collation::LEVEL_SEPARATOR_BYTE) {
|
||||
++commonCases;
|
||||
} else {
|
||||
if((options & CollationSettings::UPPER_FIRST) == 0) {
|
||||
// lowerFirst: Compress common weights to nibbles 1..7..13, mixed=14, upper=15.
|
||||
if(commonCases != 0) {
|
||||
// If there are only common (=lowest) weights in the whole level,
|
||||
// then we need not write anything.
|
||||
// Level length differences are handled already on the next-higher level.
|
||||
if(commonCases != 0 &&
|
||||
(c > Collation::LEVEL_SEPARATOR_BYTE || !cases.isEmpty())) {
|
||||
--commonCases;
|
||||
while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT) {
|
||||
cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4);
|
||||
commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT;
|
||||
}
|
||||
uint32_t b;
|
||||
if(c <= Collation::MERGE_SEPARATOR_BYTE) {
|
||||
if(c <= Collation::LEVEL_SEPARATOR_BYTE) {
|
||||
b = CASE_LOWER_FIRST_COMMON_LOW + commonCases;
|
||||
} else {
|
||||
b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases;
|
||||
@ -431,7 +451,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
cases.appendByte(b << 4);
|
||||
commonCases = 0;
|
||||
}
|
||||
if(c > Collation::MERGE_SEPARATOR_BYTE) {
|
||||
if(c > Collation::LEVEL_SEPARATOR_BYTE) {
|
||||
c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4; // 14 or 15
|
||||
}
|
||||
} else {
|
||||
@ -447,11 +467,11 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + commonCases) << 4);
|
||||
commonCases = 0;
|
||||
}
|
||||
if(c > Collation::MERGE_SEPARATOR_BYTE) {
|
||||
if(c > Collation::LEVEL_SEPARATOR_BYTE) {
|
||||
c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4; // 2 or 1
|
||||
}
|
||||
}
|
||||
// c is a separator byte 01 or 02,
|
||||
// c is a separator byte 01,
|
||||
// or a left-shifted nibble 0x10, 0x20, ... 0xf0.
|
||||
cases.appendByte(c);
|
||||
}
|
||||
@ -510,14 +530,14 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
// Their case+tertiary weights must be greater than those of
|
||||
// primary and secondary CEs.
|
||||
//
|
||||
// Separators 01..02 -> 01..02 (unchanged)
|
||||
// Lowercase 03..04 -> 83..84 (includes uncased)
|
||||
// Separator 01 -> 01 (unchanged)
|
||||
// Lowercase 02..04 -> 82..84 (includes uncased)
|
||||
// Common weight 05 -> 85..C5 (common-weight compression range)
|
||||
// Lowercase 06..3F -> C6..FF
|
||||
// Mixed case 43..7F -> 43..7F
|
||||
// Uppercase 83..BF -> 03..3F
|
||||
// Mixed case 42..7F -> 42..7F
|
||||
// Uppercase 82..BF -> 02..3F
|
||||
// Tertiary CE 86..BF -> C6..FF
|
||||
if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
if(t <= Collation::NO_CE_WEIGHT16) {
|
||||
// Keep separators unchanged.
|
||||
} else if(lower32 > 0xffff) {
|
||||
// Invert case bits of primary & secondary CEs.
|
||||
@ -551,24 +571,22 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
|
||||
if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) {
|
||||
uint32_t q = lower32 & 0xffff;
|
||||
if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
if((q & 0xc0) == 0 && q > Collation::NO_CE_WEIGHT16) {
|
||||
++commonQuaternaries;
|
||||
} else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 &&
|
||||
} else if(q == Collation::NO_CE_WEIGHT16 &&
|
||||
(options & CollationSettings::ALTERNATE_MASK) == 0 &&
|
||||
(quaternaries.isEmpty() ||
|
||||
quaternaries[quaternaries.length() - 1] == Collation::MERGE_SEPARATOR_BYTE)) {
|
||||
// If alternate=non-ignorable and there are only
|
||||
// common quaternary weights between two separators,
|
||||
// then we need not write anything between these separators.
|
||||
quaternaries.isEmpty()) {
|
||||
// If alternate=non-ignorable and there are only common quaternary weights,
|
||||
// then we need not write anything.
|
||||
// The only weights greater than the merge separator and less than the common weight
|
||||
// are shifted primary weights, which are not generated for alternate=non-ignorable.
|
||||
// There are also exactly as many quaternary weights as tertiary weights,
|
||||
// so level length differences are handled already on tertiary level.
|
||||
// Any above-common quaternary weight will compare greater regardless.
|
||||
quaternaries.appendByte(q >> 8);
|
||||
quaternaries.appendByte(Collation::LEVEL_SEPARATOR_BYTE);
|
||||
} else {
|
||||
if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
||||
q >>= 8;
|
||||
if(q == Collation::NO_CE_WEIGHT16) {
|
||||
q = Collation::LEVEL_SEPARATOR_BYTE;
|
||||
} else {
|
||||
q = 0xfc + ((q >> 6) & 3);
|
||||
}
|
||||
@ -602,42 +620,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; }
|
||||
ok &= secondaries.isOk();
|
||||
sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
|
||||
uint8_t *secs = secondaries.data();
|
||||
int32_t length = secondaries.length() - 1; // Ignore the trailing NO_CE.
|
||||
if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
|
||||
// The backwards secondary level compares secondary weights backwards
|
||||
// within segments separated by the merge separator (U+FFFE, weight 02).
|
||||
// The separator weights 01 & 02 were reduced to 00 & 01 so that
|
||||
// we do not accidentally separate at a _second_ weight byte of 02.
|
||||
int32_t start = 0;
|
||||
for(;;) {
|
||||
// Find the merge separator or the NO_CE terminator.
|
||||
int32_t limit;
|
||||
if(anyMergeSeparators) {
|
||||
limit = start;
|
||||
while(secs[limit] > 1) { ++limit; }
|
||||
} else {
|
||||
limit = length;
|
||||
}
|
||||
// Reverse this segment.
|
||||
if(start < limit) {
|
||||
uint8_t *p = secs + start;
|
||||
uint8_t *q = secs + limit - 1;
|
||||
while(p < q) {
|
||||
uint8_t s = *p;
|
||||
*p++ = *q;
|
||||
*q-- = s;
|
||||
}
|
||||
}
|
||||
// Did we reach the end of the string?
|
||||
if(secs[limit] == 0) { break; }
|
||||
// Restore the merge separator.
|
||||
secs[limit] = 2;
|
||||
// Skip the merge separator and continue.
|
||||
start = limit + 1;
|
||||
}
|
||||
}
|
||||
sink.Append(reinterpret_cast<char *>(secs), length);
|
||||
secondaries.appendTo(sink);
|
||||
}
|
||||
|
||||
if((levels & Collation::CASE_LEVEL_FLAG) != 0) {
|
||||
@ -649,21 +632,12 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
||||
uint8_t b = 0;
|
||||
for(int32_t i = 0; i < length; ++i) {
|
||||
uint8_t c = (uint8_t)cases[i];
|
||||
if(c <= Collation::MERGE_SEPARATOR_BYTE) {
|
||||
U_ASSERT(c != 0);
|
||||
if(b != 0) {
|
||||
sink.Append(b);
|
||||
b = 0;
|
||||
}
|
||||
sink.Append(c);
|
||||
U_ASSERT((c & 0xf) == 0 && c != 0);
|
||||
if(b == 0) {
|
||||
b = c;
|
||||
} else {
|
||||
U_ASSERT((c & 0xf) == 0);
|
||||
if(b == 0) {
|
||||
b = c;
|
||||
} else {
|
||||
sink.Append(b | (c >> 4));
|
||||
b = 0;
|
||||
}
|
||||
sink.Append(b | (c >> 4));
|
||||
b = 0;
|
||||
}
|
||||
}
|
||||
if(b != 0) {
|
||||
|
@ -124,7 +124,7 @@ CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
|
||||
sec = elements[index] >> 16;
|
||||
} else {
|
||||
index = findPrimary(p) + 1;
|
||||
previousSec = Collation::MERGE_SEPARATOR_WEIGHT16;
|
||||
previousSec = Collation::BEFORE_WEIGHT16;
|
||||
sec = Collation::COMMON_WEIGHT16;
|
||||
}
|
||||
U_ASSERT(s >= sec);
|
||||
@ -149,12 +149,12 @@ CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) con
|
||||
previousTer = 0;
|
||||
} else {
|
||||
index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
|
||||
previousTer = Collation::MERGE_SEPARATOR_WEIGHT16;
|
||||
previousTer = Collation::BEFORE_WEIGHT16;
|
||||
}
|
||||
secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
|
||||
} else {
|
||||
index = findPrimary(p) + 1;
|
||||
previousTer = Collation::MERGE_SEPARATOR_WEIGHT16;
|
||||
previousTer = Collation::BEFORE_WEIGHT16;
|
||||
secTer = Collation::COMMON_SEC_AND_TER_CE;
|
||||
}
|
||||
uint32_t st = (s << 16) | t;
|
||||
|
@ -126,7 +126,7 @@ CollationWeights::initForSecondary() {
|
||||
maxBytes[1] = 0;
|
||||
minBytes[2] = 0;
|
||||
maxBytes[2] = 0;
|
||||
minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
|
||||
minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
|
||||
maxBytes[3] = 0xff;
|
||||
minBytes[4] = 2;
|
||||
maxBytes[4] = 0xff;
|
||||
@ -142,7 +142,7 @@ CollationWeights::initForTertiary() {
|
||||
maxBytes[2] = 0;
|
||||
// We use only 6 bits per byte.
|
||||
// The other bits are used for case & quaternary weights.
|
||||
minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
|
||||
minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
|
||||
maxBytes[3] = 0x3f;
|
||||
minBytes[4] = 2;
|
||||
maxBytes[4] = 0x3f;
|
||||
|
@ -114,6 +114,8 @@ private:
|
||||
UBool getCollationKey(const char *norm, const UnicodeString &line,
|
||||
const UChar *s, int32_t length,
|
||||
CollationKey &key, IcuTestErrorCode &errorCode);
|
||||
UBool getMergedCollationKey(const UChar *s, int32_t length,
|
||||
CollationKey &key, IcuTestErrorCode &errorCode);
|
||||
UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
|
||||
const UnicodeString &prevString, const UnicodeString &s,
|
||||
UCollationResult expectedOrder, Collation::Level expectedLevel,
|
||||
@ -172,11 +174,9 @@ void CollationTest::TestMinMax() {
|
||||
return;
|
||||
}
|
||||
int64_t ce = ces.elementAti(0);
|
||||
int64_t expected =
|
||||
((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
|
||||
Collation::MERGE_SEPARATOR_LOWER32;
|
||||
int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
|
||||
if(ce != expected) {
|
||||
errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
|
||||
errln("CE(U+fffe)=%04lx != 02..", (long)ce);
|
||||
}
|
||||
|
||||
ce = ces.elementAti(1);
|
||||
@ -617,11 +617,8 @@ UBool isValidCE(const CollationRootElements &re, const CollationData &data,
|
||||
}
|
||||
// Minimum & maximum lead bytes.
|
||||
if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
|
||||
(s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
|
||||
(t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
|
||||
return FALSE;
|
||||
}
|
||||
if(t1 != 0 && t1 > 0x3f) {
|
||||
s1 == Collation::LEVEL_SEPARATOR_BYTE ||
|
||||
t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
|
||||
return FALSE;
|
||||
}
|
||||
if(c > 2) {
|
||||
@ -1372,7 +1369,39 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// If s contains U+FFFE, check that merged segments make the same key.
|
||||
// Check that internalNextSortKeyPart() makes the same key, with several part sizes.
|
||||
static const int32_t partSizes[] = { 32, 3, 1 };
|
||||
for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
|
||||
int32_t partSize = partSizes[psi];
|
||||
CharString parts;
|
||||
if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
|
||||
infoln(fileTestName);
|
||||
errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
|
||||
norm, (int)partSize, errorCode.errorName());
|
||||
infoln(line);
|
||||
return FALSE;
|
||||
}
|
||||
if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
|
||||
infoln(fileTestName);
|
||||
errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
|
||||
norm, (int)partSize);
|
||||
infoln(line);
|
||||
infoln(printCollationKey(key));
|
||||
infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Changes the key to the merged segments of the U+FFFE-separated substrings of s.
|
||||
* Leaves key unchanged if s does not contain U+FFFE.
|
||||
* @return TRUE if the key was successfully changed
|
||||
*/
|
||||
UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
|
||||
CollationKey &key, IcuTestErrorCode &errorCode) {
|
||||
if(errorCode.isFailure()) { return FALSE; }
|
||||
LocalMemory<uint8_t> mergedKey;
|
||||
int32_t mergedKeyLength = 0;
|
||||
int32_t mergedKeyCapacity = 0;
|
||||
@ -1382,7 +1411,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
|
||||
if(i == sLength) {
|
||||
if(segmentStart == 0) {
|
||||
// s does not contain any U+FFFE.
|
||||
break;
|
||||
return FALSE;
|
||||
}
|
||||
} else if(s[i] != 0xfffe) {
|
||||
++i;
|
||||
@ -1423,41 +1452,7 @@ UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
|
||||
if(i == sLength) { break; }
|
||||
segmentStart = ++i;
|
||||
}
|
||||
if(segmentStart != 0 &&
|
||||
(mergedKeyLength != keyLength ||
|
||||
uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
|
||||
infoln(fileTestName);
|
||||
errln("Collator(%s).getCollationKey(with U+FFFE) != "
|
||||
"ucol_mergeSortkeys(segments)",
|
||||
norm);
|
||||
infoln(line);
|
||||
infoln(printCollationKey(key));
|
||||
infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Check that internalNextSortKeyPart() makes the same key, with several part sizes.
|
||||
static const int32_t partSizes[] = { 32, 3, 1 };
|
||||
for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
|
||||
int32_t partSize = partSizes[psi];
|
||||
CharString parts;
|
||||
if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
|
||||
infoln(fileTestName);
|
||||
errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
|
||||
norm, (int)partSize, errorCode.errorName());
|
||||
infoln(line);
|
||||
return FALSE;
|
||||
}
|
||||
if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
|
||||
infoln(fileTestName);
|
||||
errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
|
||||
norm, (int)partSize);
|
||||
infoln(line);
|
||||
infoln(printCollationKey(key));
|
||||
infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -1488,6 +1483,29 @@ const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
 * Returns the collation level at which two sort keys first differ.
 *
 * Returns Collation::NO_LEVEL when order is UCOL_EQUAL. Otherwise walks both
 * keys byte by byte from the start, beginning at Collation::PRIMARY_LEVEL and
 * advancing one level for each Collation::LEVEL_SEPARATOR_BYTE found in the
 * common prefix; the walk stops at the first byte that differs.
 *
 * @param prevKey          first sort key
 * @param key              second sort key
 * @param order            comparison result for the two keys; only tested against UCOL_EQUAL here
 * @param collHasCaseLevel if false, Collation::CASE_LEVEL is skipped when counting levels
 * @return Collation::NO_LEVEL, or the level reached at the first differing byte
 */
int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
|
||||
UCollationResult order, UBool collHasCaseLevel) {
|
||||
if(order == UCOL_EQUAL) {
|
||||
return Collation::NO_LEVEL;
|
||||
}
|
||||
int32_t prevKeyLength;
|
||||
const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
|
||||
int32_t keyLength;
|
||||
const uint8_t *bytes = key.getByteArray(keyLength);
|
||||
int32_t level = Collation::PRIMARY_LEVEL;
|
||||
// NOTE(review): this loop has no length bound; prevKeyLength and keyLength
// are fetched above but not consulted. It ends only at the first differing
// byte, so it presumably relies on the two keys being unequal whenever
// order != UCOL_EQUAL -- confirm at the call sites.
for(int32_t i = 0;; ++i) {
|
||||
uint8_t b = prevBytes[i];
|
||||
if(b != bytes[i]) { break; }
|
||||
// A separator byte shared by both keys means the next level starts here.
if(b == Collation::LEVEL_SEPARATOR_BYTE) {
|
||||
++level;
|
||||
// Step over the case level when collHasCaseLevel is false.
if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
|
||||
++level;
|
||||
}
|
||||
}
|
||||
}
|
||||
return level;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
|
||||
@ -1649,23 +1667,9 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
|
||||
infoln(printCollationKey(key));
|
||||
return FALSE;
|
||||
}
|
||||
UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
|
||||
int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
|
||||
if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
|
||||
int32_t prevKeyLength;
|
||||
const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
|
||||
int32_t keyLength;
|
||||
const uint8_t *bytes = key.getByteArray(keyLength);
|
||||
int32_t level = Collation::PRIMARY_LEVEL;
|
||||
for(int32_t i = 0;; ++i) {
|
||||
uint8_t b = prevBytes[i];
|
||||
if(b != bytes[i]) { break; }
|
||||
if(b == Collation::LEVEL_SEPARATOR_BYTE) {
|
||||
++level;
|
||||
if(level == Collation::CASE_LEVEL &&
|
||||
coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
|
||||
++level;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(level != expectedLevel) {
|
||||
infoln(fileTestName);
|
||||
errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
|
||||
@ -1677,6 +1681,45 @@ UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
// If either string contains U+FFFE, then their sort keys must compare the same as
|
||||
// the merged sort keys of each string's between-FFFE segments.
|
||||
//
|
||||
// It is not required that
|
||||
// sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
|
||||
// only that those two methods yield the same order.
|
||||
//
|
||||
// Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
|
||||
if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
|
||||
getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
|
||||
errorCode.isFailure()) {
|
||||
order = prevKey.compareTo(key, errorCode);
|
||||
if(order != expectedOrder || errorCode.isFailure()) {
|
||||
infoln(fileTestName);
|
||||
errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
|
||||
"(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
|
||||
(int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
|
||||
infoln(prevFileLine);
|
||||
infoln(fileLine);
|
||||
infoln(printCollationKey(prevKey));
|
||||
infoln(printCollationKey(key));
|
||||
return FALSE;
|
||||
}
|
||||
int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
|
||||
if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
|
||||
if(mergedLevel != level) {
|
||||
infoln(fileTestName);
|
||||
errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
|
||||
"(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
|
||||
(int)fileLineNumber, norm, order, mergedLevel, level);
|
||||
infoln(prevFileLine);
|
||||
infoln(fileLine);
|
||||
infoln(printCollationKey(prevKey));
|
||||
infoln(printCollationKey(key));
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user