ICU-11448 collation common tertiary weights for normal Hiragana, change builder to cope with below-common tertiary weights

X-SVN-Rev: 36907
This commit is contained in:
Markus Scherer 2014-12-27 00:39:08 +00:00
parent 590a85cbf1
commit 6c0fcd7a64
5 changed files with 259 additions and 113 deletions

View File

@ -124,7 +124,7 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
int index = findOrInsertNodeForCEs(strength);
long node = nodes.elementAti(index);
// If the index is for a "weaker" tailored node,
// If the index is for a "weaker" node,
// then skip backwards over this and further "weaker" nodes.
while(strengthFromNode(node) > strength) {
index = previousIndexFromNode(node);
@ -168,6 +168,8 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
if(strength >= Collator.TERTIARY) {
index = findCommonNode(index, Collator.TERTIARY);
}
// findCommonNode() stayed on the stronger node or moved to
// an explicit common-weight node of the reset-before strength.
node = nodes.elementAti(index);
if(strengthFromNode(node) == strength) {
// Found a same-strength node with an explicit weight.
@ -178,90 +180,45 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
"reset secondary-before secondary ignorable not possible" :
"reset tertiary-before completely ignorable not possible");
}
assert(weight16 >= Collation.COMMON_WEIGHT16);
assert(weight16 > Collation.BEFORE_WEIGHT16);
// Reset to just before this node.
// Insert the preceding same-level explicit weight if it is not there already.
// Which explicit weight immediately precedes this one?
weight16 = getWeight16Before(index, node, strength);
// Does this preceding weight have a node?
int previousWeight16;
int previousIndex = previousIndexFromNode(node);
if(weight16 == Collation.COMMON_WEIGHT16) {
// Reset to just before this same-strength common-weight node.
for(int i = previousIndex;; i = previousIndexFromNode(node)) {
node = nodes.elementAti(i);
int previousStrength = strengthFromNode(node);
if(previousStrength < strength) {
assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex);
// Either the reset element has an above-common weight and
// the parent node provides the implied common weight,
// or the reset element has a weight<=common in the node
// right after the parent, and we need to insert the preceding weight.
previousWeight16 = Collation.COMMON_WEIGHT16;
break;
} else if(previousStrength == strength && !isTailoredNode(node)) {
previousWeight16 = weight16FromNode(node);
break;
}
// Skip weaker nodes and same-level tailored nodes.
}
if(previousWeight16 == weight16) {
// The preceding weight has a node,
// maybe with following weaker or tailored nodes.
// Reset to the last of them.
index = previousIndex;
} else {
// A non-common weight is only possible from a root CE.
// Find the higher-level weights, which must all be explicit,
// and then find the preceding weight for this level.
long previousWeight16 = 0;
int previousWeightIndex = -1;
int i = index;
if(strength == Collator.SECONDARY) {
long p;
do {
i = previousIndexFromNode(node);
node = nodes.elementAti(i);
if(strengthFromNode(node) == Collator.SECONDARY && !isTailoredNode(node) &&
previousWeightIndex < 0) {
previousWeightIndex = i;
previousWeight16 = weight16FromNode(node);
}
} while(strengthFromNode(node) > Collator.PRIMARY);
assert(!isTailoredNode(node));
p = weight32FromNode(node);
weight16 = rootElements.getSecondaryBefore(p, weight16);
} else {
long p;
int s;
do {
i = previousIndexFromNode(node);
node = nodes.elementAti(i);
if(strengthFromNode(node) == Collator.TERTIARY && !isTailoredNode(node) &&
previousWeightIndex < 0) {
previousWeightIndex = i;
previousWeight16 = weight16FromNode(node);
}
} while(strengthFromNode(node) > Collator.SECONDARY);
assert(!isTailoredNode(node));
if(strengthFromNode(node) == Collator.SECONDARY) {
s = weight16FromNode(node);
do {
i = previousIndexFromNode(node);
node = nodes.elementAti(i);
} while(strengthFromNode(node) > Collator.PRIMARY);
assert(!isTailoredNode(node));
} else {
assert(!nodeHasBefore2(node));
s = Collation.COMMON_WEIGHT16;
}
p = weight32FromNode(node);
weight16 = rootElements.getTertiaryBefore(p, s, weight16);
assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
}
// Find or insert the new explicit weight before the current one.
if(previousWeightIndex >= 0 && weight16 == previousWeight16) {
// Tailor after the last node between adjacent root nodes.
index = previousIndex;
} else {
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
index = insertNodeBetween(previousIndex, index, node);
}
// Insert a node with the preceding weight, reset to that.
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
index = insertNodeBetween(previousIndex, index, node);
}
} else {
// Found a stronger node with implied strength-common weight.
long hasBefore3 = 0;
if(strength == Collator.SECONDARY) {
assert(!nodeHasBefore2(node));
// Move the HAS_BEFORE3 flag from the parent node
// to the new secondary common node.
hasBefore3 = node & HAS_BEFORE3;
node = (node & ~(long)HAS_BEFORE3) | HAS_BEFORE2;
} else {
assert(!nodeHasBefore3(node));
node |= HAS_BEFORE3;
}
nodes.setElementAt(node, index);
int nextIndex = nextIndexFromNode(node);
// Insert default nodes with weights 01 and 05, reset to the 01 node.
node = nodeFromWeight16(Collation.BEFORE_WEIGHT16) | nodeFromStrength(strength);
index = insertNodeBetween(index, nextIndex, node);
node = nodeFromWeight16(Collation.COMMON_WEIGHT16) | hasBefore3 |
nodeFromStrength(strength);
insertNodeBetween(index, nextIndex, node);
int weight16 = getWeight16Before(index, node, strength);
index = findOrInsertWeakNode(index, weight16, strength);
}
// Strength of the temporary CE = strength of its reset position.
// Code above raises an error if the before-strength is stronger.
@ -270,6 +227,52 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
}
/**
* Returns the secondary or tertiary weight preceding the current node's weight.
* node=nodes[index].
*/
private int getWeight16Before(int index, long node, int level) {
assert(strengthFromNode(node) < level || !isTailoredNode(node));
// Collect the root CE weights if this node is for a root CE.
// If it is not, then return the low non-primary boundary for a tailored CE.
int t;
if(strengthFromNode(node) == Collator.TERTIARY) {
t = weight16FromNode(node);
} else {
t = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight.
}
while(strengthFromNode(node) > Collator.SECONDARY) {
index = previousIndexFromNode(node);
node = nodes.elementAti(index);
}
if(isTailoredNode(node)) {
return Collation.BEFORE_WEIGHT16;
}
int s;
if(strengthFromNode(node) == Collator.SECONDARY) {
s = weight16FromNode(node);
} else {
s = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight.
}
while(strengthFromNode(node) > Collator.PRIMARY) {
index = previousIndexFromNode(node);
node = nodes.elementAti(index);
}
if(isTailoredNode(node)) {
return Collation.BEFORE_WEIGHT16;
}
// [p, s, t] is a root CE. Return the preceding weight for the requested level.
long p = weight32FromNode(node);
int weight16;
if(level == Collator.SECONDARY) {
weight16 = rootElements.getSecondaryBefore(p, s);
} else {
weight16 = rootElements.getTertiaryBefore(p, s, t);
assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
}
return weight16;
}
private long getSpecialResetPosition(CharSequence str) {
assert(str.length() == 2);
long ce;
@ -557,7 +560,7 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
// down to the requested level/strength.
// Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
assert((ce & 0xc0) == 0);
int index = findOrInsertNodeForPrimary(ce >>> 32 );
int index = findOrInsertNodeForPrimary(ce >>> 32);
if(strength >= Collator.SECONDARY) {
int lower32 = (int)ce;
index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY);
@ -619,17 +622,44 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
/** Finds or inserts the node for a secondary or tertiary weight. */
private int findOrInsertWeakNode(int index, int weight16, int level) {
assert(0 <= index && index < nodes.size());
assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY);
assert(weight16 == 0 || weight16 >= Collation.COMMON_WEIGHT16);
// Only reset-before inserts common weights.
if(weight16 == Collation.COMMON_WEIGHT16) {
return findCommonNode(index, level);
}
// If this will be the first below-common weight for the parent node,
// then we will also need to insert a common weight after it.
long node = nodes.elementAti(index);
assert(strengthFromNode(node) < level); // parent node is stronger
if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) {
int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
if((node & hasThisLevelBefore) == 0) {
// The parent node has an implied level-common weight.
long commonNode =
nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level);
if(level == Collator.SECONDARY) {
// Move the HAS_BEFORE3 flag from the parent node
// to the new secondary common node.
commonNode |= node & HAS_BEFORE3;
node &= ~(long)HAS_BEFORE3;
}
nodes.setElementAt(node | hasThisLevelBefore, index);
// Insert below-common-weight node.
int nextIndex = nextIndexFromNode(node);
node = nodeFromWeight16(weight16) | nodeFromStrength(level);
index = insertNodeBetween(index, nextIndex, node);
// Insert common-weight node.
insertNodeBetween(index, nextIndex, commonNode);
// Return index of below-common-weight node.
return index;
}
}
// Find the root CE's weight for this level.
// Postpone insertion if not found:
// Insert the new root node before the next stronger node,
// or before the next root node with the same strength and a larger weight.
long node = nodes.elementAti(index);
int nextIndex;
while((nextIndex = nextIndexFromNode(node)) != 0) {
node = nodes.elementAti(nextIndex);
@ -709,7 +739,7 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
/**
* Finds the node which implies or contains a common=05 weight of the given strength
* (secondary or tertiary).
* (secondary or tertiary), if the current node is stronger.
* Skips weaker nodes and tailored nodes if the current node is stronger
* and is followed by an explicit-common-weight node.
* Always returns the input index if that node is no stronger than the given strength.
@ -728,13 +758,14 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
index = nextIndexFromNode(node);
node = nodes.elementAti(index);
assert(!isTailoredNode(node) && strengthFromNode(node) == strength &&
weight16FromNode(node) == Collation.BEFORE_WEIGHT16);
weight16FromNode(node) < Collation.COMMON_WEIGHT16);
// Skip to the explicit common node.
do {
index = nextIndexFromNode(node);
node = nodes.elementAti(index);
assert(strengthFromNode(node) >= strength);
} while(isTailoredNode(node) || strengthFromNode(node) > strength);
} while(isTailoredNode(node) || strengthFromNode(node) > strength ||
weight16FromNode(node) < Collation.COMMON_WEIGHT16);
assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16);
return index;
}
@ -1097,6 +1128,9 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
CollationWeights secondaries = new CollationWeights();
CollationWeights tertiaries = new CollationWeights();
long[] nodesArray = nodes.getBuffer();
if(DEBUG) {
System.out.println("\nCollationBuilder.makeTailoredCEs()");
}
for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
int i = rootPrimaryIndexes.elementAti(rpi);
@ -1143,11 +1177,11 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
// Gap at the beginning of the tertiary CE range.
t = rootElements.getTertiaryBoundary() - 0x100;
tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK;
} else if(t == Collation.BEFORE_WEIGHT16) {
tLimit = Collation.COMMON_WEIGHT16;
} else if(!pIsTailored && !sIsTailored) {
// p and s are root weights.
tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
} else if(t == Collation.BEFORE_WEIGHT16) {
tLimit = Collation.COMMON_WEIGHT16;
} else {
// [p, s] is tailored.
assert(t == Collation.COMMON_WEIGHT16);
@ -1185,11 +1219,11 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
// Gap at the beginning of the secondary CE range.
s = rootElements.getSecondaryBoundary() - 0x100;
sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16);
} else if(s == Collation.BEFORE_WEIGHT16) {
sLimit = Collation.COMMON_WEIGHT16;
} else if(!pIsTailored) {
// p is a root primary.
sLimit = rootElements.getSecondaryAfter(pIndex, s);
} else if(s == Collation.BEFORE_WEIGHT16) {
sLimit = Collation.COMMON_WEIGHT16;
} else {
// p is a tailored primary.
assert(s == Collation.COMMON_WEIGHT16);
@ -1203,6 +1237,11 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
secondaries.initForSecondary();
if(!secondaries.allocWeights(s, sLimit, sCount)) {
// C++ U_BUFFER_OVERFLOW_ERROR
if(DEBUG) {
System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n",
alignWeightRight(s), alignWeightRight(sLimit),
alignWeightRight(sCount));
}
throw new UnsupportedOperationException("secondary tailoring gap too small");
}
sIsTailored = true;
@ -1377,15 +1416,13 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
/** At most 1M nodes, limited by the 20 bits in node bit fields. */
private static final int MAX_INDEX = 0xfffff;
/**
* Node bit 6 is set on a primary node if there are tailored nodes
* with secondary values below the common secondary weight (05),
* from a reset-secondary-before (&[before 2]).
* Node bit 6 is set on a primary node if there are nodes
* with secondary values below the common secondary weight (05).
*/
private static final int HAS_BEFORE2 = 0x40;
/**
* Node bit 5 is set on a primary or secondary node if there are tailored nodes
* with tertiary values below the common tertiary weight (05),
* from a reset-tertiary-before (&[before 3]).
* Node bit 5 is set on a primary or secondary node if there are nodes
* with tertiary values below the common tertiary weight (05).
*/
private static final int HAS_BEFORE3 = 0x20;
/**
@ -1496,15 +1533,16 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
* A node of a given strength normally implies "common" weights on weaker levels.
*
* A node with HAS_BEFORE2 must be immediately followed by
* a secondary node with BEFORE_WEIGHT16, then a secondary tailored node,
* a secondary node with an explicit below-common weight, then a secondary tailored node,
* and later an explicit common-secondary node.
* (&[before 2] resets to the BEFORE_WEIGHT16 node so that
* The below-common weight can be a root weight,
* or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
* or before the lowest root weight.
* (&[before 2] resets to an explicit secondary node so that
* the following addRelation(secondary) tailors right after that.
* If we did not have this node and instead were to reset on the primary node,
* then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
*
* All secondary tailored nodes between these two explicit ones
* will be assigned lower-than-common secondary weights.
* If the flag is not set, then there are no explicit secondary nodes
* with the common or lower weights.
*

View File

@ -254,7 +254,7 @@ public final class CollationRootElements {
} else {
index = findPrimary(p) + 1;
previousSec = Collation.BEFORE_WEIGHT16;
sec = Collation.COMMON_WEIGHT16;
sec = (int)getFirstSecTerForPrimary(index) >>> 16;
}
assert(s >= sec);
while(s > sec) {
@ -285,7 +285,7 @@ public final class CollationRootElements {
} else {
index = findPrimary(p) + 1;
previousTer = Collation.BEFORE_WEIGHT16;
secTer = Collation.COMMON_SEC_AND_TER_CE;
secTer = getFirstSecTerForPrimary(index);
}
long st = ((long)s << 16) | t;
while(st > secTer) {
@ -339,37 +339,54 @@ public final class CollationRootElements {
/**
* Returns the secondary weight after [p, s] where index=findPrimary(p)
* except use index=0 for p=0.
*
* <p>Must return a weight for every root [p, s] as well as for every weight
* returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
*
* <p>Exception: [0, 0] is handled by the CollationBuilder:
* Both its lower and upper boundaries are special.
*/
int getSecondaryAfter(int index, int s) {
long secTer;
int secLimit;
if(index == 0) {
// primary = 0
assert(s != 0);
index = (int)elements[IX_FIRST_SECONDARY_INDEX];
secTer = elements[index];
// Gap at the end of the secondary CE range.
secLimit = 0x10000;
} else {
assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]);
++index;
secTer = getFirstSecTerForPrimary(index + 1);
// If this is an explicit sec/ter unit, then it will be read once more.
// Gap for secondaries of primary CEs.
secLimit = getSecondaryBoundary();
}
for(;;) {
long secTer = elements[index];
if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
int sec = (int)(secTer >> 16);
if(sec > s) { return sec; }
++index;
secTer = elements[++index];
if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
}
}
/**
* Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
* except use index=0 for p=0.
*
* <p>Must return a weight for every root [p, s, t] as well as for every weight
* returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
*
* <p>Exception: [0, 0, 0] is handled by the CollationBuilder:
* Both its lower and upper boundaries are special.
*/
int getTertiaryAfter(int index, int s, int t) {
long secTer;
int terLimit;
if(index == 0) {
// primary = 0
if(s == 0) {
assert(t != 0);
index = (int)elements[IX_FIRST_TERTIARY_INDEX];
// Gap at the end of the tertiary CE range.
terLimit = 0x4000;
@ -378,22 +395,44 @@ public final class CollationRootElements {
// Gap for tertiaries of primary/secondary CEs.
terLimit = getTertiaryBoundary();
}
secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
} else {
assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]);
++index;
secTer = getFirstSecTerForPrimary(index + 1);
// If this is an explicit sec/ter unit, then it will be read once more.
terLimit = getTertiaryBoundary();
}
long st = (((long)s & 0xffffffffL) << 16) | t;
for(;;) {
long secTer = elements[index];
if(secTer > st) {
assert((secTer >> 16) == s);
return (int)secTer & 0xffff;
}
secTer = elements[++index];
// No tertiary greater than t for this primary+secondary.
if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
secTer &= ~SEC_TER_DELTA_FLAG;
if(secTer > st) { return (int)secTer & 0xffff; }
++index;
}
}
/**
* Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
*/
private long getFirstSecTerForPrimary(int index) {
long secTer = elements[index];
if((secTer & SEC_TER_DELTA_FLAG) == 0) {
// No sec/ter delta.
return Collation.COMMON_SEC_AND_TER_CE;
}
secTer &= ~SEC_TER_DELTA_FLAG;
if(secTer > Collation.COMMON_SEC_AND_TER_CE) {
// Implied sec/ter.
return Collation.COMMON_SEC_AND_TER_CE;
}
// Explicit sec/ter below common/common.
return secTer;
}
/**
* Finds the largest index i where elements[i]<=p.
* Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75ed6e7d7359c336e87fbc632981e5e26fbbab5550b862fb7e00b2a5acf33078
size 11802139
oid sha256:d7bf72e445a207052fe2e2de0d70a989b69bc55da3df272f8e3096d6d9cb2ad0
size 11801973

View File

@ -2476,3 +2476,53 @@
* compare
<1 AA
<2 aą
** test: tailor tertiary-after a common tertiary where there is a lower one
# Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one.
# See ICU ticket 11448 & CLDR ticket 7222.
@ rules
&あ<<<x<<<y<<<z
* compare
<1 ぁ
<3 あ
<3 x
<3 y
<3 z
<3 ァ
<1 い
** test: tailor tertiary-after a below-common tertiary
@ rules
&ぁ<<<x<<<y<<<z
* compare
<1 ぁ
<3 x
<3 y
<3 z
<3 あ
<3 ァ
<1 い
** test: tailor tertiary-before a common tertiary where there is a lower one
@ rules
&[before 3]あ<<<x<<<y<<<z
* compare
<1 ぁ
<3 x
<3 y
<3 z
<3 あ
<3 ァ
<1 い
** test: tailor tertiary-before a below-common tertiary
@ rules
&[before 3]ぁ<<<x<<<y<<<z
* compare
<1 x
<3 y
<3 z
<3 ぁ
<3 あ
<3 ァ
<1 い

View File

@ -811,7 +811,26 @@ public class CollationTest extends TestFmwk {
// Simple primary CE.
++index;
pri = p;
secTer = Collation.COMMON_SEC_AND_TER_CE;
// Does this have an explicit below-common sec/ter unit,
// or does it imply a common one?
if(index == length) {
secTer = Collation.COMMON_SEC_AND_TER_CE;
} else {
secTer = elements[index];
if((secTer & CollationRootElements.SEC_TER_DELTA_FLAG) == 0) {
// No sec/ter delta.
secTer = Collation.COMMON_SEC_AND_TER_CE;
} else {
secTer &= ~CollationRootElements.SEC_TER_DELTA_FLAG;
if(secTer > Collation.COMMON_SEC_AND_TER_CE) {
// Implied sec/ter.
secTer = Collation.COMMON_SEC_AND_TER_CE;
} else {
// Explicit sec/ter below common/common.
++index;
}
}
}
return true;
}