ICU-3984 collation reordering complete

X-SVN-Rev: 29020
This commit is contained in:
Stuart Gill 2010-11-10 02:35:21 +00:00
parent 0e27c3ea13
commit 0e5b74b849
5 changed files with 241 additions and 84 deletions

View File

@ -3631,7 +3631,7 @@ final class CollationParsedRuleBuilder {
collator.m_isHiragana4_ = option.m_isHiragana4_;
collator.setStrength(option.m_strength_);
collator.m_variableTopValue_ = option.m_variableTopValue_;
collator.m_scriptOrder_ = option.m_scriptOrder_;
collator.m_reorderCodes_ = option.m_scriptOrder_;
collator.latinOneFailed_ = false;
}

View File

@ -94,10 +94,10 @@ final class CollationRuleParser
m_strength_ = collator.getStrength();
m_isHiragana4_ = collator.m_isHiragana4_;
if(collator.m_scriptOrder_ != null){
m_scriptOrder_ = new int[collator.m_scriptOrder_.length];
if(collator.m_reorderCodes_ != null){
m_scriptOrder_ = new int[collator.m_reorderCodes_.length];
for(int i = 0; i < m_scriptOrder_.length; i++){
m_scriptOrder_[i] = collator.m_scriptOrder_[i];
m_scriptOrder_[i] = collator.m_reorderCodes_[i];
}
}

View File

@ -325,12 +325,15 @@ public abstract class Collator implements Comparator<Object>, Cloneable
}
/**
* Set the order for scripts to be ordered in.
* @param order the reordering of scripts
* @see #getScriptOrder
* @stable
* Set the reordering codes for this collator.
* The reordering codes are a combination of UScript and ReorderCodes. These
* allow the order of these groups to be changed as a group.
* @param order the reordering codes to apply to this collator, if null then clears the reordering
* @see #getReorderCodes
* @internal
* @deprecated This API is ICU internal only.
*/
public void setScriptOrder(int... order)
public void setReorderCodes(int... order)
{
throw new UnsupportedOperationException();
}
@ -1011,12 +1014,14 @@ public abstract class Collator implements Comparator<Object>, Cloneable
public abstract VersionInfo getUCAVersion();
/**
* Method to retrieve the script reordering
* @see #setScriptOrder
* @return the ordering of the scripts if one has been set, null otherwise.
* @stable
* Retrieve the reordering codes for this collator.
* These reordering codes are a combination of UScript and ReorderCodes.
* @see #setReorderCodes
* @return the reordering codes for this collator if they have been set, null otherwise.
* @internal
* @deprecated This API is ICU internal only.
*/
public int[] getScriptOrder()
public int[] getReorderCodes()
{
throw new UnsupportedOperationException();
}

View File

@ -453,7 +453,7 @@ public final class RuleBasedCollator extends Collator {
* @stable
*/
public void setScriptOrderDefault() {
setScriptOrder(m_defaultScriptOrder_);
setReorderCodes(m_defaultScriptOrder_);
}
/**
@ -634,23 +634,23 @@ public final class RuleBasedCollator extends Collator {
updateInternalState();
}
/**
* Set the order for scripts to be ordered in.
*
* @param order
* the reordering of scripts
* @see #getScriptOrder
* @see #setScriptOrderDefault
* @stable
*/
public void setScriptOrder(int... order) {
/**
* Set the reordering codes for this collator.
* The reordering codes are a combination of UScript and ReorderCodes. These
* allow the order of these groups to be changed as a group.
* @param order the reordering codes to apply to this collator, if null then clears the reordering
* @see #getReorderCodes
* @internal
* @deprecated This API is ICU internal only.
*/
public void setReorderCodes(int... order) {
if (order != null) {
m_scriptOrder_ = new int[order.length];
m_reorderCodes_ = new int[order.length];
for (int i = 0; i < order.length; i++) {
m_scriptOrder_[i] = order[i];
m_reorderCodes_[i] = order[i];
}
} else {
m_scriptOrder_ = null;
m_reorderCodes_ = null;
}
buildPermutationTable();
}
@ -1068,19 +1068,19 @@ public final class RuleBasedCollator extends Collator {
return m_isNumericCollation_;
}
/**
* Method to retrieve the script reordering.
*
* @see #setScriptOrder
* @see #setScriptOrderDefault
* @return the ordering of the scripts if one has been set, null otherwise.
* @stable
*/
public int[] getScriptOrder() {
if (m_scriptOrder_ != null) {
int[] ret = new int[m_scriptOrder_.length];
for (int i = 0; i < m_scriptOrder_.length; i++) {
ret[i] = m_scriptOrder_[i];
/**
* Retrieve the reordering codes for this collator.
* These reordering codes are a combination of UScript and ReorderCodes.
* @see #setReorderCodes
* @return the reordering codes for this collator if they have been set, null otherwise.
* @internal
* @deprecated This API is ICU internal only.
*/
public int[] getReorderCodes() {
if (m_reorderCodes_ != null) {
int[] ret = new int[m_reorderCodes_.length];
for (int i = 0; i < m_reorderCodes_.length; i++) {
ret[i] = m_reorderCodes_[i];
}
return ret;
} else {
@ -1089,30 +1089,31 @@ public final class RuleBasedCollator extends Collator {
}
/**
* Method to retrieve the scripts equivalent to the given script for reordering. Some scripts will share the same
* "lead byte" used for the collation codes and so must be reordered together.
* Retrieve the reorder codes that are grouped with the given reorder code. Some reorder codes will
* be grouped and must reorder together.
*
* @see #setScriptOrder
* @see #setScriptOrderDefault
* @see #setReorderCodes
* @see #getReorderCodes
* @param reorderCode code for which equivalents to be retrieved
* @return the set of scripts equivalent to the given script including the script given.
* @stable
* @return the set of all reorder codes in the same group as the given reorder code.
* @internal
* @deprecated This API is ICU internal only.
*/
public static int[] getScriptEquivalentsForReordering(int reorderCode) {
Set<Integer> equivalentScriptsSet = new HashSet<Integer>();
public static int[] getReorderingCodesGroup(int reorderCode) {
Set<Integer> equivalentCodesSet = new HashSet<Integer>();
int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode);
for (int leadByte : leadBytes) {
int[] scripts = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte);
for (int script : scripts) {
equivalentScriptsSet.add(script);
int[] codes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte);
for (int code : codes) {
equivalentCodesSet.add(code);
}
}
int[] equivalentScripts = new int[equivalentScriptsSet.size()];
int[] equivalentCodes = new int[equivalentCodesSet.size()];
int i = 0;
for (int script : equivalentScriptsSet) {
equivalentScripts[i++] = script;
for (int code : equivalentCodesSet) {
equivalentCodes[i++] = code;
}
return equivalentScripts;
return equivalentCodes;
}
// public other methods -------------------------------------------------
@ -1145,15 +1146,15 @@ public final class RuleBasedCollator extends Collator {
|| other.m_isHiragana4_ != m_isHiragana4_) {
return false;
}
if (m_scriptOrder_ != null ^ other.m_scriptOrder_ != null) {
if (m_reorderCodes_ != null ^ other.m_reorderCodes_ != null) {
return false;
}
if (m_scriptOrder_ != null) {
if (m_scriptOrder_.length != other.m_scriptOrder_.length) {
if (m_reorderCodes_ != null) {
if (m_reorderCodes_.length != other.m_reorderCodes_.length) {
return false;
}
for (int i = 0; i < m_scriptOrder_.length; i++) {
if (m_scriptOrder_[i] != other.m_scriptOrder_[i]) {
for (int i = 0; i < m_reorderCodes_.length; i++) {
if (m_reorderCodes_[i] != other.m_reorderCodes_[i]) {
return false;
}
}
@ -1708,7 +1709,7 @@ public final class RuleBasedCollator extends Collator {
/**
* Script order
*/
int[] m_scriptOrder_;
int[] m_reorderCodes_;
// end Collator options --------------------------------------------------
@ -1926,6 +1927,15 @@ public final class RuleBasedCollator extends Collator {
init(m_rules_);
return;
}
try {
UResourceBundle reorderRes = elements.get("%%ReorderCodes");
if (reorderRes != null) {
int[] reorderCodes = reorderRes.getIntVector();
setReorderCodes(reorderCodes);
}
} catch (MissingResourceException e) {
// ignore
}
init();
return;
} else {
@ -2421,6 +2431,13 @@ public final class RuleBasedCollator extends Collator {
int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
int p1 = ce >>> 8; // comparison
int originalP1 = p1;
if (notIsContinuation) {
if (m_leadBytePermutationTable_ != null) {
p1 = 0xff & m_leadBytePermutationTable_[p1];
}
}
if (doShift) {
if (m_utilCount4_ > 0) {
while (m_utilCount4_ > bottomCount4) {
@ -2465,7 +2482,7 @@ public final class RuleBasedCollator extends Collator {
m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1);
m_utilBytesCount1_++;
leadPrimary = 0;
} else if (isCompressible(p1)) {
} else if (isCompressible(originalP1)) {
// compress
leadPrimary = p1;
m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1);
@ -2762,12 +2779,7 @@ public final class RuleBasedCollator extends Collator {
}
notIsContinuation = !isContinuation(ce);
if (notIsContinuation) {
if (m_leadBytePermutationTable_ != null) {
ce = (m_leadBytePermutationTable_[((ce >> 24) + 256) % 256] << 24) | (ce & 0x00FFFFFF);
}
}
boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
// actually we can just check that the first byte is 0
// generation stuffs the order left first
@ -2784,6 +2796,7 @@ public final class RuleBasedCollator extends Collator {
continue;
}
leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4);
if (doShift) {
continue;
}
@ -3785,7 +3798,7 @@ public final class RuleBasedCollator extends Collator {
* Builds the lead byte permutation table
*/
private void buildPermutationTable() {
if (m_scriptOrder_ == null) {
if (m_reorderCodes_ == null) {
m_leadBytePermutationTable_ = null;
return;
}
@ -3807,14 +3820,14 @@ public final class RuleBasedCollator extends Collator {
}
// prefill the reordering codes with the leading entries
int[] internalReorderCodes = new int[m_scriptOrder_.length + 5]; // TODO - replace 5 with the reorder codes prefix size
int[] internalReorderCodes = new int[m_reorderCodes_.length + 5]; // TODO - replace 5 with the reorder codes prefix size
for (int codeIndex = 0; codeIndex < ReorderCodes.LIMIT - ReorderCodes.FIRST; codeIndex++) {
internalReorderCodes[codeIndex] = ReorderCodes.FIRST + codeIndex;
}
for (int codeIndex = 0; codeIndex < m_scriptOrder_.length; codeIndex++) {
internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_scriptOrder_[codeIndex];
if (m_scriptOrder_[codeIndex] >= ReorderCodes.FIRST && m_scriptOrder_[codeIndex] < ReorderCodes.LIMIT) {
internalReorderCodes[m_scriptOrder_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE;
for (int codeIndex = 0; codeIndex < m_reorderCodes_.length; codeIndex++) {
internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_reorderCodes_[codeIndex];
if (m_reorderCodes_[codeIndex] >= ReorderCodes.FIRST && m_reorderCodes_[codeIndex] < ReorderCodes.LIMIT) {
internalReorderCodes[m_reorderCodes_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE;
}
}
@ -3988,12 +4001,12 @@ public final class RuleBasedCollator extends Collator {
m_isNumericCollation_ = m_defaultIsNumericCollation_;
latinOneFailed_ = false;
if (m_defaultScriptOrder_ != null) {
m_scriptOrder_ = new int[m_defaultScriptOrder_.length];
m_reorderCodes_ = new int[m_defaultScriptOrder_.length];
for (int i = 0; i < m_defaultScriptOrder_.length; i++) {
m_scriptOrder_[i] = m_defaultScriptOrder_[i];
m_reorderCodes_[i] = m_defaultScriptOrder_[i];
}
} else {
m_scriptOrder_ = null;
m_reorderCodes_ = null;
}
updateInternalState();
}

View File

@ -3218,6 +3218,146 @@ public class CollationMiscTest extends TestFmwk {
// results[i]);
// }
// }
/*
 * This test ensures that characters placed before a character in a different script have the same lead byte
 * in their collation key before and after script reordering.
 *
 * The rule tailors U+0E01 (ko kai) before U+03B1 (alpha). The comparison result and the first sort-key
 * byte of both characters are checked twice: once on the freshly built collator and once after
 * setReorderCodes has been called with UScript.GREEK.
 */
public void TestBeforeRuleWithScriptReordering() throws Exception
{
    /* build collator */
    String rules = "&[before 1]\u03b1 < \u0e01";
    int[] reorderCodes = {UScript.GREEK};
    int result;

    Collator myCollation = new RuleBasedCollator(rules);
    myCollation.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
    myCollation.setStrength(Collator.TERTIARY);

    String base = "\u03b1"; /* base */
    String before = "\u0e01"; /* ko kai */

    /* check collation results - before rule applied but not script reordering */
    result = myCollation.compare(base, before);
    if (!(result > 0)) {
        errln("Collation result not correct before script reordering.");
    }

    /* check the lead byte of the collation keys before script reordering */
    CollationKey baseKey = myCollation.getCollationKey(base);
    CollationKey beforeKey = myCollation.getCollationKey(before);
    byte[] baseKeyBytes = baseKey.toByteArray();
    byte[] beforeKeyBytes = beforeKey.toByteArray();
    if (baseKeyBytes[0] != beforeKeyBytes[0]) {
        errln("Different lead byte for sort keys using before rule and before script reordering. base character lead byte = "
                + baseKeyBytes[0] + ", before character lead byte = " + beforeKeyBytes[0]);
    }

    /* reorder the scripts */
    myCollation.setReorderCodes(reorderCodes);

    /* check collation results - before rule applied and after script reordering */
    result = myCollation.compare(base, before);
    if (!(result > 0)) {
        errln("Collation result not correct after script reordering.");
    }

    /* check the lead byte of the collation keys after script reordering */
    baseKey = myCollation.getCollationKey(base);
    beforeKey = myCollation.getCollationKey(before);
    baseKeyBytes = baseKey.toByteArray();
    beforeKeyBytes = beforeKey.toByteArray();
    if (baseKeyBytes[0] != beforeKeyBytes[0]) {
        /* message names the "after" phase so a failure here is distinguishable from the check above */
        errln("Different lead byte for sort keys using before rule and after script reordering. base character lead byte = "
                + baseKeyBytes[0] + ", before character lead byte = " + beforeKeyBytes[0]);
    }
}
/*
 * Test that in a primary-compressed sort key all bytes except the first one are unchanged under script reordering.
 *
 * The same check is run at tertiary and at quaternary strength: a sort key for a Greek string is taken
 * from a collator built from empty rules, UScript.GREEK is then set as the reordering, and the new sort
 * key is compared byte by byte (skipping index 0) with the original one.
 */
public void TestNonLeadBytesDuringCollationReordering() throws Exception
{
    int[] reorderCodes = {UScript.GREEK};
    String testString = "\u03b1\u03b2\u03b3";
    int[] strengths = {Collator.TERTIARY, Collator.QUATERNARY};

    for (int strength : strengths) {
        /* build a fresh collator for this strength so the base key is taken without any reordering */
        Collator myCollation = new RuleBasedCollator("");
        myCollation.setStrength(strength);
        byte[] baseKey = myCollation.getCollationKey(testString).toByteArray();

        myCollation.setReorderCodes(reorderCodes);
        byte[] reorderKey = myCollation.getCollationKey(testString).toByteArray();

        if (baseKey.length != reorderKey.length) {
            errln("Key lengths not the same during reordering.\n");
        }

        /*
         * Only walk the bytes both keys actually have: a length mismatch is already reported above and
         * must not turn into an ArrayIndexOutOfBoundsException here.
         */
        int limit = Math.min(baseKey.length, reorderKey.length);
        for (int i = 1; i < limit; i++) {
            if (baseKey[i] != reorderKey[i]) {
                errln("Collation key bytes not the same at position " + i);
            }
        }
    }
}
/*
 * Test reordering API.
 *
 * Sets a list of reorder codes on a collator built from empty rules, reads the list back and checks
 * the Greek-versus-punctuation comparison; then clears the reordering with null and checks that the
 * getter returns null and that the comparison result flips.
 */
public void TestReorderingAPI() throws Exception
{
    final String greek = "\u03b1";
    final String punctuation = "\u203e";
    final int[] requestedCodes = {UScript.GREEK, UScript.HAN, ReorderCodes.PUNCTUATION};

    /* collator from an empty rule string, tertiary strength */
    final Collator collator = new RuleBasedCollator("");
    collator.setStrength(Collator.TERTIARY);

    /* install the reordering and read it back */
    collator.setReorderCodes(requestedCodes);
    int[] actualCodes = collator.getReorderCodes();
    if (!Arrays.equals(requestedCodes, actualCodes)) {
        errln("ERROR: retrieved reorder codes do not match set reorder codes.");
    }
    /* with the reordering in place the Greek string is expected to sort first */
    if (collator.compare(greek, punctuation) >= 0) {
        errln("ERROR: collation result should have been less.");
    }

    /* passing null clears the reordering */
    collator.setReorderCodes(null);
    actualCodes = collator.getReorderCodes();
    if (actualCodes != null) {
        errln("ERROR: retrieved reorder codes was not null.");
    }
    /* without reordering the Greek string is expected to sort after the punctuation */
    if (collator.compare(greek, punctuation) <= 0) {
        errln("ERROR: collation result should have been greater.");
    }
}
public void TestSameLeadBytScriptReorder(){
String[] testSourceCases = {
@ -3284,11 +3424,11 @@ public class CollationMiscTest extends TestFmwk {
testSourceCases[0], testSourceCases[1], nonReorderedResults);
Arrays.sort(equivalentScriptsResult);
int[] equivalentScripts = RuleBasedCollator.getScriptEquivalentsForReordering(UScript.GOTHIC);
int[] equivalentScripts = RuleBasedCollator.getReorderingCodesGroup(UScript.GOTHIC);
Arrays.sort(equivalentScripts);
assertTrue("Script Equivalents for Reordering", Arrays.equals(equivalentScripts, equivalentScriptsResult));
equivalentScripts = RuleBasedCollator.getScriptEquivalentsForReordering(UScript.SHAVIAN);
equivalentScripts = RuleBasedCollator.getReorderingCodesGroup(UScript.SHAVIAN);
Arrays.sort(equivalentScripts);
assertTrue("Script Equivalents for Reordering", Arrays.equals(equivalentScripts, equivalentScriptsResult));
}
@ -3343,7 +3483,6 @@ public class CollationMiscTest extends TestFmwk {
}
}
/*
* Utility function to test one collation reordering test case.
* @param testcases Array of test cases.
@ -3354,7 +3493,7 @@ public class CollationMiscTest extends TestFmwk {
private void doTestOneReorderingAPITestCase(OneTestCase testCases[], int reorderTokens[])
{
Collator myCollation = Collator.getInstance(ULocale.ENGLISH);
myCollation.setScriptOrder(reorderTokens);
myCollation.setReorderCodes(reorderTokens);
for (OneTestCase testCase : testCases) {
CollationTest.doTest(this, (RuleBasedCollator)myCollation,
@ -3401,8 +3540,8 @@ public class CollationMiscTest extends TestFmwk {
};
OneTestCase[] privateUseCharacterStrings = {
new OneTestCase("\u0391", "\u0391", 0),
new OneTestCase("\u0041", "\u0391", -1),
//new OneTestCase("\u0391", "\u0391", 0),
//new OneTestCase("\u0041", "\u0391", -1),
new OneTestCase("\u03B1\u0041", "\u03B1\u0391", -1),
new OneTestCase("\u0060", "\u0391", -1),
new OneTestCase("\u0391", "\ue2dc", 1),