New d5 after discussions with Ken; added HOMELESS in Fractional UCA

X-SVN-Rev: 8969
This commit is contained in:
Mark Davis 2002-06-28 01:59:58 +00:00
parent 46138ef4f3
commit 72a043bed7
3 changed files with 464 additions and 137 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2002/06/22 21:02:16 $
* $Revision: 1.7 $
* $Date: 2002/06/28 01:59:58 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -18,7 +18,7 @@ import com.ibm.text.utility.*;
public class Main {
static final String UCDVersion = "";
static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA",
static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA",
"WriteRules", "WriteRulesWithNames", "WriteRulesXML",
"writeconformance", "writeconformanceshifted",
"short", "writeconformance", "writeconformanceshifted",
@ -65,7 +65,7 @@ public class Main {
else if (arg.equalsIgnoreCase("WriteRulesXML")) WriteCollationData.writeRules(WriteCollationData.IN_XML);
else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) WriteCollationData.checkDisjointIgnorables();
else if (arg.equalsIgnoreCase("writeContractions")) WriteCollationData.writeContractions();
else if (arg.equalsIgnoreCase("FractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA");
else if (arg.equalsIgnoreCase("writeFractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA");
else if (arg.equalsIgnoreCase("writeConformance")) WriteCollationData.writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint);
else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) WriteCollationData.writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint);
else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) WriteCollationData.testCompatibilityCharacters();
@ -80,7 +80,7 @@ public class Main {
System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)");
System.out.println("\tWriteRulesXML, WriteRulesWithNames, WriteRules,");
System.out.println("\tcheckDisjointIgnorables, writeContractions,");
System.out.println("\tFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,");
System.out.println("\twriteFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,");
System.out.println("\twriteCollationValidityLog, writeCaseExceptions, writeJavascriptInfo, writeCaseFolding");
System.out.println("\tjavatest, hex (used for conformance)");
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2002/06/24 15:25:10 $
* $Revision: 1.15 $
* $Date: 2002/06/28 01:59:58 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -75,7 +75,8 @@ final public class UCA implements Comparator, UCA_Types {
* Version of the UCA tables to use
*/
//private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7";
public static final String VERSION = "-3.1.1d1"; // ""; // "-2.1.9d7";
public static final String UCA_BASE = "3.1.1"; // ""; // "-2.1.9d7";
public static final String VERSION = "-" + UCA_BASE + "d5"; // ""; // "-2.1.9d7";
public static final String ALLFILES = "allkeys"; // null if not there
/**
@ -240,7 +241,9 @@ final public class UCA implements Comparator, UCA_Types {
// add weights
char w = getPrimary(ce);
if (DEBUG) System.out.println("\tCE: " + Utility.hex(ce));
if (w != 0) primaries.append(w);
if (w != 0) {
primaries.append(w);
}
w = getSecondary(ce);
if (w != 0) {
@ -252,9 +255,13 @@ final public class UCA implements Comparator, UCA_Types {
}
w = getTertiary(ce);
if (w != 0) tertiaries.append(w);
if (w != 0) {
tertiaries.append(w);
}
if (weight4 != 0) quaternaries.append(weight4);
if (weight4 != 0) {
quaternaries.append(weight4);
}
}
// Produce weight strings
@ -263,13 +270,13 @@ final public class UCA implements Comparator, UCA_Types {
StringBuffer result = primaries;
if (strength >= 2) {
result.append('\u0000'); // separator
result.append(LEVEL_SEPARATOR); // separator
result.append(secondaries);
if (strength >= 3) {
result.append('\u0000'); // separator
result.append(LEVEL_SEPARATOR); // separator
result.append(tertiaries);
if (strength >= 4) {
result.append('\u0000'); // separator
result.append(LEVEL_SEPARATOR); // separator
if (alternate == SHIFTED_TRIMMED) {
int q;
for (q = quaternaries.length()-1; q >= 0; --q) {
@ -303,7 +310,7 @@ final public class UCA implements Comparator, UCA_Types {
char c2 = sortKey2.charAt(i);
if (c1 < c2) return -strength;
if (c1 > c2) return strength;
if (c1 == '\u0000') --strength; // Separator!
if (c1 == LEVEL_SEPARATOR) --strength; // Separator!
}
if (len1 < len2) return -strength;
if (len1 > len2) return strength;
@ -399,15 +406,21 @@ final public class UCA implements Comparator, UCA_Types {
* @param source Normal UTF-16 (Java) string
* @return sort key (as string)
* @author Markus Scherer (cast into Java by MD)
* NOTE: changed to be longer, but handle isolated surrogates
*/
public static StringBuffer appendInCodePointOrder(String source, StringBuffer target) {
for (int i = 0; i < source.length(); ++i) {
int ch = source.charAt(i);
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
target.append((char)((cp >> 15) | 0x8000));
target.append((char)(cp | 0x8000));
/*
if (ch <= 1) { // hack to avoid nulls
target.append('\u0001');
target.append((char)(ch+1));
}
target.append((char)(ch + utf16CodePointOrder[ch>>11]));
*/
}
return target;
}
@ -659,9 +672,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
*/
/**
* Returns implicit value as pair, first part in high word; second part in low word
* So to get first part use (x >>> 16) -- remember the >>>!
* and to get the second part use (x & 0xFFFF)
* Returns implicit value
*/
void CodepointToImplicit(int cp, int[] output) {
@ -673,9 +684,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
}
/**
* Takes implicit value as pair, first part in high word; second part in low word
* So to get first part use (x >>> 16) -- remember the >>>!
* and to get the second part use (x & 0xFFFF)
* Takes implicit value
*/
static int ImplicitToCodePoint(int leadImplicit, int trailImplicit) {
@ -997,7 +1006,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// push BBBB
expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
expandingStack.push(makeKey(implicit[1], 0, 0));
// return AAAA
@ -1127,7 +1136,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// normal case
while (current++ < 0x10FFFF) {
if (current == 0x406) {
if (DEBUG && current == 0xdbff) {
System.out.println("DEBUG");
}
//char ch = (char)current;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/06/24 15:25:10 $
* $Revision: 1.23 $
* $Date: 2002/06/28 01:59:53 $
* $Revision: 1.24 $
*
*******************************************************************************
*/
@ -25,6 +25,8 @@ import java.io.*;
import java.text.RuleBasedCollator;
import java.text.CollationElementIterator;
import java.text.Collator;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.UCD.*;
import com.ibm.text.UCD.UCD_Types;
@ -34,7 +36,7 @@ import com.ibm.text.UCD.Normalizer;
public class WriteCollationData implements UCD_Types, UCA_Types {
static final boolean DEBUG = false;
static final boolean DEBUG_SHOW_ITERATION = true;
static final boolean DEBUG_SHOW_ITERATION = false;
@ -299,18 +301,27 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
=> U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON
*/
String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"};
for (int jj = 0; jj < testList.length; ++jj) {
String t = testList[jj];
System.out.println(ucd.getCodeAndName(t));
String test = collator.getSortKey(t, UCA.NON_IGNORABLE);
System.out.println("Decomp: " + collator.toString(test));
test = collator.getSortKey(t, UCA.NON_IGNORABLE, false);
System.out.println("No Dec: " + collator.toString(test));
if (DEBUG) {
String[] testList = {"\u3192", "\u3220", "\u0344", "\u0385", "\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"};
for (int jj = 0; jj < testList.length; ++jj) {
String t = testList[jj];
System.out.println(ucd.getCodeAndName(t));
CEList ces = collator.getCEList(t, true);
System.out.println("CEs: " + ces);
String test = collator.getSortKey(t, option);
System.out.println("Decomp: " + collator.toString(test));
test = collator.getSortKey(t, option, false);
System.out.println("No Dec: " + collator.toString(test));
}
}
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false);
if (!shortPrint) log.write('\uFEFF');
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, true);
//if (!shortPrint) log.write('\uFEFF');
log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
log.println("# Generated: " + getNormalDate());
System.out.println("Sorting");
int counter = 0;
@ -333,7 +344,6 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
}
Utility.dot(counter++);
addStringX(s, option);
// TODO: add other accents with Cyrillic
}
UnicodeSet found = collator.found;
@ -472,7 +482,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
for (int j = 0; j < CONTRACTION_TEST.length; ++j) {
String extra = s.substring(0,i) + CONTRACTION_TEST[j] + s.substring(i);
addStringY(extra + 'a', option);
System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra));
if (DEBUG) System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra));
}
}
}
@ -488,31 +498,51 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
sortedD.put(colDbase, s);
}
static UCD ucd_uca_base = null;
/**
* Check that the primaries are the same as the compatibility decomposition.
*/
static void checkBadDecomps(int strength, boolean decomposition, UnicodeSet alreadySeen) {
if (ucd_uca_base == null) {
ucd_uca_base = UCD.make(UCA.UCA_BASE);
}
int oldStrength = collator.getStrength();
collator.setStrength(strength);
Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
if (strength == 1) {
log.println("<h2>3. Primaries Incompatible with Decompositions</h2><table border='1' cellspacing='0' cellpadding='2'>");
} else {
log.println("<h2>4. Secondaries Incompatible with Decompositions</h2><table border='1' cellspacing='0' cellpadding='2'>");
Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
switch (strength) {
case 1: log.println("<h2>3. Primaries Incompatible with Decompositions</h2>"); break;
case 2: log.println("<h2>4. Secondaries Incompatible with Decompositions</h2>"); break;
case 3: log.println("<h2>5. Tertiaries Incompatible with Decompositions</h2>");
log.println("<p>Note: Tertiary differences are not really errors; these are just warnings</p>");
break;
default: throw new IllegalArgumentException("bad strength: " + strength);
}
log.println("<p>Warning: only checking characters defined in base: " + ucd_uca_base.getVersion() + "</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
// Table header row; every <th> cell must be closed with </th> (the first cell was closed with </td>).
log.println("<tr><th>Code</th><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
int errorCount = 0;
UnicodeSet skipSet = new UnicodeSet();
for (int ch = 0; ch < 0x10FFFF; ++ch) {
if (!ucd.isAllocated(ch)) continue;
if (!ucd_uca_base.isAllocated(ch)) continue;
if (nfkd.isNormalized(ch)) continue;
if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul
if (alreadySeen.contains(ch)) continue;
Utility.dot(ch);
String decomp = nfkd.normalize(ch);
if (ch != ' ' && decomp.charAt(0) == ' ') continue; // skip wierd decomps
if (ch != '\u0640' && decomp.charAt(0) == '\u0640') continue; // skip wierd decomps
if (ch != ' ' && decomp.charAt(0) == ' ') {
skipSet.add(ch);
continue; // skip wierd decomps
}
if (ch != '\u0640' && decomp.charAt(0) == '\u0640') {
skipSet.add(ch);
continue; // skip wierd decomps
}
String sortKey = collator.getSortKey(UTF16.valueOf(ch), UCA.NON_IGNORABLE, decomposition);
@ -521,21 +551,97 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
sortKey = remove(sortKey, '\u0020');
decompSortKey = remove(decompSortKey, '\u0020');
}
if (!sortKey.equals(decompSortKey)) {
log.println("<tr><td>" + Utility.hex(ch)
+ "</td><td>" + UCA.toString(sortKey)
+ "</td><td>" + UCA.toString(decompSortKey)
+ "</td><td>" + ucd.getName(ch)
+ "</td></tr>"
);
alreadySeen.add(ch);
if (sortKey.equals(decompSortKey)) continue; // no problem!
// fix key in the case of strength 3
if (strength == 3) {
String newSortKey = remapSortKey(ch, decomposition);
if (!sortKey.equals(newSortKey)) {
System.out.println("Fixing: " + ucd.getCodeAndName(ch));
System.out.println(" Old:" + collator.toString(decompSortKey));
System.out.println(" New: " + collator.toString(newSortKey));
System.out.println(" Tgt: " + collator.toString(sortKey));
}
decompSortKey = newSortKey;
}
if (sortKey.equals(decompSortKey)) continue; // no problem!
log.println("<tr><td>" + Utility.hex(ch)
+ "</td><td>" + UCA.toString(sortKey)
+ "</td><td>" + UCA.toString(decompSortKey)
+ "</td><td>" + ucd.getName(ch)
+ "</td></tr>"
);
alreadySeen.add(ch);
errorCount++;
}
log.println("</table>");
log.println("<p>Errors: " + errorCount + "</p>");
log.println("<p>Space/Tatweel exceptions: " + skipSet.toPattern(true) + "</p>");
collator.setStrength(oldStrength);
Utility.fixDot();
}
/**
 * Produces the remapped sort key for a code point.
 *
 * If the code point is already normalized according to {@code toD}, it is handed
 * straight to {@link #remapCanSortKey}. Otherwise it is normalized with {@code toD},
 * each code point of the result is remapped on its own, and the per-code-point keys
 * are folded together with {@link #mergeSortKeys}.
 *
 * Note: the decomposed path prints each intermediate step to System.out.
 *
 * @param cp            code point to process
 * @param decomposition passed through to the collator when building sort keys
 * @return the (possibly merged) remapped sort key
 */
static String remapSortKey(int cp, boolean decomposition) {
    if (!toD.isNormalized(cp)) {
        // we know that it is not NFKD.
        final String canonical = toD.normalize(cp);
        String merged = "";
        int index = 0;
        while (index < canonical.length()) {
            final int part = UTF16.charAt(canonical, index);
            System.out.println("* " + Default.ucd.getCodeAndName(part));
            final String partKey = remapCanSortKey(part, decomposition);
            System.out.println("* " + UCA.toString(partKey));
            merged = mergeSortKeys(merged, partKey);
            System.out.println("= " + UCA.toString(merged));
            // advance by one or two code units, depending on the code point just handled
            index += UTF16.getCharCount(part);
        }
        return merged;
    }
    return remapCanSortKey(cp, decomposition);
}
/**
 * Builds a sort key for the NFKD form of {@code ch} and rewrites the weights that
 * follow the second level separator.
 *
 * Everything up to and including the second separator is copied unchanged. Of the
 * weights after it, the first two take the value returned by {@code CEList.remap}
 * for the code point's decomposition type; every later weight is forced to 0x1F.
 *
 * @param ch            code point whose key is being remapped
 * @param decomposition passed through to the collator when building the sort key
 * @return the sort key with its trailing level rewritten
 */
static String remapCanSortKey(int ch, boolean decomposition) {
    final String nfkdForm = Default.nfkd.normalize(ch);
    final String baseKey = collator.getSortKey(nfkdForm, UCA.NON_IGNORABLE, decomposition);
    final byte decompType = ucd.getDecompositionType(ch);

    final int afterFirst = baseKey.indexOf(UCA.LEVEL_SEPARATOR) + 1; // after first separator
    final int afterSecond = baseKey.indexOf(UCA.LEVEL_SEPARATOR, afterFirst) + 1; // after second separator

    final StringBuffer remapped = new StringBuffer(baseKey.length());
    remapped.append(baseKey.substring(0, afterSecond));
    for (int offset = 0; afterSecond + offset < baseKey.length(); ++offset) {
        final int original = baseKey.charAt(afterSecond + offset);
        // remap is always invoked, even when its result is replaced below
        final int candidate = CEList.remap(ch, decompType, original);
        remapped.append((char) (offset > 1 ? 0x1F : candidate));
    }
    return remapped.toString();
}
/**
 * Interleaves two sort keys level by level: for each level, the weights of
 * {@code key1} are followed by the weights of {@code key2}, and levels stay
 * delimited by {@code UCA.LEVEL_SEPARATOR}.
 *
 * Keys must be of the same strength. When one key runs out of separators first,
 * the remaining text of both keys is appended as-is.
 *
 * @param key1 first sort key (may be empty)
 * @param key2 second sort key
 * @return the merged key
 */
static String mergeSortKeys(String key1, String key2) {
    final StringBuffer merged = new StringBuffer();
    int from1 = 0;
    int from2 = 0;
    for (;;) {
        final int sep1 = key1.indexOf(UCA.LEVEL_SEPARATOR, from1);
        final int sep2 = key2.indexOf(UCA.LEVEL_SEPARATOR, from2);
        if (sep1 < 0) {
            // key1 has no further levels: tail of key1, then tail of key2
            merged.append(key1.substring(from1));
            merged.append(key2.substring(from2));
            break;
        }
        merged.append(key1.substring(from1, sep1));
        if (sep2 < 0) {
            // key2 has no further levels: its tail, then the rest of key1 starting at the separator
            merged.append(key2.substring(from2));
            merged.append(key1.substring(sep1));
            break;
        }
        merged.append(key2.substring(from2, sep2));
        merged.append(UCA.LEVEL_SEPARATOR);
        from1 = sep1 + 1;
        from2 = sep2 + 1;
    }
    return merged.toString();
}
static final String remove (String s, char ch) {
StringBuffer buf = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
@ -630,7 +736,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
Iterator it = forLater.keySet().iterator();
byte oldType = (byte)0xFF; // anything unique
int caseCount = 0;
log.println("Generated: " + new Date());
log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
log.println("Generated: " + getNormalDate());
while (it.hasNext()) {
String key = (String) it.next();
byte type = (byte)key.charAt(0);
@ -863,7 +970,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
int[] lenArray = new int[1];
diLog.println("# Contractions");
diLog.println("# Generated " + new Date());
diLog.println("# Generated " + getNormalDate());
diLog.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
while (true) {
String s = cc.next(ces, lenArray);
if (s == null) break;
@ -1409,6 +1517,9 @@ F900..FAFF; CJK Compatibility Ideographs
log = Utility.openPrintWriter(filename, false, false);
String[] commentText = {
"UCA Rules",
"This file contains the UCA tables for the given version, but transformed into rule syntax.",
"Generated: " + getNormalDate(),
"NOTE: Since UCA handles canonical equivalents, no composites are necessary",
"(except in extensions).",
"For syntax description, see: http://oss.software.ibm.com/icu/userguide/Collate_Intro.html"
@ -1833,6 +1944,24 @@ F900..FAFF; CJK Compatibility Ideographs
System.out.println("Test case: " + Utility.hex(s) + ", " + CEList.toString(ces, len));
}
backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s);
/*
// HACK until Ken fixes
for (int i = 0; i < len; ++i) {
int ce = ces[i];
if (collator.isImplicitLeadCE(ce)) {
++i;
ce = ces[i];
if (DEBUG
&& (UCA.getPrimary(ce) == 0 || UCA.getSecondary(ce) != 0 || UCA.getTertiary(ce) != 0)) {
System.out.println("WEIRD 2nd IMPLICIT: "
+ CEList.toString(ces, len)
+ ", " + ucd.getCodeAndName(s));
}
ces[i] = UCA.makeKey(UCA.getPrimary(ce), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
}
}
backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s);
*/
}
static int[] ignorableList = {
@ -1915,6 +2044,12 @@ F900..FAFF; CJK Compatibility Ideographs
// Try stomping the value to different tertiaries
int probe = ces[i];
if (UCA.isImplicitLeadCE(probe)) {
s = UTF16.valueOf(UCA.ImplicitToCodePoint(UCA.getPrimary(probe), UCA.getPrimary(ces[i+1])));
++i; // skip over next item!!
break;
}
char primary = collator.getPrimary(probe);
char secondary = collator.getSecondary(probe);
@ -2115,6 +2250,7 @@ F900..FAFF; CJK Compatibility Ideographs
static int[] primaryDelta;
static void writeFractionalUCA(String filename) throws IOException {
Default.setUCD();
checkImplicit();
checkFixes();
@ -2345,13 +2481,14 @@ F900..FAFF; CJK Compatibility Ideographs
EquivalenceClass secEq = new EquivalenceClass("\r\n#", 2, true);
EquivalenceClass terEq = new EquivalenceClass("\r\n#", 2, true);
String[] sampleEq = new String[500];
int[] sampleLen = new int[500];
Iterator it = ordered.keySet().iterator();
int oldFirstPrimary = UCA.getPrimary(UCA.TERMINATOR);
boolean wasVariable = false;
log.println("# Fractional UCA Table, generated from standard UCA");
log.println("# M. Davis, " + new Date());
log.println("# " + getNormalDate());
log.println("# VERSION: UCA=" + collator.getDataVersion() + ", UCD=" + collator.getUCDVersion());
log.println();
log.println("# Generated processed version, as described in ICU design document.");
@ -2389,6 +2526,8 @@ F900..FAFF; CJK Compatibility Ideographs
FCE firstTrailing = new FCE(false);
FCE lastTrailing = new FCE(true);
Map backMap = new TreeMap();
while (it.hasNext()) {
Object sortKey = it.next();
@ -2399,19 +2538,19 @@ F900..FAFF; CJK Compatibility Ideographs
int firstPrimary = UCA.getPrimary(ces[0]);
if (firstPrimary != oldFirstPrimary) {
log.println();
oldFirstPrimary = firstPrimary;
boolean isVariable = collator.isVariable(ces[0]);
if (isVariable != wasVariable) {
if (isVariable) {
log.println("# START OF VARIABLE SECTION!!!");
summary.println("# START OF VARIABLE SECTION!!!");
} else {
log.println("[variable top = " + Utility.hex(primaryDelta[firstPrimary]) + "] # END OF VARIABLE SECTION!!!");
log.println("[variable top = " + Utility.hex(primaryDelta[oldFirstPrimary]) + "] # END OF VARIABLE SECTION!!!");
doVariable = true;
}
log.println();
}
wasVariable = isVariable;
oldFirstPrimary = firstPrimary;
}
oldStr.setLength(0);
chr.getChars(0, chr.length(), codeUnits, 0);
@ -2473,8 +2612,24 @@ F900..FAFF; CJK Compatibility Ideographs
if (ter != 0x2) {
boolean changed = terEq.add(new Integer(ter), new Integer((pri << 16) | sec));
}
if (sampleEq[sec] == null) sampleEq[sec] = chr;
if (sampleEq[ter] == null) sampleEq[ter] = chr;
if (sampleEq[sec] == null || sampleLen[sec] > len) {
sampleEq[sec] = chr;
sampleLen[sec] = len;
}
// Keep the shortest sample seen for this tertiary weight. The length must be tracked
// under the tertiary's own index: using sampleLen[sec] here made the "shorter sample"
// test dead and overwrote the length recorded for the secondary just above.
if (sampleEq[ter] == null || sampleLen[ter] > len) {
    sampleEq[ter] = chr;
    sampleLen[ter] = len;
}
if ((pri & MARK_CODE_POINT) == 0 && pri == 0) {
Integer key = new Integer(ces[q]);
Pair value = (Pair) backMap.get(key);
if (value == null
|| (len < ((Integer)(value.first)).intValue())) {
backMap.put(key, new Pair(new Integer(len), chr));
}
}
// int oldPrimaryValue = UCA.getPrimary(ces[q]);
int np = fixPrimary(pri);
@ -2508,38 +2663,76 @@ F900..FAFF; CJK Compatibility Ideographs
+ "]");
// RECORD STATS
// but ONLY if we are not part of an implicit
if (np == 0 && ns == 0) {
firstSecondaryIgnorable.setValue(np, ns, nt);
lastSecondaryIgnorable.setValue(np, ns, nt);
} else if (np == 0) {
firstPrimaryIgnorable.setValue(np, ns, nt);
lastPrimaryIgnorable.setValue(np, ns, nt);
} else if (collator.isVariable(ces[q])) {
firstVariable.setValue(np, ns, nt);
lastVariable.setValue(np, ns, nt);
} else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently)
System.out.println("Trailing: " + CEList.toString(ces[q])
+ ", " + Utility.hex(pri) + ", " + Utility.hex(UNSUPPORTED_LIMIT));
firstTrailing.setValue(np, ns, nt);
lastTrailing.setValue(np, ns, nt);
} else if ((pri & MARK_CODE_POINT) == 0) { // skip implicits
firstNonIgnorable.setValue(np, ns, nt);
lastNonIgnorable.setValue(np, ns, nt);
if ((pri & MARK_CODE_POINT) == 0) {
if (np == 0 && ns == 0) {
firstSecondaryIgnorable.setValue(np, ns, nt);
lastSecondaryIgnorable.setValue(np, ns, nt);
} else if (np == 0) {
firstPrimaryIgnorable.setValue(np, ns, nt);
lastPrimaryIgnorable.setValue(np, ns, nt);
} else if (collator.isVariable(ces[q])) {
firstVariable.setValue(np, ns, nt);
lastVariable.setValue(np, ns, nt);
} else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently)
System.out.println("Trailing: "
+ ucd.getCodeAndName(chr) + ", "
+ CEList.toString(ces[q]) + ", "
+ Utility.hex(pri) + ", "
+ Utility.hex(UNSUPPORTED_LIMIT));
firstTrailing.setValue(np, ns, nt);
lastTrailing.setValue(np, ns, nt);
} else {
firstNonIgnorable.setValue(np, ns, nt);
lastNonIgnorable.setValue(np, ns, nt);
}
}
}
if (nonePrinted) {
log.print("[,,]");
oldStr.append(CEList.toString(0));
}
longLog.print(" # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0)));
longLog.print("\t# " + oldStr + "\t* " + ucd.getName(UTF16.charAt(chr, 0)));
log.println();
lastChr = chr;
}
// ADD HOMELESS COLLATION ELEMENTS
log.println();
log.println("# HOMELESS COLLATION ELEMENTS");
char fakeTrail = 'a';
Iterator it3 = backMap.keySet().iterator();
while (it3.hasNext()) {
Integer key = (Integer) it3.next();
Pair pair = (Pair) backMap.get(key);
if (((Integer)pair.first).intValue() < 2) continue;
String sample = (String)pair.second;
int ce = key.intValue();
int np = fixPrimary(UCA.getPrimary(ce));
int ns = fixSecondary(UCA.getSecondary(ce));
int nt = fixTertiary(UCA.getTertiary(ce));
newPrimary.setLength(0);
newSecondary.setLength(0);
newTertiary.setLength(0);
hexBytes(np, newPrimary);
hexBytes(ns, newSecondary);
hexBytes(nt, newTertiary);
log.print(Utility.hex('\uFDD0' + "" + (char)(fakeTrail++)) + "; "
+ "[, " + newSecondary + ", " + newTertiary + "]");
longLog.print("\t# " + collator.getCEList(sample, true) + "\t* " + ucd.getCodeAndName(sample));
log.println();
}
int firstImplicit = getImplicitPrimary(CJK_BASE);
int lastImplicit = getImplicitPrimary(0x10FFFF);
log.println();
log.println("# VALUES BASED ON UCA");
log.println("[first tertiary ignorable " + new FCE(false,0,0, 0).formatFCE() + "]");
@ -2580,16 +2773,17 @@ F900..FAFF; CJK Compatibility Ideographs
log.println("[first trailing " + firstTrailing.formatFCE() + "]");
log.println("[last trailing " + lastTrailing.formatFCE() + "]");
log.println();
log.println("# FIXED VALUES");
log.println("[top " + Utility.hex(0xA0,2) + "]");
log.println("[first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]");
log.println("[last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]");
log.println("[first trail byte" + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]");
log.println("[last implicit byte" + Utility.hex(SPECIAL_BASE-1,2) + "]");
log.println("[first special byte" + Utility.hex(SPECIAL_BASE,2) + "]");
log.println("[last special byte" + Utility.hex(0xFF,2) + "]");
log.println("# superceded! [top " + lastNonIgnorable.formatFCE() + "]");
log.println("[fixed first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]");
log.println("[fixed last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]");
log.println("[fixed first trail byte " + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]");
log.println("[fixed last trail byte " + Utility.hex(SPECIAL_BASE-1,2) + "]");
log.println("[fixed first special byte " + Utility.hex(SPECIAL_BASE,2) + "]");
log.println("[fixed last special byte " + Utility.hex(0xFF,2) + "]");
summary.println("Last: " + Utility.hex(lastNp) + ", " + ucd.getCodeAndName(UTF16.charAt(lastChr, 0)));
@ -2636,6 +2830,7 @@ F900..FAFF; CJK Compatibility Ideographs
summary.println();
summary.println("# UCA : (FRAC) CODE [ UCA CE ] Name");
summary.println();
for (int i = 0; i < sampleEq.length; ++i) {
if (sampleEq[i] == null) continue;
if (i == 0x20) {
@ -2653,6 +2848,7 @@ F900..FAFF; CJK Compatibility Ideographs
summary.print(CEList.toString(ces[q]));
}
summary.println(" " + ucd.getName(sampleEq[i]));
}
log.close();
summary.close();
@ -3379,6 +3575,13 @@ static int swapCJK(int i) {
}
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'");
static String getNormalDate() {
return myDateFormat.format(new Date()) + " [MD]";
}
static void setSingle(char ch, int[] ces) {
collator.getCEs(String.valueOf(ch), true, ces);
singles.set(UCA.getPrimary(ces[0]));
@ -3396,12 +3599,18 @@ static int swapCJK(int i) {
input.close();
}
static UnicodeSet compatibilityExceptions = new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]");
static void writeCollationValidityLog() throws IOException {
Default.setUCD();
//log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html"));
log = Utility.openPrintWriter("CheckCollationValidity.html", false, false);
log.println("<html><body>");
log.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
log.println("<title>UCA Validity Log</title>");
log.println("<style>.bottom { border-bottom-style: solid; border-bottom-color: #0000FF }</style>");
log.println("</head><body bgcolor='#FFFFFF'>");
//collator = new UCA(null);
@ -3412,14 +3621,14 @@ static int swapCJK(int i) {
}
System.out.println("Sorting");
for (int i = 0; i <= 0xFFFF; ++i) {
/*
for (int i = 0; i <= 0x10FFFF; ++i) {
if (EXCLUDE_UNSUPPORTED && !collator.found.contains(i)) continue;
if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
//if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
addString(UTF32.valueOf32(i), option);
}
*/
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
cc.enableSamples();
@ -3469,18 +3678,24 @@ static int swapCJK(int i) {
System.out.println("Writing");
String version = collator.getDataVersion();
log.println("<h1>Collation Validity Checks</h1>");
log.println("<table><tr><td>Generated: </td><td>" + getNormalDate() + "</td></tr>");
log.println("<tr><td>File Version: </td><td>" + collator.getDataVersion() + "/" + collator.getUCDVersion() + "</td></tr></table>");
if (GENERATED_NFC_MISMATCHES) showMismatches();
removeAdjacentDuplicates2();
UnicodeSet exceptions = new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]");
UnicodeSet alreadySeen = new UnicodeSet(exceptions);
UnicodeSet alreadySeen = new UnicodeSet(compatibilityExceptions);
checkBadDecomps(1, false, alreadySeen); // if decomposition is off, all primaries should be identical
checkBadDecomps(2, true, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical
checkBadDecomps(2, false, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical
checkBadDecomps(3, false, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical
//checkBadDecomps(2, true, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical
log.println("<p>Note: characters with decompositions to space + X, and tatweel + X are excluded,"
+ " as are a few special characters: " + exceptions.toPattern(true) + "</p>");
+ " as are a few special characters: " + compatibilityExceptions.toPattern(true) + "</p>");
if (DO_CHARTS) {
System.out.println("Charts");
@ -3564,6 +3779,7 @@ static int swapCJK(int i) {
}
checkWellformedTable();
addClosure();
log.println("</body></html>");
log.close();
@ -3572,10 +3788,83 @@ static int swapCJK(int i) {
}
/**
 * Writes section 7 ("Comparing Other Equivalents") of the validity log.
 *
 * Collects every string returned by the collator's contents iterator, then walks the
 * results the canonical iterator yields for each of them. A result is reported
 * when it is not the source itself, is not already in the collected contents, has not
 * been reported before, passes {@code NFD.isFCD}, and its sort key differs depending on
 * whether decomposition is applied. Each report is a three-row table group: the source,
 * its {@code toD} form, and the result. The number of reports is written after the table.
 */
static void addClosure() {
    int reported = 0;
    System.out.println("Add missing decomposibles");
    log.println("<h2>7. Comparing Other Equivalents</h2>");
    log.println("<p>These are not necessarily errors, but should be examined for <i>possible</i> errors</p>");
    log.println("<p>Each of the three strings is canonically equivalent, but has different sort keys</p>");
    log.println("<table border='1' cellspacing='0' cellpadding='2'>");
    log.println("<tr><th>Count</th><th>Name</th><th>Code</th><th>Sort Keys</th></tr>");

    // Gather all strings the collator knows about.
    final Set ucaStrings = new TreeSet();
    final UCA.UCAContents contents = collator.getContents(UCA.FIXED_CE, null); // NFD
    int progress = 0;
    String entry;
    do {
        Utility.dot(progress++);
        entry = contents.next();
        if (entry != null) {
            ucaStrings.add(entry);
        }
    } while (entry != null);

    final Set alreadyReported = new HashSet();
    System.out.println("Loading canonical iterator");
    if (canIt == null) {
        canIt = new CanonicalIterator(".");
    }
    final Iterator sources = ucaStrings.iterator();
    System.out.println("Adding any FCD equivalents that have different sort keys");
    while (sources.hasNext()) {
        final String key = (String) sources.next();
        if (key == null) {
            System.out.println("Null Key");
            continue;
        }
        canIt.setSource(key);
        final String nfdKey = toD.normalize(key);
        boolean headerPending = true;
        for (String variant = canIt.next(); variant != null; variant = canIt.next()) {
            final boolean known = variant.equals(key)
                    || ucaStrings.contains(variant)
                    || alreadyReported.contains(variant);
            // Skip anything already covered, and anything that is not FCD.
            if (known || !NFD.isFCD(variant)) {
                continue;
            }

            // We ONLY report if the sort key would be different
            // than what we would get if we didn't decompose!!
            final String decomposedKey = collator.getSortKey(variant, UCA.NON_IGNORABLE);
            final String rawKey = collator.getSortKey(variant, UCA.NON_IGNORABLE, false);
            if (decomposedKey.equals(rawKey)) {
                continue;
            }

            if (DEBUG && headerPending) {
                System.out.println(" " + ucd.getCodeAndName(key));
                headerPending = false;
            }
            log.println("<tr><td rowspan='3'>" + (++reported) + "</td><td>" + Utility.replace(ucd.getName(key), ", ", ",<br>") + "</td>");
            log.println("<td>" + Utility.hex(key) + "</td>");
            log.println("<td>" + collator.toString(decomposedKey) + "</td></tr>");
            log.println("<tr><td>" + Utility.replace(ucd.getName(nfdKey), ", ", ",<br>") + "</td>");
            log.println("<td>" + Utility.hex(nfdKey) + "</td>");
            log.println("<td>" + collator.toString(decomposedKey) + "</td></tr>");
            log.println("<tr><td class='bottom'>" + Utility.replace(ucd.getName(variant), ", ", ",<br>") + "</td>");
            log.println("<td class='bottom'>" + Utility.hex(variant) + "</td>");
            log.println("<td class='bottom'>" + collator.toString(rawKey) + "</td></tr>");
            alreadyReported.add(variant);
        }
    }
    log.println("</table>");
    log.println("<p>Items: " + reported + "</p>");
}
static void checkWellformedTable() throws IOException {
System.out.println("Checking for well-formedness");
log.println("<h2>5. Checking for well-formedness</h2>");
log.println("<h2>6. Checking for well-formedness</h2>");
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
@ -3620,6 +3909,7 @@ static int swapCJK(int i) {
cc = collator.getContents(UCA.FIXED_CE, nfd);
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
int lastPrimary = 0;
while (true) {
String str = cc.next(ces, lenArray);
@ -3632,40 +3922,64 @@ static int swapCJK(int i) {
int s = UCA.getSecondary(ce);
int t = UCA.getTertiary(ce);
// IF we are at the start of an implicit, then just check that the implicit is in range
// CHECK implicit
if (collator.isImplicitLeadPrimary(lastPrimary)) {
try {
if (s != 0 || t != 0) throw new Exception("Second implicit must be [X,0,0]");
collator.ImplicitToCodePoint(lastPrimary, p); // throws exception if bad
} catch (Exception e) {
log.println("<tr><td>" + (++errorCount) + ". BAD IMPLICIT: " + e.getMessage()
+ "</td><td>" + CEList.toString(ces, len)
+ "</td><td>" + ucd.getCodeAndName(str) + "</td></tr>");
}
// zap the primary, since we worry about the last REAL primary:
lastPrimary = 0;
continue;
}
// IF we are in the trailing range, something is wrong.
if (p >= UCA_Types.UNSUPPORTED_LIMIT) {
log.println("<tr><td>" + (++errorCount) + ". > " + Utility.hex(UCA_Types.UNSUPPORTED_LIMIT,4)
+ "</td><td>" + CEList.toString(ces, len)
+ "</td><td>" + ucd.getCodeAndName(str) + "</td></tr>");
lastPrimary = p;
continue;
}
// Check WF#1
if (p != 0 && s == 0) {
log.println("<tr><td>WF1.1"
log.println("<tr><td>" + (++errorCount) + ". WF1.1"
+ "</td><td>" + CEList.toString(ces, len)
+ "</td><td>" + ucd.getCodeAndName(str) + "</td></tr>");
errorCount++;
}
if (s != 0 && t == 0) {
log.println("<tr><td>WF1.2"
log.println("<tr><td>" + (++errorCount) + ". WF1.2"
+ "</td><td>" + CEList.toString(ces, len)
+ "</td><td>" + ucd.getCodeAndName(str) + "</td></tr>");
errorCount++;
}
// Check WF#2
if (p != 0) {
if (s > minps) {
log.println("<tr><td>WF2.2"
log.println("<tr><td>" + (++errorCount) + ". WF2.2"
+ "</td><td>" + CEList.toString(ces, len)
+ "</td><td>" + ucd.getCodeAndName(str) + "</td></tr>");
errorCount++;
}
}
if (s != 0) {
if (t > minpst) {
log.println("<tr><td>WF2.3"
log.println("<tr><td>" + (++errorCount) + ". WF2.3"
+ "</td><td>" + CEList.toString(ces, len)
+ "</td><td>" + ucd.getCodeAndName(str) + "</td></tr>");
errorCount++;
}
} else {
}
lastPrimary = p;
}
}
log.println("</table>");
@ -3679,9 +3993,7 @@ static int swapCJK(int i) {
}
if (errorCount > 0) {
log.println("<p>Well-formedness errors: " + errorCount + "</p>");
}
log.println("<p>Errors: " + errorCount + "</p>");
}
@ -3738,7 +4050,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
String colDbase = collator.getSortKey(ch, option, true);
String colNbase = collator.getSortKey(ch, option, false);
String colCbase = collator.getSortKey(toC.normalize(ch), option, false);
if (!colNbase.equals(colCbase)) {
if (!colNbase.equals(colCbase) || !colNbase.equals(colDbase) ) {
/*System.out.println(Utility.hex(ch));
System.out.println(printableKey(colNbase));
System.out.println(printableKey(colNbase));
@ -3770,10 +4082,11 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
String lastChar = "";
int countRem = 0;
int countDups = 0;
int errorCount = 0;
Iterator it1 = sortedD.keySet().iterator();
Iterator it2 = sortedN.keySet().iterator();
Differ differ = new Differ(250,3);
log.println("<h1>2. Differences in Ordering</h1>");
log.println("<h2>2. Differences in Ordering</h2>");
log.println("<p>Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.</p>");
log.println("<p>Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they equal.</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
@ -3818,6 +4131,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
}
log.println("</td></tr>");
}
errorCount++;
}
//differ.flush();
@ -3826,6 +4140,8 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
log.println("</table>");
log.println("<p>Errors: " + errorCount + "</p>");
//log.println("Removed " + countRem + " adjacent duplicates.<br>");
System.out.println("Left " + countDups + " conflicts.<br>");
log.println("Left " + countDups + " conflicts.<br>");
@ -3835,10 +4151,12 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
String lastChar = "";
int countRem = 0;
int countDups = 0;
int errorCount = 0;
Iterator it = sortedD.keySet().iterator();
log.println("<h1>2. Differences in Ordering</h1>");
log.println("<p>Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.</p>");
log.println("<p>Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they equal.</p>");
log.println("<p>Note: so black lines are generally ok.</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
log.println("<tr><th>File Order</th><th>Code and Decomp</th><th>Key and Decomp-Key</th></tr>");
@ -3876,9 +4194,11 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
if (!showedLast) {
log.println("<tr><td colspan='3'></td><tr>");
showLine(count-1, lastCh, lastCol, lastColN);
errorCount++;
}
showedLast = true;
showLine(count,ch, col, colN);
errorCount++;
}
lastCol = col;
lastColN = colN;
@ -3886,6 +4206,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
}
log.println("</table>");
log.println("<p>Errors: " + errorCount + "</p>");
}
static int compareMinusLast(String a, String b) {
@ -3919,39 +4240,36 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
// Display names for the alternate-handling setting; showMismatches() indexes this
// array with `option` when printing the "Alternate Handling = ..." line.
// NOTE(review): the order presumably mirrors the numeric values of the UCA
// alternate-handling constants (e.g. UCA.SHIFTED, UCA.NON_IGNORABLE) — confirm.
static final String[] alternateName = {"SHIFTED", "ZEROED", "NON_IGNORABLE", "SHIFTED_TRIMMED"};
// Writes report section "1. Mismatches when NFD is OFF" to `log`: one table entry
// per key of MismatchedC, showing the sort keys stored for that string in
// MismatchedN, MismatchedC and MismatchedD, followed by a total count.
//
// NOTE(review): this span appears to interleave two versions of the method from a
// diff hunk — an MLStreamWriter-based version (the `out.…` calls) and a
// log.println-based version. Both sets of statements are kept here untouched;
// confirm which set is live before changing any code.
static void showMismatches() {
// --- MLStreamWriter-based header: title, date, file version, option, table header row ---
MLStreamWriter out = new MLStreamWriter(log);
out.el("h1").tx("1. Mismatches when NFD is OFF").cl();
out.el("h2").tx("Date:" + new Date()).cl();
out.el("h2").tx("File Version:" + UCA.VERSION).cl();
out.el("p").tx("Alternate Handling = " + alternateName[option]).cl();
out.el("table").at("border",1);
out.el("caption").tx("Mismatches in UCA-NOD: Plain vs NFC: ").tx(MismatchedC.size()).cl("caption");
out.el("tr");
out.el("th").tx("Code").cl();
out.el("th").tx("Type").cl();
out.el("th").tx("CC?").cl();
out.el("th").tx("Key").cl();
out.cl("tr");
// --- log.println-based header: same section title as <h2>, option, explanatory note, table header row ---
log.println("<h2>1. Mismatches when NFD is OFF</h2>");
log.println("<p>Alternate Handling = " + alternateName[option] + "</p>");
// NOTE(review): the two literals below concatenate to "...used by UCA,so if..." (no space
// after the comma), and "there are ignored" presumably means "they are ignored" — confirm
// and fix in a code change; the emitted text is left as-is here.
log.println("<p>NOTE: NFD form is used by UCA,"
+ "so if other forms are different there are <i>ignored</i>. This may indicate a problem, e.g. missing contraction.</p>");
log.println("<table border='1'>");
log.println("<tr><th>Name</th><th>Type</th><th>Unicode</th><th>Key</th></tr>");
// Walk every string recorded in MismatchedC; errorCount tallies one per string.
Iterator it = MismatchedC.keySet().iterator();
int errorCount = 0;
while (it.hasNext()) {
String ch = (String)it.next();
// Values stored for this string in the three maps; they are passed to printableKey() below.
String MN = (String)MismatchedN.get(ch);
String MC = (String)MismatchedC.get(ch);
String MD = (String)MismatchedD.get(ch);
String chInC = toC.normalize(ch);
// MLStreamWriter-based output: two rows per string ("Plain" with MN, "NFC" with MC).
out.el("tr");
out.el("th").at("rowSpan",2).at("align","right").tx16(ch).tx(' ').tx(ucd.getName(ch));
out.el("br").cl().tx("NFC=").tx16(chInC).cl();
out.el("th").tx("Plain").cl();
out.el("th").tx(containsCombining(ch) ? "y" : "n").cl();
out.el("td").tx(printableKey(MN)).cl();
out.cl("tr");
out.el("tr");
out.el("th").tx("NFC").cl();
// Prints "Y" when containsCombining(chInC) is true, otherwise the literal "ERROR".
out.el("th").tx(containsCombining(chInC) ? "Y" : "ERROR").cl();
out.el("td").tx(printableKey(MC)).cl();
out.cl("tr");
String chInD = toD.normalize(ch);
// log.println-based output: three rows per string (NFD with MD, NFC with MC, Plain with MN);
// the name cell spans all three rows and ", " in the name is replaced by ",<br>".
log.println("<tr><td rowSpan='3' class='bottom'>" + Utility.replace(ucd.getName(ch), ", ", ",<br>")
+ "</td><td>NFD</td><td>" + Utility.hex(chInD)
+ "</td><td>" + printableKey(MD) + "</td></tr>");
log.println("<tr><td>NFC</td><td>" + Utility.hex(chInC)
+ "</td><td>" + printableKey(MC) + "</td></tr>");
log.println("<tr><td class='bottom'>Plain</td><td class='bottom'>" + Utility.hex(ch)
+ "</td><td class='bottom'>" + printableKey(MN) + "</td></tr>");
errorCount++;
}
// Finish the output: close any elements still open on `out`, then close the
// println-based table and report the number of strings listed.
out.closeAllElements();
log.println("</table>");
log.println("<p>Errors: " + errorCount + "</p>");
log.println("<br>");
}