Completed most of the upgrade to UCA 3.1.1;

This involved double CEs for Han, fixing back-mappings, etc.
Also did a bit of code cleanup.
Remaining to do: backmap from UCA double CEs to original character codes,
 for constructing Fractional UCA.

X-SVN-Rev: 8754
This commit is contained in:
Mark Davis 2002-05-31 01:41:04 +00:00
parent ce883f6d81
commit 693b0c9b91
15 changed files with 492 additions and 166 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.3 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -156,6 +156,15 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
return result.toString();
}
/**
 * Formats the first {@code len} collation elements of {@code ces} as one
 * string. Each element is rendered with {@link #toString(int)} and
 * consecutive elements are separated by a single space.
 *
 * @param ces array holding the collation elements
 * @param len number of leading elements of {@code ces} to format
 * @return the space-separated rendering; empty when {@code len} is not positive
 */
public static String toString(int[] ces, int len) {
    StringBuffer buffer = new StringBuffer();
    int index = 0;
    while (index < len) {
        // separator goes before every element except the first
        if (index > 0) {
            buffer.append(' ');
        }
        buffer.append(toString(ces[index]));
        ++index;
    }
    return buffer.toString();
}
public static String toString(int ce) {
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
+ Utility.hex(UCA.getSecondary(ce)) + "."

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
* $Date: 2002/04/23 01:59:14 $
* $Revision: 1.8 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -375,7 +375,7 @@ public class GenOverlap implements UCD_Types {
System.out.println("debug");
}
boolean mashLast = false;
if (nfkd.normalizationDiffers(cp)) {
if (!nfkd.isNormalized(cp)) {
String decomp = nfkd.normalize(cp);
String canon = nfd.normalize(cp);
len = collator.getCEs(decomp, true, ces);
@ -578,7 +578,7 @@ public class GenOverlap implements UCD_Types {
if (UTF16.countCodePoint(s) != 1) continue; // skip ligatures
int cp = UTF16.charAt(s, 0);
if (nfkd.normalizationDiffers(cp)) continue;
if (!nfkd.isNormalized(cp)) continue;
int script = ucd.getScript(cp);
int len = lenArray[0];
@ -607,7 +607,7 @@ public class GenOverlap implements UCD_Types {
Utility.dot(counter++);
if (!ucd.isAllocated(cp)) continue;
if (nfkd.normalizationDiffers(cp)) continue;
if (!nfkd.isNormalized(cp)) continue;
if (ucd.getCategory(cp) == Lu) continue; // don't count case
String scp = UTF16.valueOf(cp);

View File

@ -5,18 +5,20 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2002/05/29 23:18:15 $
* $Revision: 1.2 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
public class Main {
static final String UCDVersion = "";
static final String[] ICU_FILES = {"FractionalUCA", "writeconformance", "writeconformanceshifted", "WriteRules"};
public static void main(String args[]) throws Exception {
@ -36,7 +38,10 @@ public class Main {
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.equalsIgnoreCase("WriteRulesWithNames")) WriteCollationData.writeRules(WriteCollationData.WITH_NAMES);
System.out.println("OPTION: " + arg);
if (arg.equalsIgnoreCase("ICU")) args = Utility.append(args, ICU_FILES);
else if (arg.equalsIgnoreCase("WriteRulesWithNames")) WriteCollationData.writeRules(WriteCollationData.WITH_NAMES);
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator);
else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator);
else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference();

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2002/04/23 01:59:14 $
* $Revision: 1.10 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
@ -108,6 +108,8 @@ final public class UCA implements Comparator {
static final boolean RECORDING_DATA = false;
static final boolean RECORDING_CHARS = true;
private UCD ucd;
// =============================================================
// Main Methods
// =============================================================
@ -129,7 +131,8 @@ final public class UCA implements Comparator {
toD = new Normalizer(Normalizer.NFD, unicodeVersion);
}
ucdVersion = UCD.make(unicodeVersion).getVersion();
ucd = UCD.make(unicodeVersion);
ucdVersion = ucd.getVersion();
// either get the full sources, or just a demo set
if (fullData) {
@ -478,7 +481,9 @@ final public class UCA implements Comparator {
* CE Type
*/
static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2,
FIXED_CE = 3, HANGUL_CE = 5, SURROGATE_CE = 6, UNSUPPORTED_CE = 7;
CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
FIXED_CE = 3;
// SURROGATE_CE = 6,
/**
* Returns the char associated with a FIXED value
@ -502,12 +507,13 @@ final public class UCA implements Comparator {
// Special check for Han, Hangul
if (isHangul(ch)) return HANGUL_CE;
if (isFixed(ch)) return FIXED_CE;
if (isCJK(ch)) return CJK_CE;
if (isCJK_AB(ch)) return CJK_AB_CE;
// special check for unsupported surrogate pair, 20 1/8 bits
if (0xD800 <= ch && ch <= 0xDFFF) {
return SURROGATE_CE;
}
//if (0xD800 <= ch && ch <= 0xDFFF) {
// return SURROGATE_CE;
//}
return UNSUPPORTED_CE;
}
@ -632,6 +638,12 @@ final public class UCA implements Comparator {
return result.toString();
}
/**
 * Tests whether the primary weight of a collation element lies in the
 * inclusive range {@code UNSUPPORTED_BASE .. UNSUPPORTED_TOP}.
 *
 * @param ce the collation element to examine
 * @return true if the primary weight of {@code ce} is inside that range
 */
static boolean isImplicitCE(int ce) {
    final int weight = getPrimary(ce);
    if (weight < UNSUPPORTED_BASE) {
        return false;
    }
    return weight <= UNSUPPORTED_TOP;
}
/**
* Supplies a zero-padded hex representation of an integer (without 0x)
*/
@ -790,9 +802,9 @@ final public class UCA implements Comparator {
/**
* A special bit combination in a CE is used to reserve exception cases. This has the effect
* of removing 32 primary key values out of the 65536 possible.
* of removing a small number of the primary key values out of the 65536 possible.
*/
static final int EXCEPTION_CE_MASK = 0xFF000000;
static final int EXCEPTION_CE_MASK = 0xF8000000;
/**
* Used to composed Hangul and Han characters
@ -808,8 +820,13 @@ final public class UCA implements Comparator {
* There are at least 34 values, so that we can use a range for surrogates
* However, we do add to the first weight if we have surrogate pairs!
*/
public static final int UNSUPPORTED_BASE = 0xFF40;
public static final int UNSUPPORTED_TOP = 0xFFFF;
public static final int UNSUPPORTED_CJK_BASE = 0xFF40;
public static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
public static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
public static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
public static final int UNSUPPORTED_TOP = UNSUPPORTED_OTHER_BASE + 0x40;
static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
// was 0xFFC20101;
@ -821,7 +838,7 @@ final public class UCA implements Comparator {
* to be looked up (with following characters) in the contractingTable.<br>
* This isn't a MASK since there is exactly one value.
*/
static final int CONTRACTING = 0xFF310000;
static final int CONTRACTING = 0xFA310000;
/**
* Expanding characters are marked with a exception bit combination
@ -829,7 +846,7 @@ final public class UCA implements Comparator {
* This means that they map to more than one CE, which is looked up in
* the expansionTable by index. See EXCEPTION_INDEX_MASK
*/
static final int EXPANDING_MASK = 0xFF300000; // marks expanding range start
static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
/**
* This mask is used to get the index from an EXPANDING exception.
@ -976,12 +993,11 @@ final public class UCA implements Comparator {
// RECURSIVE!!!
}
// Special check for Han, YI
if (isFixed(bigChar)) {
return makeKey(bigChar, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
return 0;
}
// special check for unsupported surrogate pair, 20 1/8 bits
// special check and fix for unsupported surrogate pair, 20 1/8 bits
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
// ignore unmatched surrogates (e.g. return zero)
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
@ -990,25 +1006,38 @@ final public class UCA implements Comparator {
index++; // skip next char
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
}
if ((bigChar & 0xFFFE) == 0xFFFE) { // illegal code value, ignore!!
return 0;
}
// The result is 2 CEs. One is UNSUPPORTED + top bits, and the other
// is a primary that is the next fifteen bits
// This has the effect of putting all unsupported characters at the end,
// in code order.
// add bottom 5 bits to UNSUPPORTED, and push rest
//return UNSUPPORTED + (bigChar & 0xFFFF0000); // top bits added
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0)); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
return makeKey(UNSUPPORTED_BASE + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED
/*
expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
return UNSUPPORTED + ((bigChar << 1) & 0xFFFF0000); // top 34 values plus UNSUPPORTED
*/
/*
The formula from the UCA:
BASE:
FB40 CJK Ideograph
FB80 CJK Ideograph Extension A/B
FBC0 Any other code point
AAAA = BASE + (CP >> 15);
BBBB = (CP & 0x7FFF) | 0x8000;
The mapping given to CP is then given by:
CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
*/
// divide the three cases
int base = UNSUPPORTED_OTHER_BASE;
if (isCJK(bigChar)) base = UNSUPPORTED_CJK_BASE;
else if (isCJK_AB(bigChar)) base = UNSUPPORTED_CJK_AB_BASE;
// Now compose the two keys
// first push BBBB
// HACK: expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0));
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
// now return AAAA
return makeKey(base + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
}
if (ce == CONTRACTING) {
// Contracting is probably the most interesting (read "tricky") part
@ -1084,12 +1113,18 @@ final public class UCA implements Comparator {
return expandingStack.pop(); // pop last (guaranteed to exist!)
}
public final boolean isFixed(int bigChar) {
return (0x3400 <= bigChar && bigChar <= 0x4DB5
|| 0x4E00 <= bigChar && bigChar <= 0x9FA5
// || 0xA000 <= bigChar && bigChar <= 0xA48F
);
/**
 * Tests whether a code point lies in the range U+4E00..U+9FFF (inclusive).
 *
 * @param bigChar the code point to test
 * @return true if {@code bigChar} is within 0x4E00..0x9FFF
 */
public final boolean isCJK(int bigChar) {
    if (bigChar < 0x4E00) {
        return false;
    }
    return bigChar <= 0x9FFF;
}
/**
 * Tests whether a code point lies in either of the ranges
 * U+3400..U+4DBF or U+20000..U+2A6DF (both inclusive).
 *
 * @param bigChar the code point to test
 * @return true if {@code bigChar} is inside one of the two ranges
 */
public final boolean isCJK_AB(int bigChar) {
    boolean inFirstRange = bigChar >= 0x3400 && bigChar <= 0x4DBF;
    boolean inSecondRange = bigChar >= 0x20000 && bigChar <= 0x2A6DF;
    return inFirstRange || inSecondRange;
}
/*
3400..4DBF; CJK Unified Ideographs Extension A
4E00..9FFF; CJK Unified Ideographs
20000..2A6DF; CJK Unified Ideographs Extension B
*/
private final boolean isHangul(int bigChar) {
return (0xAC00 <= bigChar && bigChar <= 0xD7A3);
@ -1176,7 +1211,7 @@ final public class UCA implements Comparator {
Normalizer nfd = skipDecomps;
Iterator enum = null;
byte ceLimit;
int currentRange = Integer.MAX_VALUE; // set to ZERO to enable
int currentRange = SAMPLE_RANGES.length; // set to ZERO to enable
int startOfRange = SAMPLE_RANGES[0][0];
int endOfRange = startOfRange;
int itemInRange = startOfRange;
@ -1206,13 +1241,16 @@ final public class UCA implements Comparator {
// normal case
while (current++ < 0x10FFFF) {
//char ch = (char)current;
byte type = getCEType(current);
if (type >= ceLimit || type == CONTRACTING_CE) continue;
//if (nfd.isNormalized(current) || type == HANGUL_CE) {
//}
if (skipDecomps != null && !skipDecomps.isNormalized(current)) continue; // CHECK THIS
if (!nfd.normalizationDiffers(current) || type == HANGUL_CE) {
if (type >= ceLimit) continue;
if (skipDecomps != null && skipDecomps.normalizationDiffers(current)) continue;
}
result = UTF16.valueOf(current);
return result;
}
@ -1226,6 +1264,7 @@ final public class UCA implements Comparator {
// extra samples
if (currentRange < SAMPLE_RANGES.length) {
System.out.println("*");
try {
result = UTF16.valueOf(itemInRange);
} catch (RuntimeException e) {
@ -1274,6 +1313,7 @@ final public class UCA implements Comparator {
result.second = s;
return true;
}
}
static final int[][] SAMPLE_RANGES = {
@ -1299,7 +1339,7 @@ final public class UCA implements Comparator {
{0x100000, 0x1000FD},
{0x10FF00, 0x10FFFD},
};
/**
* Adds the collation elements from a file (or other stream) in the UCA format.
* Values will override any previous mappings.
@ -1366,7 +1406,7 @@ final public class UCA implements Comparator {
boolean record = true;
/* if (multiChars.length() > 0) record = false;
else */
if (toD.normalizationDiffers(value)) record = false;
if (!toD.isNormalized(value)) record = false;
// collect CEs
if (value == 0x2F00) {
@ -1402,6 +1442,8 @@ final public class UCA implements Comparator {
expandingTable.push(TERMINATOR);
}
//if (value == 0xd801) System.out.print("DEBUG: " + line);
// assign CE(s) to char(s)
if (multiChars.length() > 0) {
contractingTable.put(multiChars.toString(), new Integer(ce));
@ -1455,8 +1497,9 @@ final public class UCA implements Comparator {
}
// assign CE(s) to char(s)
int value = source.charAt(0);
//if (value == 0x10000) System.out.print("DEBUG2: " + source);
if (source.length() > 0) {
contractingTable.put(source.toString(), new Integer(ce));
if (collationElements[value] == UNSUPPORTED) {
@ -1772,7 +1815,7 @@ final public class UCA implements Comparator {
* Used for checking data file integrity
*/
private void checkUnique(char value, int result, int fourth, String line) {
if (toD.normalizationDiffers(value)) return; // don't check decomposables.
if (!toD.isNormalized(value)) return; // don't check decomposables.
Object ceObj = new Long(((long)result << 16) | fourth);
Object probe = uniqueTable.get(ceObj);
if (probe != null) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.8 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -29,7 +29,7 @@ public class WriteCharts implements UCD_Types {
Default.setUCD();
for (int i = 0xE000; i < 0x10000; ++i) {
if (!Default.ucd.isRepresented(i)) continue;
if (Default.nfkc.normalizationDiffers(i)) continue;
if (!Default.nfkc.isNormalized(i)) continue;
System.out.println(Default.ucd.getCodeAndName(i));
}
}
@ -205,7 +205,7 @@ public class WriteCharts implements UCD_Types {
byte cat = Default.ucd.getCategory(i);
if (cat == Cs || cat == Co) continue;
if (!Default.nfkd.normalizationDiffers(i)) continue;
if (Default.nfkd.isNormalized(i)) continue;
String decomp = Default.nfkd.normalize(i);
byte script = getBestScript(decomp);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/05/29 23:18:15 $
* $Revision: 1.12 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -190,7 +190,7 @@ public class WriteCollationData implements UCD_Types {
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
if (normKD.normalizationDiffers(c)) {
if (!normKD.isNormalized(c)) {
++count;
String decomp = normKD.normalize(c);
datasize += decomp.length();
@ -218,7 +218,7 @@ public class WriteCollationData implements UCD_Types {
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
if (normD.normalizationDiffers(c)) {
if (!normD.isNormalized(c)) {
++count;
String decomp = normD.normalize(c);
datasize += decomp.length();
@ -408,7 +408,7 @@ public class WriteCollationData implements UCD_Types {
}
log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
for (char ch = 0; ch < 0xFFFF; ++ch) {
if (!nfkd.normalizationDiffers(ch)) continue;
if (nfkd.isNormalized(ch)) continue;
if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul
String sortKey = collator.getSortKey(String.valueOf(ch), UCA.NON_IGNORABLE, decomposition);
String decompSortKey = collator.getSortKey(nfkd.normalize(ch), UCA.NON_IGNORABLE, decomposition);
@ -1148,6 +1148,9 @@ public class WriteCollationData implements UCD_Types {
}
}
static Normalizer nfdNew = new Normalizer(Normalizer.NFD, "");
static Normalizer nfkdNew = new Normalizer(Normalizer.NFKD, "");
static void writeRules (byte option) throws IOException {
//testTransitivity();
@ -1155,6 +1158,7 @@ public class WriteCollationData implements UCD_Types {
int[] ces = new int[50];
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
if (false) {
int len2 = collator.getCEs("\u2474", true, ces);
@ -1173,29 +1177,64 @@ public class WriteCollationData implements UCD_Types {
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE,
SKIP_CANONICAL_DECOMPOSIBLES ? nfd : null);
int[] lenArray = new int[1];
Set alreadyDone = new HashSet();
PrintWriter log2 = Utility.openPrintWriter("UCARules-log.txt", false, false);
while (true) {
String s = cc.next(ces, lenArray);
if (s == null) break;
int len = lenArray[0];
if (s.equals("\uD800")) {
System.out.println("Check: " + CEList.toString(ces, len));
}
log2.println(s + "\t" + CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
addToBackMap(backMap, ces, len, s, false);
if (len == 1) backMap.put(new Integer(ces[0]), s);
String key = String.valueOf((char)(ces[0]>>>16))
+ String.valueOf((char)(ces[0] & 0xFFFF))
+ collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + UCA.codePointOrder(s);
ordered.put(key, s);
alreadyDone.add(s);
Object result = ordered.get(key);
if (result == null) {
System.out.println("BAD SORT: " + Utility.hex(key) + ", " + Utility.hex(s));
}
}
System.out.println("Adding Kanji");
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isAllocated(i)) continue;
if (nfkd.isNormalized(i)) continue;
Utility.dot(i);
String decomp = nfkd.normalize(i);
int cp;
for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(decomp, j);
String s = UTF16.valueOf(cp);
if (alreadyDone.contains(s)) continue;
alreadyDone.add(s);
int len = collator.getCEs(s, true, ces);
log2.println(s+ "\t" + CEList.toString(ces, len)
+ "\t" + ucd.getCodeAndName(s) + " from " + ucd.getCodeAndName(i));
addToBackMap(backMap, ces, len, s, false);
}
}
System.out.println("Writing");
String filename = "UCA_Rules.txt";
if (option == WITH_NAMES) filename = "UCA_Rules_With_Names.txt";
else if (option == IN_XML) filename = "UCA_Rules.xml";
log = Utility.openPrintWriter(filename);
log = Utility.openPrintWriter(filename, false, false);
if (option == IN_XML) log.println("<uca>");
else log.write('\uFEFF'); // BOM
@ -1351,60 +1390,35 @@ public class WriteCollationData implements UCD_Types {
// get relation
int relation = 3;
/*if (chr.charAt(0) == 0xFFFB) {
System.out.println("DEBUG");
}*/
if (collator.getPrimary(ce) != collator.getPrimary(lastCE)) {
relation = 0;
} else if (collator.getSecondary(ce) != collator.getSecondary(lastCE)) {
relation = 1;
} else if (collator.getTertiary(ce) != collator.getTertiary(lastCE)) {
relation = 2;
} else if (len > lastLen) {
relation = 2; // HACK
} else {
int minLen = len < lastLen ? len : lastLen;
for (int kk = 1; kk < minLen; ++kk) {
int lc = lastCes[kk];
int c = ces[kk];
if (collator.getPrimary(c) != collator.getPrimary(lc)
|| collator.getSecondary(c) != collator.getSecondary(lc)) {
relation = 3; // reset relation on FIRST char, since differ anyway
break;
} else if (collator.getTertiary(c) > collator.getTertiary(lc)) {
relation = 2; // reset to tertiary (but later ce's might override!)
}
}
}
int relation = getStrengthDifference(ces, len, lastCes, lastLen);
/*if (chr.equals("\u2474")) {
if (chr.equals("\u2F00")) {
System.out.println(UCA.ceToString(ces, len));
}*/
}
// There are double-CEs, so we have to know what the length of the first bit is.
int expansionStart = 1;
if (UCA.isImplicitCE(ces[0])) {
expansionStart = 2; // move up if first is double-ce
}
// check expansions
String expansion = "";
if (len > 1) {
int tert0 = ces[0] & 0xFF;
boolean isCompat = tert0 != 2 && tert0 != 8;
for (int i = 1; i < len; ++i) {
int probe = ces[i];
String s = getFromBackMap(backMap, probe);
if (s == null) {
int meHack = UCA.makeKey(0x1795,0x0020,0x0004);
if (probe == meHack) {
s = "\u3081";
} else {
System.out.println("No back map for " + collator.ceToString(ces[i])
+ ": " + ucd.getCodeAndName(chr));
s = "[" + Utility.hex(ces[i]) + "]";
}
}
expansion += s;
}
if (len > expansionStart) {
//int tert0 = ces[0] & 0xFF;
//boolean isCompat = tert0 != 2 && tert0 != 8;
log2.println("Exp: " + ucd.getCodeAndName(chr) + ", " + CEList.toString(ces, len) + ", start: " + expansionStart);
int[] rel = {relation};
expansion = getFromBackMap(backMap, ces, expansionStart, len, chr, rel);
relation = rel[0];
}
// print results
@ -1429,28 +1443,268 @@ public class WriteCollationData implements UCD_Types {
} else {
if (reset.length() != 0) log.println(reset);
log.print(RELATION_NAMES[relation] + " " + quoteOperand(chr));
if (len > 1) log.print(" / " + quoteOperand(expansion));
if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
if (option == WITH_NAMES) {
log.print("\t# "
+ collator.ceToString(ces, len) + " "
+ ucd.getCodeAndName(chr));
if (len > 1) log.print(" / " + Utility.hex(expansion));
if (expansion.length() > 0) log.print(" / " + Utility.hex(expansion));
}
log.println();
}
}
// log.println("& [top]"); // RESET
if (option == IN_XML) log.println("</uca>");
log2.close();
log.close();
Utility.fixDot();
}
/**
 * Returns one comparable primary weight for the collation elements that
 * start at {@code ces[0]}.
 * <p>
 * When {@code ces[0]} is an implicit CE (per {@code UCA.isImplicitCE}), the
 * primaries of the first two CEs are combined: the first primary occupies the
 * bits above bit 15 and the second primary is added below it. Otherwise the
 * primary of {@code ces[0]} alone is returned.
 *
 * @param ces collation elements; must hold at least two entries when the
 *            first one is an implicit CE
 * @return the (possibly combined) primary weight, always non-negative
 */
static long getPrimary(int[] ces) {
    if (UCA.isImplicitCE(ces[0])) {
        // Widen BEFORE shifting: an int shift of a weight >= 0x8000 would
        // overflow into the sign bit and yield a negative long.
        return ((long) UCA.getPrimary(ces[0]) << 16) + UCA.getPrimary(ces[1]);
    }
    return UCA.getPrimary(ces[0]);
}
/**
 * Returns one comparable secondary weight for the collation elements that
 * start at {@code ces[0]}.
 * <p>
 * When {@code ces[0]} is an implicit CE (per {@code UCA.isImplicitCE}), the
 * secondaries of the first two CEs are combined: the first secondary occupies
 * the bits above bit 15 and the second secondary is added below it. Otherwise
 * the secondary of {@code ces[0]} alone is returned.
 *
 * @param ces collation elements; must hold at least two entries when the
 *            first one is an implicit CE
 * @return the (possibly combined) secondary weight, always non-negative
 */
static long getSecondary(int[] ces) {
    if (UCA.isImplicitCE(ces[0])) {
        // Widen BEFORE shifting so the combination is done in 64 bits and
        // cannot overflow into the sign bit.
        return ((long) UCA.getSecondary(ces[0]) << 16) + UCA.getSecondary(ces[1]);
    }
    return UCA.getSecondary(ces[0]);
}
/**
 * Returns one comparable tertiary weight for the collation elements that
 * start at {@code ces[0]}.
 * <p>
 * When {@code ces[0]} is an implicit CE (per {@code UCA.isImplicitCE}), the
 * tertiaries of the first two CEs are combined: the first tertiary occupies
 * the bits above bit 15 and the second tertiary is added below it. Otherwise
 * the tertiary of {@code ces[0]} alone is returned.
 *
 * @param ces collation elements; must hold at least two entries when the
 *            first one is an implicit CE
 * @return the (possibly combined) tertiary weight, always non-negative
 */
static long getTertiary(int[] ces) {
    if (UCA.isImplicitCE(ces[0])) {
        // Widen BEFORE shifting so the combination is done in 64 bits and
        // cannot overflow into the sign bit.
        return ((long) UCA.getTertiary(ces[0]) << 16) + UCA.getTertiary(ces[1]);
    }
    return UCA.getTertiary(ces[0]);
}
/**
 * Determines the strength level at which the current CE sequence differs
 * from the previous one.
 * <p>
 * The leading weights (see {@code getPrimary(int[])} and friends, which fold
 * an implicit double-CE into one value) are compared first. If they are all
 * equal, a longer current sequence is reported as a tertiary difference
 * (marked HACK in the original code); otherwise the trailing CEs shared by
 * both sequences are scanned.
 *
 * @param ces     current collation elements
 * @param len     number of valid entries in {@code ces}
 * @param lastCes previous collation elements
 * @param lastLen number of valid entries in {@code lastCes}
 * @return 0 for a primary, 1 for a secondary, 2 for a tertiary difference,
 *         3 when none of those was detected
 */
static int getStrengthDifference(int[] ces, int len, int[] lastCes, int lastLen) {
    if (getPrimary(ces) != getPrimary(lastCes)) {
        return 0;
    }
    if (getSecondary(ces) != getSecondary(lastCes)) {
        return 1;
    }
    if (getTertiary(ces) != getTertiary(lastCes)) {
        return 2;
    }
    if (len > lastLen) {
        return 2; // HACK
    }

    int relation = 3;
    final int shared = Math.min(len, lastLen);
    // skip the leading CE(s) already compared above: two for an implicit double-CE
    int pos = UCA.isImplicitCE(ces[0]) ? 2 : 1;
    while (pos < shared) {
        final int previous = lastCes[pos];
        final int current = ces[pos];
        if (collator.getPrimary(current) != collator.getPrimary(previous)
                || collator.getSecondary(current) != collator.getSecondary(previous)) {
            // reset relation on FIRST char, since differ anyway
            relation = 3;
            break;
        }
        if (collator.getTertiary(current) > collator.getTertiary(previous)) {
            // reset to tertiary (but later ce's might override!)
            relation = 2;
        }
        ++pos;
    }
    return relation;
}
// static final String[] RELATION_NAMES = {" <", " <<", " <<<", " ="};
static final String[] RELATION_NAMES = {" <\t", " <<\t", " <<<\t", " =\t"};
static final String[] XML_RELATION_NAMES = {"o1", "o2", "o3", "o4"};
static final String getFromBackMap(Map backMap, int probe) {
String s = (String)backMap.get(new Integer(probe));
/**
 * Wraps the slice {@code [start, limit)} of an int array so that it can be
 * used as a hash key: two wrappers are equal when their slices have the same
 * length and the same contents, regardless of the backing array or offset.
 * <p>
 * The backing array is stored by reference, not copied; callers that keep the
 * wrapper as a map key must not modify the array afterwards.
 */
static class ArrayWrapper {
    int[] array;
    int start;
    int limit;

    /**
     * @param contents backing array (kept by reference)
     * @param start    index of the first element of the slice
     * @param limit    index one past the last element of the slice
     */
    public ArrayWrapper(int[] contents, int start, int limit) {
        set(contents, start, limit);
    }

    private void set(int[] contents, int start, int limit) {
        array = contents;
        this.start = start;
        this.limit = limit;
    }

    /**
     * Compares slice contents. Returns false (rather than throwing) for
     * {@code null} or for an object that is not an ArrayWrapper, as the
     * {@code Object.equals} contract requires.
     */
    public boolean equals(Object other) {
        if (this == other) return true;
        if (!(other instanceof ArrayWrapper)) return false;
        ArrayWrapper that = (ArrayWrapper) other;
        if (that.limit - that.start != limit - start) return false;
        for (int i = start; i < limit; ++i) {
            // translate this slice's index into the other slice's coordinates
            if (array[i] != that.array[i - start + that.start]) return false;
        }
        return true;
    }

    /**
     * Hash over the slice length and contents only, so equal slices hash
     * alike whatever their offset in the backing array.
     */
    public int hashCode() {
        int result = limit - start;
        for (int i = start; i < limit; ++i) {
            result = result * 37 + array[i];
        }
        return result;
    }
}
// Debugging aid: collation elements that addToBackMap watches for. An entry
// whose first CE appears in this list is echoed to System.out as a "Test case".
static int testCase[] = {
//collator.makeKey(0xFF40, 0x0020, 0x0002),
collator.makeKey(0x0255, 0x0020, 0x000E),
};
// Debugging aid: string consulted by addToBackMap's "Test case" trace.
static String testString = "\u33C2\u002E";
/**
 * Linear search for {@code key} in the slice {@code [start, limit)} of
 * {@code array}.
 *
 * @param array the array to search
 * @param start index of the first element examined
 * @param limit index one past the last element examined
 * @param key   the value looked for
 * @return true if some element of the slice equals {@code key};
 *         false otherwise, including when the slice is empty
 */
static boolean contains(int[] array, int start, int limit, int key) {
    int index = start;
    while (index < limit) {
        if (array[index++] == key) {
            return true;
        }
    }
    return false;
}
/**
 * Records that the CE sequence {@code ces[0..len)} maps back to the string
 * {@code s}. The key is an {@code ArrayWrapper} over a private copy of
 * {@code ces}, so the caller may keep reusing its array.
 * <p>
 * For debugging, the entry is also echoed to System.out when {@code show} is
 * set, when the first CE is listed in {@code testCase}, or when {@code s}
 * occurs anywhere in {@code testString}.
 *
 * @param backMap map from CE sequence to source string; updated in place
 * @param ces     collation elements of {@code s}
 * @param len     number of valid entries in {@code ces}
 * @param s       the string that produced these collation elements
 * @param show    forces the debug trace for this entry
 */
static final void addToBackMap(Map backMap, int[] ces, int len, String s, boolean show) {
    // indexOf returns 0 for a match at the very start, so the test must be
    // ">= 0"; the former "> 0" silently skipped the first character of
    // testString. Empty strings are excluded so they still never trace.
    boolean inTestString = s.length() > 0 && testString.indexOf(s) >= 0;
    if (show || contains(testCase, 0, testCase.length, ces[0]) || inTestString) {
        System.out.println("Test case: " + Utility.hex(s) + ", " + CEList.toString(ces, len));
    }
    backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s);
}
// Collation elements with a zero primary, tertiary 0x0002, and consecutive
// secondaries 0x0153 through 0x0170. getFromBackMap consults this list when
// a CE has no back mapping: a CE found here contributes an empty string to
// the expansion instead of being reported as an error.
static int[] ignorableList = {
UCA.makeKey(0x0000, 0x0153, 0x0002),
UCA.makeKey(0x0000, 0x0154, 0x0002),
UCA.makeKey(0x0000, 0x0155, 0x0002),
UCA.makeKey(0x0000, 0x0156, 0x0002),
UCA.makeKey(0x0000, 0x0157, 0x0002),
UCA.makeKey(0x0000, 0x0158, 0x0002),
UCA.makeKey(0x0000, 0x0159, 0x0002),
UCA.makeKey(0x0000, 0x015A, 0x0002),
UCA.makeKey(0x0000, 0x015B, 0x0002),
UCA.makeKey(0x0000, 0x015C, 0x0002),
UCA.makeKey(0x0000, 0x015D, 0x0002),
UCA.makeKey(0x0000, 0x015E, 0x0002),
UCA.makeKey(0x0000, 0x015F, 0x0002),
UCA.makeKey(0x0000, 0x0160, 0x0002),
UCA.makeKey(0x0000, 0x0161, 0x0002),
UCA.makeKey(0x0000, 0x0162, 0x0002),
UCA.makeKey(0x0000, 0x0163, 0x0002),
UCA.makeKey(0x0000, 0x0164, 0x0002),
UCA.makeKey(0x0000, 0x0165, 0x0002),
UCA.makeKey(0x0000, 0x0166, 0x0002),
UCA.makeKey(0x0000, 0x0167, 0x0002),
UCA.makeKey(0x0000, 0x0168, 0x0002),
UCA.makeKey(0x0000, 0x0169, 0x0002),
UCA.makeKey(0x0000, 0x016A, 0x0002),
UCA.makeKey(0x0000, 0x016B, 0x0002),
UCA.makeKey(0x0000, 0x016C, 0x0002),
UCA.makeKey(0x0000, 0x016D, 0x0002),
UCA.makeKey(0x0000, 0x016E, 0x0002),
UCA.makeKey(0x0000, 0x016F, 0x0002),
UCA.makeKey(0x0000, 0x0170, 0x0002),
};
/**
 * Builds the expansion string for the collation elements
 * originalces[expansionStart..len) by mapping CE subsequences back to the
 * strings recorded in backMap (keyed by ArrayWrapper).
 *
 * @param backMap        map from ArrayWrapper (CE sequence) to source String
 * @param originalces    collation elements of chr; not modified (a clone is used)
 * @param expansionStart index of the first CE that belongs to the expansion
 * @param len            number of valid entries in originalces
 * @param chr            the source string, used only in the error message
 * @param rel            one-element in/out array holding the relation; rel[0]
 *                       is lowered to 1 when a CE from ignorableList is dropped
 * @return the concatenated back-mapped strings; a CE that cannot be mapped
 *         contributes a bracketed hex placeholder and is reported on System.out
 */
static final String getFromBackMap(Map backMap, int[] originalces, int expansionStart, int len, String chr, int[] rel) {
// work on a copy: the loops below overwrite tertiary weights in place
int[] ces = (int[])(originalces.clone());
String expansion = "";
// process ces to neutralize tertiary
// (every tertiary in the expansion range is collapsed to one of 8, 0xE or 2)
for (int i = expansionStart; i < len; ++i) {
int probe = ces[i];
char primary = collator.getPrimary(probe);
char secondary = collator.getSecondary(probe);
char tertiary = collator.getTertiary(probe);
int tert = tertiary;
switch (tert) {
case 8: case 9: case 0xA: case 0xB: case 0xC: case 0x1D:
tert = 8;
break;
case 0xD: case 0x10: case 0x11: case 0x12: case 0x13: case 0x1C:
tert = 0xE;
break;
default:
tert = 2;
break;
}
ces[i] = collator.makeKey(primary, secondary, tert);
}
// Walk the expansion range; i is advanced at the bottom to the end of
// whatever subsequence was consumed.
for (int i = expansionStart; i < len;) {
int limit;
String s = null;
// longest match first: try ces[i..len), then shrink the end one CE at a time
for (limit = len; limit > i; --limit) {
ArrayWrapper wrapper = new ArrayWrapper(ces, i, limit);
s = (String)backMap.get(wrapper);
if (s != null) break;
}
// no subsequence starting at i is in the map: fall back for the single CE at i
if (s == null) {
do {
// a listed ignorable contributes nothing to the expansion text
if (contains(ignorableList, 0, ignorableList.length, ces[i])) {
s = "";
if (rel[0] > 1) rel[0] = 1; // HACK
break;
}
// Try stomping the value to different tertiaries
// (note: ces[i] is overwritten, so later messages show the stomped value)
int probe = ces[i];
char primary = collator.getPrimary(probe);
char secondary = collator.getSecondary(probe);
ces[i] = collator.makeKey(primary, secondary, 2);
ArrayWrapper wrapper = new ArrayWrapper(ces, i, i+1);
s = (String)backMap.get(wrapper);
if (s != null) break;
ces[i] = collator.makeKey(primary, secondary,0xE);
wrapper = new ArrayWrapper(ces, i, i+1);
s = (String)backMap.get(wrapper);
if (s != null) break;
/*
int meHack = UCA.makeKey(0x1795,0x0020,0x0004);
if (ces[i] == meHack) {
s = "\u3081";
break;
}
*/
// we failed completely. Print error message, and bail
System.out.println("No back map for " + collator.ceToString(ces[i])
+ " from " + CEList.toString(ces, len));
System.out.println("\t" + ucd.getCodeAndName(chr)
+ " => " + ucd.getCodeAndName(nfkdNew.normalize(chr))
);
// placeholder so the gap is visible in the generated output
s = "[" + Utility.hex(ces[i]) + "]";
} while (false); // exactly one time, just for breaking
// the fallback always consumes exactly one CE
limit = i + 1;
}
expansion += s;
i = limit;
}
return expansion;
}
/*
static final String getFromBackMap(Map backMap, int[] ces, int index, int limit) {
ArrayWrapper wrapper = new ArrayWrapper(ces, index, limit);
int probe = ces[index];
wrapperContents[0] = probe;
String s = (String)backMap.get(wrapper);
outputLen[0] = 1;
if (s != null) return s;
char primary = collator.getPrimary(probe);
@ -1473,25 +1727,31 @@ public class WriteCollationData implements UCD_Types {
break;
}
probe = collator.makeKey(primary, secondary, tert);
s = (String)backMap.get(new Integer(probe));
wrapperContents[0] = probe;
s = (String)backMap.get(wrapper);
if (s != null) return s;
probe = collator.makeKey(primary, secondary, collator.NEUTRAL_TERTIARY);
s = (String)backMap.get(new Integer(probe));
wrapperContents[0] = probe;
s = (String)backMap.get(wrapper);
}
if (s != null) return s;
if (primary != 0 && secondary != collator.NEUTRAL_SECONDARY) {
String first = getFromBackMap(backMap,
collator.makeKey(primary, collator.NEUTRAL_SECONDARY, tertiary));
String second = getFromBackMap(backMap,
collator.makeKey(0, secondary, collator.NEUTRAL_TERTIARY));
int[] dummyArray = new int[1];
dummyArray[0] = collator.makeKey(primary, collator.NEUTRAL_SECONDARY, tertiary);
String first = getFromBackMap(backMap, dummyArray, 0, outputLen);
dummyArray[0] = collator.makeKey(0, secondary, collator.NEUTRAL_TERTIARY);
String second = getFromBackMap(backMap, dummyArray, 0, outputLen);
if (first != null && second != null) {
s = first + second;
}
}
return s;
}
*/
static final String[] RELATION = {
"<", " << ", " <<< ", " = ", " = ", " = ", " >>> ", " >> ", ">"

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $
* $Date: 2002/04/23 01:59:16 $
* $Revision: 1.6 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -283,7 +283,7 @@ public class WriteHTMLCollation implements UCD_Types {
}
log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
for (char ch = 0; ch < 0xFFFF; ++ch) {
if (!nfkd.normalizationDiffers(ch)) continue;
if (nfkd.isNormalized(ch)) continue;
if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul
String sortKey = collator.getSortKey(String.valueOf(ch), UCA.NON_IGNORABLE, decomposition);
String decompSortKey = collator.getSortKey(nfkd.normalize(ch), UCA.NON_IGNORABLE, decomposition);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2002/04/23 01:59:13 $
* $Revision: 1.13 $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -281,7 +281,7 @@ public final class DerivedProperty implements UCD_Types {
}
public String getValue(int cp, byte style) {
if (nfx.normalizationDiffers(cp)) return NO;
if (!nfx.isNormalized(cp)) return NO;
else if (nfx.isTrailing(cp)) return MAYBE;
else return "";
}
@ -598,7 +598,7 @@ of characters, the first of which has a non-zero combining class.
}
boolean hasValue(int cp) {
if (hasSoftDot(cp)) return true;
if (!Default.nfkd.normalizationDiffers(cp)) return false;
if (Default.nfkd.isNormalized(cp)) return false;
String decomp = Default.nfd.normalize(cp);
boolean ok = false;
for (int i = decomp.length()-1; i >= 0; --i) {
@ -700,7 +700,7 @@ of characters, the first of which has a non-zero combining class.
// if (true) throw new IllegalArgumentException("FIX Default.nf[2]");
if (!Default.nf[NFKD].normalizationDiffers(cp)) return Lo;
if (Default.nf[NFKD].isNormalized(cp)) return Lo;
String norm = Default.nf[NFKD].normalize(cp);
int cp2;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2002/04/23 01:59:14 $
* $Revision: 1.9 $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -416,7 +416,7 @@ public class GenerateCaseFolding implements UCD_Types {
static boolean specialNormalizationDiffers(int ch) {
if (ch == 0x00DF) return true; // es-zed
return Default.nfkd.normalizationDiffers(ch);
return !Default.nfkd.isNormalized(ch);
}
static String specialNormalization(String s) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.18 $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -1232,7 +1232,7 @@ public class GenerateData implements UCD_Types {
Utility.dot(i);
if (!Default.ucd.isRepresented(i)) continue;
if (!Default.nfd.normalizationDiffers(i)) {
if (Default.nfd.isNormalized(i)) {
if (Default.ucd.getScript(i) == LATIN_SCRIPT) {
int cp = i;
String hex = "u" + Utility.hex(cp, 4);
@ -1358,7 +1358,7 @@ public class GenerateData implements UCD_Types {
for (int i = 0; i < 0x10FFFF; ++i) {
if ((i & 0xFFF) == 0) System.out.println("# " + i);
if (!Default.ucd.isAssigned(i)) continue;
if (!Default.nfd.normalizationDiffers(i)) continue;
if (Default.nfd.isNormalized(i)) continue;
String decomp = Default.nfd.normalize(i);
int cp;
for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp)) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.12 $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -119,10 +119,12 @@ public final class Main implements UCD_Types {
if (arg.equalsIgnoreCase("All")) {
// Append all args at end
/*
String[] temp = new String[args.length + ALL_FILES.length];
System.arraycopy(args, 0, temp, 0, args.length);
System.arraycopy(ALL_FILES, 0, temp, args.length, ALL_FILES.length);
args = temp;
*/
args = Utility.append(args, ALL_FILES);
expanding = true;
// EXTRACTED PROPERTIES

View File

@ -67,7 +67,7 @@ public final class NFSkippable extends UnicodeProperty {
if (!ucd.isAssigned(cp)) return true;
if (DEBUG) cause = "\t\tnf differs";
if (nf.normalizationDiffers(cp)) return false;
if (!nf.isNormalized(cp)) return false;
if (DEBUG) cause = "\t\tnon-zero cc";
if (ucd.getCombiningClass(cp) != 0) return false;
@ -87,7 +87,7 @@ public final class NFSkippable extends UnicodeProperty {
// "displaced", so we don't have to test further
if (DEBUG) cause = "\t\tno decomp";
if (!nfd.normalizationDiffers(cp)) return true;
if (nfd.isNormalized(cp)) return true;
// OPTIMIZATION -- careful
// Hangul syllables are skippable IFF they are isLeadingJamoComposition
@ -265,7 +265,7 @@ public final class NFSkippable extends UnicodeProperty {
byte cat = skipper.ucd.getCategory(cp);
if (cat == PRIVATE_USE || cat == SURROGATE) continue;
if (skipper.ucd.getCombiningClass(cp) != 0) continue;
if (skipper.nf.normalizationDiffers(cp)) continue;
if (!skipper.nf.isNormalized(cp)) continue;
if ((cp < 0xAC00 || cp > 0xAE00)
&& cp != skipper.ucd.mapToRepresentative(cp, false)) continue;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.8 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -205,8 +205,8 @@ public final class Normalizer implements UCD_Types {
* normalizer.
* @param ch the source character
*/
public boolean normalizationDiffers(int ch) {
return data.normalizationDiffers(ch, composition, compatibility);
public boolean isNormalized(int ch) {
    // The underlying data object reports whether normalization would differ
    // for ch under this normalizer's composition/compatibility settings;
    // "is normalized" is the logical opposite of that answer.
    final boolean differs = data.normalizationDiffers(ch, composition, compatibility);
    return !differs;
}
/**

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.13 $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -273,7 +273,7 @@ public class VerifyUCD implements UCD_Types {
}
public static boolean checkNormalizer(Normalizer x, int cp) {
boolean result = x.normalizationDiffers(cp);
boolean result = !x.isNormalized(cp);
if (false) {
String s = x.normalize(cp);
boolean sResult = !s.equals(UTF16.valueOf(cp));
@ -291,7 +291,7 @@ public class VerifyUCD implements UCD_Types {
Utility.dot(cp);
if (!Default.ucd.isAllocated(cp)) continue;
if (!Default.nfd.normalizationDiffers(cp)) continue;
if (Default.nfd.isNormalized(cp)) continue;
String decomp = Default.nfd.normalize(cp);
String comp = Default.nfc.normalize(cp);
@ -979,12 +979,12 @@ can help you narrow these down.
if (cp == 0x3131) {
System.out.println("Debug: " + idnProhibited
+ ", " + idnUnassigned
+ ", " + Default.nfkd.normalizationDiffers(cp)
+ ", " + !Default.nfkd.isNormalized(cp)
+ ", " + Default.ucd.getCodeAndName(Default.nfkc.normalize(cp))
+ ", " + Default.ucd.getCodeAndName(Default.nfc.normalize(cp)));
}
if (!idnProhibited && ! idnUnassigned && Default.nfkd.normalizationDiffers(cp)) {
if (!idnProhibited && ! idnUnassigned && !Default.nfkd.isNormalized(cp)) {
String kc = Default.nfkc.normalize(cp);
String c = Default.nfc.normalize(cp);
if (kc.equals(c)) continue;
@ -1415,7 +1415,7 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.dot(cp);
if (!Default.ucd.isAssigned(cp)) continue;
if (Default.ucd.isPUA(cp)) continue;
if (!normalizationDiffers(cp, j)) continue;
if (isNormalized(cp, j)) continue;
if (cp == 0xFDFB || cp == 0x0140) {
System.out.println("debug point");
@ -1478,9 +1478,9 @@ E0020-E007F; [TAGGING CHARACTERS]
return Default.ucd.getCase(s, FULL, FOLD);
}
static boolean normalizationDiffers(int cp, int j) {
if (j < 4) return Default.nf[j].normalizationDiffers(cp);
return true;
/**
 * Reports whether the code point is already normalized under the
 * normalizer selected by index j.
 *
 * @param cp the code point to test
 * @param j index into Default.nf; values below 4 select a Normalizer
 * @return the selected normalizer's isNormalized(cp) for j below 4;
 *         false for any other index, so callers that skip normalized
 *         code points never skip on behalf of such an index
 */
static boolean isNormalized(int cp, int j) {
    // Delegate directly: the result must NOT be negated here, otherwise the
    // method would report "normalization differs" under an "is normalized" name.
    if (j < 4) return Default.nf[j].isNormalized(cp);
    return false;
}
// Display labels indexed in parallel with Default.nf[j]; they are printed in
// the "Testing ... for" progress message. The first label read "Default.nfd",
// apparently a search-and-replace casualty; it is restored to "NFD" so that
// it matches the naming of its siblings.
private static final String[] NAMES = {"NFD", "NFC", "NFKD", "NFKC", "Fold"};
@ -1489,7 +1489,7 @@ E0020-E007F; [TAGGING CHARACTERS]
for (int j = 0; j < 4; ++j) {
Normalizer nfx = Default.nf[j];
System.out.println();
System.out.println("Testing normalizationDiffers for " + NAMES[j]);
System.out.println("Testing isNormalized for " + NAMES[j]);
System.out.println();
for (int i = 0; i < 0x10FFFF; ++i) {
Utility.dot(i);
@ -1497,7 +1497,7 @@ E0020-E007F; [TAGGING CHARACTERS]
if (Default.ucd.isPUA(i)) continue;
String s = nfx.normalize(i);
boolean differs = !s.equals(UTF32.valueOf32(i));
boolean call = nfx.normalizationDiffers(i);
boolean call = !nfx.isNormalized(i);
if (differs != call) {
Utility.fixDot();
System.out.println("Problem: differs: " + differs
@ -1597,7 +1597,7 @@ E0020-E007F; [TAGGING CHARACTERS]
static public void verifyNormalizationStability2(String version) {
Default.nfd.normalizationDiffers(0x10300);
// Default.nfd.normalizationDiffers(0x10300);
UCD older = UCD.make(version); // Default.ucd.getPreviousVersion();
@ -1640,7 +1640,7 @@ E0020-E007F; [TAGGING CHARACTERS]
} else {
// not in older version.
// (1) If there is a decomp, and it is composed of all OLD characters, then it must NOT compose
if (Default.nfd.normalizationDiffers(i)) {
if (!Default.nfd.isNormalized(i)) {
String decomp = Default.nfd.normalize(i);
if (noneHaveCategory(decomp, Cn, older)) {
String recomp = Default.nfc.normalize(decomp);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2002/04/24 02:38:52 $
* $Revision: 1.15 $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -22,6 +22,13 @@ import com.ibm.text.UCD.*;
public final class Utility { // COMMON UTILITIES
static final boolean UTF8 = true; // TODO -- make argument
/**
 * Concatenates two string arrays into a newly allocated array.
 *
 * @param array1 elements placed first in the result
 * @param array2 elements placed after those of array1
 * @return a new array of length array1.length + array2.length; neither
 *         input array is modified
 */
public static String[] append(String[] array1, String[] array2) {
    final int headLength = array1.length;
    final int tailLength = array2.length;
    String[] joined = new String[headLength + tailLength];
    // The two copies write to disjoint regions, so their order is irrelevant.
    System.arraycopy(array2, 0, joined, headLength, tailLength);
    System.arraycopy(array1, 0, joined, 0, headLength);
    return joined;
}
public static String getName(int i, String[] names) {
try {