X-SVN-Rev: 5824
This commit is contained in:
Mark Davis 2001-09-19 23:33:52 +00:00
parent dee8a86dee
commit 42bddd7bf5
24 changed files with 1015 additions and 220 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
* $Date: 2001/08/31 00:20:40 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -102,7 +102,8 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
for (int i = startOffset; i < min; ++i) {
if (contents[i] != that.contents[i + delta]) {
if (contents[i] < that.contents[i + delta]) return -1;
if ((contents[i] & 0xFFFFFFFFL)
< (that.contents[i + delta] & 0xFFFFFFFFL)) return -1;
return 1;
}
}
@ -158,7 +159,9 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
public static String toString(int ce) {
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
+ Utility.hex(UCA.getSecondary(ce)) + "."
+ Utility.hex(UCA.getTertiary(ce)) + "](" + NAME3[UCA.getTertiary(ce)] + ")";
+ Utility.hex(UCA.getTertiary(ce)) + "]"
// + "(" + NAME3[UCA.getTertiary(ce)] + ")"
;
}
static final String[] NAME3 = {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
* $Date: 2001/09/06 01:30:31 $
* $Revision: 1.3 $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -43,7 +43,7 @@ public class GenOverlap implements UCD_Types {
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
// store data for faster lookup
@ -307,7 +307,7 @@ public class GenOverlap implements UCD_Types {
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
// store data for faster lookup
@ -505,7 +505,7 @@ public class GenOverlap implements UCD_Types {
//nfd = new Normalizer(Normalizer.NFD);
//nfkd = new Normalizer(Normalizer.NFKD);
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);

View File

@ -0,0 +1,20 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2001/09/19 23:31:50 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
/**
 * Command-line entry point for the UCA tools.
 * For now it only forwards to WriteCollationData.
 */
public class Main {
    /**
     * Hands the command-line arguments, unchanged, to WriteCollationData.main.
     *
     * @param args the command-line arguments
     * @throws Exception whatever WriteCollationData.main throws
     */
    public static void main(String[] args) throws Exception {
        // TODO, pull from there to here.
        WriteCollationData.main(args);
    }
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2001/09/06 01:30:31 $
* $Revision: 1.3 $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -765,6 +765,13 @@ final public class UCA {
*/
static final int EXCEPTION_CE_MASK = 0xFFC00000;
/**
* Used to compose Hangul and Han characters
*/
static final int NEUTRAL_SECONDARY = 0x20;
static final int NEUTRAL_TERTIARY = 0x02;
/**
* Any unsupported characters (those not in the UCA data tables)
* are marked with an exception bit combination
@ -772,14 +779,10 @@ final public class UCA {
* There are at least 34 values, so that we can use a range for surrogates
* However, we do add to the first weight if we have surrogate pairs!
*/
static final int UNSUPPORTED = 0xFFC20101;
static final int UNSUPPORTED_P = 0xFFC2;
static final int UNSUPPORTED = makeKey(UNSUPPORTED_P, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
/**
* Used to composed Hangul and Han characters
*/
static final int NEUTRAL_SECONDARY = 0x20;
static final int NEUTRAL_TERTIARY = 0x02;
// was 0xFFC20101;
/**
* Contracting characters are marked with an exception bit combination
@ -968,9 +971,14 @@ final public class UCA {
// in code order.
// add bottom 5 bits to UNSUPPORTED, and push rest
//return UNSUPPORTED + (bigChar & 0xFFFF0000); // top bits added
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0)); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
return makeKey(UNSUPPORTED_P + (bigChar >> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED
/*
expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
return UNSUPPORTED + ((bigChar << 1) & 0xFFFF0000); // top 34 values plus UNSUPPORTED
*/
}
if (ce == CONTRACTING) {
// Contracting is probably the most interesting (read "tricky") part
@ -1127,11 +1135,11 @@ final public class UCA {
return new Hashtable(multiTable);
}
public CollationContents getCollationContents(byte ceLimit, Normalizer skipDecomps) {
return new CollationContents(ceLimit, skipDecomps);
public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) {
return new UCAContents(ceLimit, skipDecomps);
}
public class CollationContents {
public class UCAContents {
int current = -1;
Normalizer skipDecomps = new Normalizer(Normalizer.NFD);
Iterator enum = null;
@ -1140,16 +1148,15 @@ final public class UCA {
/**
* use FIXED_CE as the limit
*/
CollationContents(byte ceLimit, Normalizer skipDecomps) {
UCAContents(byte ceLimit, Normalizer skipDecomps) {
this.ceLimit = ceLimit;
this.skipDecomps = skipDecomps;
}
/**
* returns a string and its ces
* returns a string
*/
public String next(int[] ces, int[] len) {
public String next() {
String result = null; // null if done
// normal case
@ -1158,7 +1165,6 @@ final public class UCA {
if (getCEType(ch) >= ceLimit) continue;
if (skipDecomps != null && skipDecomps.hasDecomposition(ch)) continue;
result = String.valueOf(ch);
len[0] = getCEs(result, true, ces);
return result;
}
@ -1166,11 +1172,36 @@ final public class UCA {
if (enum == null) enum = multiTable.keySet().iterator();
if (enum.hasNext()) {
result = (String)enum.next();
len[0] = getCEs(result, true, ces);
}
return result;
}
/**
* returns a string and its ces
*/
public String next(int[] ces, int[] len) {
    // Advance first; null means the iteration is finished, and in that
    // case neither ces nor len is touched.
    String nextString = next();
    if (nextString == null) return null;
    // Fill ces with the collation elements and report how many were written.
    len[0] = getCEs(nextString, true, ces);
    return nextString;
}
int[] lengthBuffer = new int[1];
/**
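* fills result with the next string (result.second) and its CEList (result.first);
* returns false, leaving result untouched, when there are no more strings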
*/
public boolean next(Pair result) {
    // Fetch the next string and its CEs into ceListBuffer / lengthBuffer.
    String source = next(ceListBuffer, lengthBuffer);
    boolean hasMore = (source != null);
    if (hasMore) {
        // first = the collation elements, second = the string they belong to
        result.first = new CEList(ceListBuffer, 0, lengthBuffer[0]);
        result.second = source;
    }
    return hasMore;
}
}
/**

View File

@ -0,0 +1,213 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
* $Date: 2001/09/19 23:31:50 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import java.util.*;
import java.io.*;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;
/**
 * Writes the UCA default collation table as a set of HTML charts.
 * Every string known to the collator is sorted by its sort key and written,
 * in order, into one chart file per "chunk" (a script, or one of the
 * pseudo-scripts NULL / IGNORABLE / VARIABLE). An index file links to every
 * chart that was written.
 */
public class WriteCharts implements UCD_Types {

    /** Pseudo-script for strings whose sort key has fewer than 4 characters. */
    private static final byte NULL_CHUNK = -3;
    /** Pseudo-script for strings whose first sort-key character is zero. */
    private static final byte IGNORABLE_CHUNK = -2;
    /** Pseudo-script for strings whose first sort-key character is below the variable limit. */
    private static final byte VARIABLE_CHUNK = -1;
    /** Added to a (possibly negative) chunk value to get a non-negative array index. */
    private static final int CHUNK_OFFSET = 3;

    static UCD ucd;

    /**
     * Generates all chart files and the index file under CollationCharts.
     *
     * @param uca the collator whose contents are charted; its alternate
     *            setting is changed to NON_IGNORABLE
     * @throws IOException if a chart or the index file cannot be opened
     */
    static public void test(UCA uca) throws IOException {

        uca.setAlternate(UCA.NON_IGNORABLE);

        ucd = UCD.make();

        // Pass a Normalizer(Normalizer.NFD) instead of null to skip decomposable characters.
        UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null);

        // Collect (sortKey, string) pairs; the TreeSet puts them in collation order.
        Set set = new TreeSet();
        while (true) {
            String x = cc.next();
            if (x == null) break;
            set.add(new Pair(uca.getSortKey(x), x));
        }

        PrintWriter output = null;
        Iterator it = set.iterator();

        int oldScript = -999;
        // Indexed by chunk + CHUNK_OFFSET, so it needs room for the negative pseudo-scripts too.
        int[] scriptCount = new int[LIMIT_SCRIPT + CHUNK_OFFSET];

        // NOTE(review): counter is never incremented in this method -- confirm
        // whether Utility.dot expects a running count here.
        int counter = 0;
        String lastSortKey = null;

        int high = uca.getSortKey("a").charAt(0);
        int variable = UCA.getPrimary(uca.getVariableHigh());
        int columnCount = 0;

        indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html");
        try {
            indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
            indexFile.println("<title>UCA Default Collation Table</title>");
            indexFile.println("<base target='main'>");
            indexFile.println("</head><body><h2 align='center'>UCA Default Collation Table</h2>");
            indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");

            while (it.hasNext()) {
                Utility.dot(counter);

                Pair p = (Pair) it.next();
                String sortKey = (String) p.first;
                String s = (String) p.second;

                int cp = UTF16.charAt(s,0);

                // Start from the real script, merging Katakana into Hiragana
                // and Inherited into Common.
                byte script = ucd.getScript(cp);
                if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
                else if (script == INHERITED_SCRIPT) script = COMMON_SCRIPT;

                // Then override it from the sort key. The length is checked before
                // the first character is read so an empty key cannot throw.
                if (sortKey.length() < 4) {
                    script = NULL_CHUNK;
                } else {
                    int primary = sortKey.charAt(0);
                    if (primary == 0) script = IGNORABLE_CHUNK;
                    else if (primary < variable) script = VARIABLE_CHUNK;
                    else if (primary < high) script = COMMON_SCRIPT;
                }

                // Close the current chart when the chunk changes. Once a real-script
                // chart is open, a COMMON (or INHERITED) item stays in that chart.
                if (script != oldScript
                        && (oldScript < COMMON_SCRIPT
                            || (script != COMMON_SCRIPT && script != INHERITED_SCRIPT))) {
                    closeFile(output);
                    output = null;
                }

                if (output == null) {
                    int slot = script + CHUNK_OFFSET;
                    ++scriptCount[slot];
                    if (scriptCount[slot] > 1) {
                        // A chunk was opened more than once. getChunkName is used so the
                        // negative pseudo-scripts are reported by name as well.
                        System.out.println("\t\tFAIL: " + scriptCount[slot] + ", " +
                            getChunkName(script) + ", " + ucd.getCodeAndName(s));
                    }
                    output = openFile(scriptCount[slot], script);
                    oldScript = script;
                }

                // 6 means "new first sort-key character"; otherwise use the absolute
                // strength difference from the previous key.
                int strength = 6;
                if (lastSortKey != null
                        && sortKey.length() > 0 && lastSortKey.length() > 0
                        && sortKey.charAt(0) == lastSortKey.charAt(0)) {
                    strength = uca.strengthDifference(sortKey, lastSortKey);
                    if (strength < 0) strength = -strength;
                }
                lastSortKey = sortKey;

                // Start a new table row after 11 cells, or whenever the first
                // sort-key character changes.
                String breaker = "";
                if (columnCount > 10 || strength > 5) {
                    if (strength <= 5) breaker = "</tr><tr><td></td>";
                    else breaker = "</tr><tr>";
                    columnCount = 0;
                }

                // The string is escaped so that characters such as '<' and '&'
                // show up as text instead of corrupting the markup.
                output.println(breaker + CLASSNAME[strength] + quoteHtml(s)
                    + "<br><tt>" + Utility.hex(s)
                    //+ "<br>" + script
                    //+ "<br>" + UCA.toString(sortKey)
                    + "</tt></td>");
                ++columnCount;
            }

            closeFile(output);
            output = null;
            indexFile.println("</body></html>");
        } finally {
            // Release the files even if chart generation failed part-way.
            if (output != null) output.close();
            indexFile.close();
        }
    }

    /** Opening td tag for each strength value 0..6; the class selects the cell style. */
    static final String[] CLASSNAME = {
        "<td class='q'>",
        "<td class='q'>",
        "<td class='q'>",
        "<td class='t'>",
        "<td class='s'>",
        "<td class='p'>",
        "<td class='f'>"};

    /** The index file; opened and closed by test, appended to by openFile. */
    static PrintWriter indexFile;

    /**
     * Opens a new chart file for a chunk, writes its HTML preamble and adds a
     * link to it in the index file.
     *
     * @param count  how many times this chunk has been opened so far; values
     *               above 1 are appended to the file name
     * @param script the script index, or one of the negative pseudo-scripts
     * @return the writer for the new chart, positioned inside the table
     * @throws IOException if the file cannot be opened
     */
    static PrintWriter openFile(int count, byte script) throws IOException {
        String scriptName = getChunkName(script);
        scriptName = ucd.getCase(scriptName, FULL, TITLE);
        String fileName = "chart_" + scriptName + (count > 1 ? count + "" : "") + ".html";
        PrintWriter output = Utility.openPrintWriter("CollationCharts\\" + fileName);
        Utility.fixDot();
        System.out.println("Writing: " + scriptName);
        indexFile.println(" | <a href = '" + fileName + "'>" + scriptName + "</a>");
        String title = "UCA: " + scriptName;
        output.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        output.println("<title>" + title + "</title>");
        output.println("<link rel='stylesheet' href='charts.css' type='text/css'>");
        output.println("</head><body><h2>" + scriptName + "</h2>");
        output.println("<table>");
        return output;
    }

    /**
     * Returns the display name of a chunk: a fixed name for the pseudo-scripts
     * and for the merged Katakana/Hiragana chunk, otherwise the script ID.
     */
    static String getChunkName(byte script) {
        if (script == NULL_CHUNK) return "NULL";
        else if (script == IGNORABLE_CHUNK) return "IGNORABLE";
        else if (script == VARIABLE_CHUNK) return "VARIABLE";
        else if (script == HIRAGANA_SCRIPT) return "KATAKANA-HIRAGANA";
        else return ucd.getScriptID_fromIndex(script);
    }

    /**
     * Writes the HTML footer and closes the chart. Does nothing for null.
     * The table is closed before the body so the elements nest correctly.
     */
    static void closeFile(PrintWriter output) {
        if (output == null) return;
        output.println("</table></body></html>");
        output.close();
    }

    /** Escapes the characters that are markup-significant in HTML text: &amp;, &lt; and &gt;. */
    private static String quoteHtml(String source) {
        StringBuffer result = new StringBuffer(source.length() + 8);
        for (int i = 0; i < source.length(); ++i) {
            char c = source.charAt(i);
            switch (c) {
                case '&': result.append("&amp;"); break;
                case '<': result.append("&lt;"); break;
                case '>': result.append("&gt;"); break;
                default: result.append(c); break;
            }
        }
        return result.toString();
    }
}
/*
static final IntStack p1 = new IntStack(30);
static final IntStack s1 = new IntStack(30);
static final IntStack t1 = new IntStack(30);
static final IntStack p2 = new IntStack(30);
static final IntStack s2 = new IntStack(30);
static final IntStack t2 = new IntStack(30);
static int getStrengthDifference(CEList ceList, CEList lastCEList) {
extractNonzeros(ceList, p1, s1, t1);
extractNonzeros(lastCEList, p2, s2, t2);
int temp = p1.compareTo(p2);
if (temp != 0) return 3;
temp = s1.compareTo(s2);
if (temp != 0) return 2;
temp = t1.compareTo(t2);
if (temp != 0) return 1;
return 0;
}
static void extractNonzeros(CEList ceList, IntStack primaries, IntStack secondaries, IntStack tertiaries) {
primaries.clear();
secondaries.clear();
tertiaries.clear();
for (int i = 0; i < ceList.length(); ++i) {
int ce = ceList.at(i);
int temp = UCA.getPrimary(ce);
if (temp != 0) primaries.push(temp);
temp = UCA.getSecondary(ce);
if (temp != 0) secondaries.push(temp);
temp = UCA.getTertiary(ce);
if (temp != 0) tertiaries.push(temp);
}
}
*/

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2001/09/06 01:30:30 $
* $Revision: 1.3 $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -34,7 +34,6 @@ public class WriteCollationData implements UCD_Types {
static final boolean EXCLUDE_UNSUPPORTED = true;
static final boolean GENERATED_NFC_MISMATCHES = true;
static final boolean DO_CHARTS = true;
static final boolean WRITE_NAME_IN_CONFORMANCE = true;
static UCA collator;
@ -58,12 +57,13 @@ public class WriteCollationData implements UCD_Types {
ucd = UCD.make("");
if (args.length == 0) args = new String[] {"?"}; // force the help comment
boolean hex = false;
boolean shortPrint = false;
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES);
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator);
else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator);
else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator);
else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator);
@ -72,15 +72,15 @@ public class WriteCollationData implements UCD_Types {
else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) checkDisjointIgnorables();
else if (arg.equalsIgnoreCase("writeContractions")) writeContractions();
else if (arg.equalsIgnoreCase("FractionalUCA")) writeFractionalUCA("FractionalUCA");
else if (arg.equalsIgnoreCase("writeConformance")) writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE, hex);
else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED, hex);
else if (arg.equalsIgnoreCase("writeConformance")) writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint);
else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint);
else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) testCompatibilityCharacters();
else if (arg.equalsIgnoreCase("writeCollationValidityLog")) writeCollationValidityLog();
else if (arg.equalsIgnoreCase("writeCaseExceptions")) writeCaseExceptions();
else if (arg.equalsIgnoreCase("writeJavascriptInfo")) writeJavascriptInfo();
else if (arg.equalsIgnoreCase("writeCaseFolding")) writeCaseFolding();
else if (arg.equalsIgnoreCase("javatest")) javatest();
else if (arg.equalsIgnoreCase("hex")) hex = true;
else if (arg.equalsIgnoreCase("short")) shortPrint = true;
else {
System.out.println();
System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)");
@ -339,15 +339,17 @@ public class WriteCollationData implements UCD_Types {
}
static void writeConformance(String filename, byte option, boolean hex) throws IOException {
UCD ucd30 = UCD.make("300");
static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException {
UCD ucd30 = UCD.make("3.0.0");
PrintWriter log = Utility.openPrintWriter(filename);
if (!hex) log.write('\uFEFF');
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt");
if (!shortPrint) log.write('\uFEFF');
System.out.println("Sorting");
int counter = 0;
for (int i = 0; i <= 0x10FFFF; ++i) {
Utility.dot(counter++);
if (!ucd.isRepresented(i)) continue;
addStringX(UTF32.valueOf32(i), option);
}
@ -355,11 +357,14 @@ public class WriteCollationData implements UCD_Types {
Hashtable multiTable = collator.getContracting();
Enumeration enum = multiTable.keys();
while (enum.hasMoreElements()) {
Utility.dot(counter++);
addStringX((String)enum.nextElement(), option);
}
for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters
Utility.dot(counter++);
String s = UTF32.valueOf32(extraConformanceTests[i]);
Utility.fixDot();
System.out.println("Adding: " + Utility.hex(s));
addStringX(s, option);
}
@ -367,6 +372,7 @@ public class WriteCollationData implements UCD_Types {
for (int i = 0; ; ++i) { // add first unallocated character
if (!ucd.isAssigned(i)) {
String s = UTF32.valueOf32(i);
Utility.fixDot();
System.out.println("Adding: " + Utility.hex(s));
addStringX(s, option);
break;
@ -375,6 +381,7 @@ public class WriteCollationData implements UCD_Types {
for (int i = 0; i < extraConformanceRanges.length; ++i) {
Utility.dot(counter++);
int start = extraConformanceRanges[i][0];
int end = extraConformanceRanges[i][1];
int increment = ((end - start + 1) / 303) + 1;
@ -388,6 +395,7 @@ public class WriteCollationData implements UCD_Types {
addStringX(end, option);
}
Utility.fixDot();
System.out.println("Total: " + sortedD.size());
Iterator it;
@ -399,6 +407,7 @@ public class WriteCollationData implements UCD_Types {
String lastKey = "";
while (it.hasNext()) {
Utility.dot(counter);
String key = (String) it.next();
String source = (String) sortedD.get(key);
int fluff = key.charAt(key.length() - 1);
@ -408,14 +417,12 @@ public class WriteCollationData implements UCD_Types {
//log.println(source);
String clipped = source.substring(0, source.length()-1);
String stren = source.substring(source.length()-1);
if (hex) {
if (!shortPrint) {
log.print(Utility.hex(source));
} else {
log.print(source + "\t" + Utility.hex(clipped));
}
if (WRITE_NAME_IN_CONFORMANCE) {
log.print(
";\t#" + ucd.getName(clipped)+ "\t" + UCA.toString(key));
} else {
log.print(source + "\t" + Utility.hex(clipped));
}
log.println();
}
@ -754,7 +761,7 @@ public class WriteCollationData implements UCD_Types {
int[] ces = new int[50];
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
int[] lenArray = new int[1];
diLog.println("# Contractions");
@ -819,7 +826,7 @@ public class WriteCollationData implements UCD_Types {
String s = String.valueOf(ch);
int len = collator.getCEs(s, true, ces);
*/
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
int[] lenArray = new int[1];
Set sortedCodes = new TreeSet();
@ -987,7 +994,7 @@ public class WriteCollationData implements UCD_Types {
String s = String.valueOf(ch);
int len = collator.getCEs(s, true, ces);
*/
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
int[] lenArray = new int[1];
Set sortedCodes = new TreeSet();
@ -1179,7 +1186,7 @@ public class WriteCollationData implements UCD_Types {
java.util.Comparator cm = new RuleComparator();
Map ordered = new TreeMap(cm);
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE,
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE,
SKIP_CANONICAL_DECOMPOSIBLES ? nfd : null);
int[] lenArray = new int[1];

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.3 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -59,7 +59,9 @@ public class DerivedProperty implements UCD_Types {
GraphemeExtend = 27,
GraphemeBase = 28,
LIMIT = 29;
FC_NFC_Closure = 29,
LIMIT = 30;
public DerivedProperty(UCD ucd) {
@ -156,8 +158,8 @@ public class DerivedProperty implements UCD_Types {
compName = "NFD for the character";
}
header = "# Derived Property: " + name
+ "\r\n# Normalized form " + NAME[i-GenNFD] + ", where DIFFERENT from " + compName + "."
+ "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly."
+ "\r\n# Lists characters in normalized form " + NAME[i-GenNFD] + "."
+ "\r\n# Only those characters whith normalized forms are DIFFERENT from " + compName + " are listed!"
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
}
@ -422,6 +424,25 @@ of characters, the first of which has a non-zero combining class.
boolean hasProperty(int cp) { return getProperty(cp).length() != 0; }
};
// FC_NFC_Closure: for a represented code point, the value is the NFC of the
// case folding applied twice, reported only when the second round changes the result.
dprops[FC_NFC_Closure] = new DProp() {
{
name = "FC_NFC_Closure";
header = "# Derived Property: " + name
+ "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
+ "\r\n# Then if (c != b) add the mapping from a to c to the set of"
+ "\r\n# mappings that constitute the FC_NFC_Closure list";
}
public boolean propertyVaries() {return true;} // always reports the property as varying
public String getProperty(int cp) {
// Code points that are not represented have no value.
if (!ucdData.isRepresented(cp)) return "";
String b = nfc.normalize(fold(cp));
String c = nfc.normalize(fold(b));
// Folding and normalizing again changed nothing: no value to report.
if (c.equals(b)) return "";
return "FN; " + Utility.hex(c);
} // returns "" when there is no value, otherwise "FN; " plus the hex of c
// A code point has the property exactly when getProperty returns a non-empty string.
boolean hasProperty(int cp) { return getProperty(cp).length() != 0; }
};
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
dprops[i] = new QuickDProp(i);
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.4 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -27,7 +27,7 @@ final class DerivedPropertyLister extends PropertyLister {
int width;
boolean varies;
public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) {
public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) {
this.propMask = propMask;
this.output = output;
this.ucdData = ucd;
@ -87,7 +87,7 @@ final class DerivedPropertyLister extends PropertyLister {
String last;
public byte status(int cp) {
if (!ucdData.isAssigned(cp)) return EXCLUDE;
if (!ucdData.isAssigned(cp) && propMask != DerivedProperty.DefaultIgnorable) return EXCLUDE;
if (!varies) {
return dprop.hasProperty(cp, propMask) ? INCLUDE : EXCLUDE;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -17,14 +17,11 @@ import java.io.*;
class DiffPropertyLister extends PropertyLister {
private UCD oldUCD;
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) {
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
this.output = output;
this.ucdData = UCD.make(newUCDName);
if (oldUCDName != null) this.oldUCD = UCD.make(oldUCDName);
}
public byte status (int cp) {
return INCLUDE;
breakByCategory = false;
}
public String propertyName(int cp) {
@ -42,7 +39,7 @@ class DiffPropertyLister extends PropertyLister {
*/
public byte status(int lastCp, int cp) {
public byte status(int cp) {
/*if (cp == 0xFFFF) {
System.out.println("# " + Utility.hex(cp));
}
@ -50,6 +47,15 @@ class DiffPropertyLister extends PropertyLister {
return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE;
}
/**
 * Returns the header comment line for this listing: a "differences between"
 * line naming both versions when an old UCD was supplied, otherwise an
 * "allocated as of" line naming only the new version.
 */
public String headerString() {
    String newVersion = ucdData.getVersion();
    if (oldUCD == null) {
        return "# Allocated as of " + newVersion;
    }
    return "# Differences between " + newVersion + " and " + oldUCD.getVersion();
}
/*
public int print() {
String status;
if (oldUCD != null) {
@ -73,6 +79,7 @@ class DiffPropertyLister extends PropertyLister {
output.println();
return count;
}
*/
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -20,7 +20,7 @@ import com.ibm.text.utility.*;
public class GenerateCaseFolding implements UCD_Types {
public static boolean DEBUG = false;
public static UCD ucd = UCD.make("310");
public static UCD ucd = UCD.make("");
public static void main(String[] args) throws java.io.IOException {
makeCaseFold();
@ -285,71 +285,4 @@ public class GenerateCaseFolding implements UCD_Types {
}
return result + "}";
}
/**
 * Writes UnicodeAge.txt in the current directory: a fixed comment header
 * followed by the output of one DiffPropertyLister per step in the version
 * sequence 110, 200, 210, 300, 310 (the first lister has no old version).
 * The stream is closed in the finally block, so it is released even if a
 * lister throws.
 */
static final void getAge() throws IOException {
PrintStream log = new PrintStream(
new BufferedOutputStream (
new FileOutputStream("UnicodeAge.txt"),
4*1024));
try {
log.println("# Derived file showing when various code points were allocated in Unicode");
log.println("# author: M. Davis");
log.println("# generated: " + new Date());
log.println("# Notes:");
log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 110 listing.");
log.println("# - The supplementary private use code points, although allocated earlier,");
log.println("# were NOT specifically listed in the UCD until 3.0.1, and are not included until then.");
// One listing per version step; a null old version means there is nothing to diff against.
new DiffPropertyLister(null, "110", log).print();
new DiffPropertyLister("110", "200", log).print();
new DiffPropertyLister("200", "210", log).print();
new DiffPropertyLister("210", "300", log).print();
new DiffPropertyLister("300", "310", log).print();
// The block below is disabled code that built the same listings from UnicodeSet differences.
/*
printDiff("110", "200");
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
log.println();
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
+ n.format(u11.count()));
log.println();
u11.print(log, false, false, "1.1");
UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
log.println();
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
+ n.format(u20m.count()));
log.println();
u20m.print(log, false, false, "2.0");
UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
log.println();
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
+ n.format(u21m.count()));
log.println();
u21m.print(log, false, false, "2.1");
UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
log.println();
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
+ n.format(u30m.count()));
log.println();
u30m.print(log, false, false, "3.0");
UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
log.println();
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
+ n.format(u31m.count()));
log.println();
u31m.print(log, false, false, "3.1");
*/
} finally {
if (log != null) log.close();
}
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.5 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -22,9 +22,9 @@ import com.ibm.text.utility.*;
public class GenerateData implements UCD_Types {
public static void main (String[] args) throws IOException {
public static void main (String inVersion, String[] args) throws IOException {
System.out.println("START");
ucd = UCD.make();
ucd = UCD.make(inVersion);
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
String version = ucd.getVersion();
@ -36,10 +36,7 @@ public class GenerateData implements UCD_Types {
Utility.fixDot();
System.out.println("Argument: " + args[i]);
if (arg.equalsIgnoreCase("version")) {
version = args[++i];
ucd = UCD.make(version);
} else if (arg.equalsIgnoreCase("partition")) {
if (arg.equalsIgnoreCase("partition")) {
partitionProperties();
} else if (arg.equalsIgnoreCase("list")) {
listProperties();
@ -91,9 +88,12 @@ public class GenerateData implements UCD_Types {
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
mask = Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf);
mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.LIMIT-1);
mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.FC_NFC_Closure-1);
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version );
} else if (arg.equalsIgnoreCase("DerivedAge")) {
generateAge("DerivedAge-" + version );
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedLineBreak-" + version );
@ -181,7 +181,7 @@ public class GenerateData implements UCD_Types {
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
public static void doHeader(String fileName, PrintWriter output, int headerChoice) {
output.println("# " + fileName + ".txt");
output.println("#");
if (headerChoice == HEADER_SCRIPTS) {
@ -203,7 +203,7 @@ public class GenerateData implements UCD_Types {
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName + "dX.txt"));
PrintWriter output = Utility.openPrintWriter(fileName + "dX.txt");
doHeader(fileName, output, headerChoice);
for (int i = 0; i < DerivedProperty.LIMIT; ++i) {
if ((bitMask & (1<<i)) == 0) continue;
@ -218,8 +218,8 @@ public class GenerateData implements UCD_Types {
/*
public static void listStrings(String file, int type, int subtype) throws IOException {
ucd = UCD.make("310");
UCD ucd30 = UCD.make("300");
ucd = UCD.make("3.1.0");
UCD ucd30 = UCD.make("3.0.0");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
for (int i = 0; i < 0x10FFFF; ++i) {
@ -238,7 +238,7 @@ public class GenerateData implements UCD_Types {
*/
public static void generateCompExclusions() throws IOException {
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
PrintWriter output = Utility.openPrintWriter("CompositionExclusionsDelta.txt");
new CompLister(output).print();
output.close();
}
@ -247,10 +247,10 @@ public class GenerateData implements UCD_Types {
UCD oldUCD;
int oldLength = 0;
public CompLister(PrintStream output) {
public CompLister(PrintWriter output) {
this.output = output;
ucdData = UCD.make("310");
oldUCD = UCD.make("300");
ucdData = UCD.make("3.1.0");
oldUCD = UCD.make("3.0.0");
showOnConsole = true;
}
public String propertyName(int cp) {
@ -310,7 +310,7 @@ public class GenerateData implements UCD_Types {
public static void listDifferences() throws IOException {
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "PropertyDifferences.txt"));
PrintWriter output = Utility.openPrintWriter("PropertyDifferences.txt");
for (int i = 1; i < LIMIT_ENUM; ++i) {
int iType = i & 0xFF00;
@ -441,7 +441,7 @@ public class GenerateData implements UCD_Types {
//*/
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file + "dX.txt"));
PrintWriter output = Utility.openPrintWriter(file + "dX.txt");
doHeader(file, output, headerChoice);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
@ -686,4 +686,80 @@ public class GenerateData implements UCD_Types {
};
/**
 * Writes the derived "age" listing to the file named <code>filename + "dX.txt"</code>:
 * a fixed comment header, then one DiffPropertyLister section for each step
 * between consecutive Unicode versions.
 * @param filename base name of the output file; "dX.txt" is appended to it
 * @throws IOException declared because the helpers called here may throw it
 */
static final void generateAge(String filename) throws IOException {
    // Release order. The leading null is the "previous version" argument of the first section.
    final String[] releases = {null, "1.1.0", "2.0.0", "2.1.2", "3.0.0", "3.1.0"};
    final String rule = "# ================================================";

    PrintWriter log = Utility.openPrintWriter(filename + "dX.txt");
    try {
        log.println("# Derived file showing when various code points were allocated in Unicode");
        log.println("# author: M. Davis");
        log.println("# generated: " + new Date());
        log.println("# Notes:");
        log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 1.1.0 listing.");
        log.println("# - The supplementary private use code points and the non-character code points");
        log.println("# were allocated in version 2.0, but not specifically listed in the UCD");
        log.println("# until versions 3.0.1 and 3.1.0 respectively.");

        // One section per adjacent pair of releases: separator, blank line, then the listing.
        for (int r = 1; r < releases.length; ++r) {
            log.println(rule);
            log.println();
            new DiffPropertyLister(releases[r - 1], releases[r], log).print();
        }
        // Disabled code retained from the original:
        /*
        printDiff("110", "200");
        UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
        UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
        UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
        UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
        UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
        log.println();
        log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
        + n.format(u11.count()));
        log.println();
        u11.print(log, false, false, "1.1");
        UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
        log.println();
        log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
        + n.format(u20m.count()));
        log.println();
        u20m.print(log, false, false, "2.0");
        UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
        log.println();
        log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
        + n.format(u21m.count()));
        log.println();
        u21m.print(log, false, false, "2.1");
        UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
        log.println();
        log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
        + n.format(u30m.count()));
        log.println();
        u30m.print(log, false, false, "3.0");
        UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
        log.println();
        log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
        + n.format(u31m.count()));
        log.println();
        u31m.print(log, false, false, "3.1");
        */
    } finally {
        if (log != null) log.close();
    }
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -49,7 +49,7 @@ public final class Main {
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(Utility.split(args[++i],','));
else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyFloatLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -17,7 +17,7 @@ import java.io.*;
class MyFloatLister extends PropertyLister {
private float propMask;
public MyFloatLister(UCD ucd, float f, PrintStream output) {
public MyFloatLister(UCD ucd, float f, PrintWriter output) {
this.propMask = f;
this.output = output;
this.ucdData = ucd;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -22,7 +22,7 @@ final class MyPropertyLister extends PropertyLister {
private int propMask;
public MyPropertyLister(UCD ucd, int propMask, PrintStream output) {
public MyPropertyLister(UCD ucd, int propMask, PrintWriter output) {
this.propMask = propMask;
this.output = output;
this.ucdData = ucd;
@ -67,7 +67,7 @@ final class MyPropertyLister extends PropertyLister {
if (s.length() == 0) s = "Other Combining Class";
return "# " + s;
} else if (main == BINARY_PROPERTIES) {
return "# Binary Property";
return "";
} else if (main == JOINING_GROUP) {
return "";
} else {

View File

@ -0,0 +1,348 @@
package com.ibm.text.UCD;
import java.util.*;
import com.ibm.text.*;
import com.ibm.text.utility.*;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
*/
public class NormalizerSample implements UCD_Types {
static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved.";
public static boolean SHOW_PROGRESS = false;
/**
* Create a normalizer for a given form.
*/
public NormalizerSample(byte form, String unicodeVersion) {
this.composition = (form & COMPOSITION_MASK) != 0;
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
this.data = getData(unicodeVersion);
}
/**
* Create a normalizer for a given form.
*/
public NormalizerSample(byte form) {
this(form,"");
}
/**
* Masks for the form selector
*/
public static final byte
COMPATIBILITY_MASK = 1,
COMPOSITION_MASK = 2;
/**
* Normalization Form Selector
*/
public static final byte
NFD = 0 ,
NFKD = COMPATIBILITY_MASK,
NFC = COMPOSITION_MASK,
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
/**
 * Normalizes text according to this normalizer's form, writing the result into target.
 * The source is first decomposed into target; if the form includes composition,
 * target is then composed in place. An empty source leaves target untouched.
 * NOTE(review): target is not cleared by this method. Callers that want replacement
 * semantics (as hasDecomposition does) must call setLength(0) first.
 * @param source the original text, unnormalized
 * @param target the buffer receiving the normalized text
 * @return the target buffer, for call chaining
 */
public StringBuffer normalize(String source, StringBuffer target) {
    if (source.length() == 0) {
        return target;
    }
    internalDecompose(source, target);
    if (composition) {
        internalCompose(target);
    }
    return target;
}
/**
 * Normalizes text according to this normalizer's form, using a fresh buffer.
 * @param source the original text, unnormalized
 * @return the resulting normalized text
 */
public String normalize(String source) {
    StringBuffer result = new StringBuffer();
    normalize(source, result);
    return result.toString();
}
/**
 * Normalizes a single code point according to this normalizer's form.
 * @param cp the code point to normalize
 * @return the normalized text for that code point
 */
public String normalize(int cp) {
    String asUtf16 = UTF16.valueOf(cp);
    return normalize(asUtf16);
}
/**
 * Scratch buffer reused by hasDecomposition. It is shared mutable instance state, so
 * hasDecomposition must not be called concurrently on the same normalizer.
 */
private StringBuffer hasDecompositionBuffer = new StringBuffer();

/**
 * Tests whether normalizing the code point with this normalizer changes it.
 * The comparison is made on the UTF-16 text of the code point, so a supplementary
 * code point (two code units) that normalizes to itself correctly yields false.
 * @param cp the code point to test
 * @return true if the normalized text differs from the code point itself
 */
public boolean hasDecomposition(int cp) {
    String original = UTF16.valueOf(cp);
    hasDecompositionBuffer.setLength(0);
    normalize(original, hasDecompositionBuffer);
    return !original.equals(hasDecompositionBuffer.toString());
}
/**
 * Utility: reports whether the code point is changed by this normalizer's form.
 * The answer comes from the normalization data (decomposition type and recomposition
 * flags); no text is actually normalized here.
 * @param ch the source code point
 * @return the verdict of the data object for this normalizer's composition and
 *         compatibility settings
 */
public boolean normalizationDiffers(int ch) {
    final boolean differs = data.normalizationDiffers(ch, composition, compatibility);
    return differs;
}
/**
 * Utility: appends the recursive decomposition of a character to a buffer.
 * Whether compatibility mappings are followed in addition to canonical ones is decided
 * by this normalizer's form, not by a parameter.
 * @param ch the source character (a single UTF-16 code unit)
 * @param buffer buffer to which the decomposition is appended
 */
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
    final int codePoint = ch; // the data layer works on int code points
    data.getRecursiveDecomposition(codePoint, buffer, compatibility);
}
// ======================================
// PRIVATES
// ======================================
/**
* The current form.
*/
private boolean composition;
private boolean compatibility;
/**
 * Decomposes text, either canonical or compatibility, into the target buffer.
 * Each source code point is expanded through data.getRecursiveDecomposition
 * (compatibility mappings are followed only when this normalizer's compatibility
 * flag is set). Every resulting code point is then inserted into target so that
 * code points with a non-zero combining class come after any preceding code points
 * of lower or equal class.
 * Note: target is not cleared here; insertion starts from its current end.
 * @param source the original text, unnormalized
 * @param target the buffer receiving the decomposed text
 */
private void internalDecompose(String source, StringBuffer target) {
StringBuffer buffer = new StringBuffer();
int ch32;
// Walk the source by code point; ch32 is assigned in the body before the step expression reads it.
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compatibility);
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
ch = UTF16.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
// Class 0 code points are always appended at the end; only non-zero classes are reordered.
if (chClass != 0) {
// bubble-sort combining marks as necessary
int ch2;
// Move the insertion point left past every code point whose class is greater than chClass.
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
ch2 = UTF16.charAt(target, k-1);
// Stop at the first code point of lower or equal class (equal classes keep their order).
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
target.insert(k, UTF16.valueOf(ch));
}
}
}
/**
 * Composes text in place. Target must already
 * have been decomposed.
 * Uses UTF16, which is a utility class for supplementary character support in Java.
 * The buffer is read at decompPos and rewritten at compPos (compPos never exceeds
 * decompPos on entry to an iteration); the buffer is truncated to compPos at the end.
 * NOTE(review): reads the code point at index 0 unconditionally, so target must be
 * non-empty; normalize only calls this for a non-empty source.
 * @param target input: decomposed text.
 * output: the resulting normalized text.
 */
private void internalCompose(StringBuffer target) {
// Position and value of the most recent starter, i.e. the code point later marks may combine with.
int starterPos = 0;
int starterCh = UTF16.charAt(target,0);
int compPos = UTF16.getCharCount(starterCh); // length of last composition
int lastClass = data.getCanonicalClass(starterCh);
// 256 is presumably above every real combining class, so nothing can combine with a leading mark.
if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
int oldLen = target.length();
// Loop on the decomposed characters, combining where possible
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
ch = UTF16.charAt(target, decompPos);
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ ", decompPos: " + decompPos
+ ", compPos: " + compPos
+ ", ch: " + Utility.hex(ch)
);
int chClass = data.getCanonicalClass(ch);
int composite = data.getPairwiseComposition(starterCh, ch);
// Combine only if a composite exists and ch is not blocked: either the previous kept
// code point had a strictly lower class, or it had class 0.
if (composite != data.NOT_COMPOSITE
&& (lastClass < chClass || lastClass == 0)) {
UTF16.setCharAt(target, starterPos, composite);
// we know that we will only be replacing non-supplementaries by non-supplementaries
// so we don't have to adjust the decompPos
starterCh = composite;
} else {
// ch is kept: it becomes the new starter if its class is 0, and is copied down to compPos.
if (chClass == 0) {
starterPos = compPos;
starterCh = ch;
}
lastClass = chClass;
UTF16.setCharAt(target, compPos, ch);
// If the write changed the buffer length, shift the read position by the same amount.
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
System.out.println("ADJUSTING: " + Utility.hex(target));
decompPos += target.length() - oldLen;
oldLen = target.length();
}
compPos += UTF16.getCharCount(ch);
}
}
// Drop the leftover tail beyond the composed text.
target.setLength(compPos);
}
/**
 * Normalization data for one Unicode version. Everything is derived from the UCD class,
 * which accesses data in the Unicode Character Database.
 */
static class Stub {
    private UCD ucd;
    /** Maps a packed (first, second) code point pair (Long) to its composite (Integer). */
    private HashMap compTable = new HashMap();
    /** Trailing jamo, plus every code point that is the second half of a recorded pair. */
    private BitSet isSecond = new BitSet();
    /** Canonical decomposables that have a pairwise recomposition. */
    private BitSet canonicalRecompose = new BitSet();
    /** Subset of canonicalRecompose whose two components have no compatibility decomposition. */
    private BitSet compatibilityRecompose = new BitSet();
    /** Sentinel returned by getPairwiseComposition when the pair does not compose. */
    static final int NOT_COMPOSITE = 0xFFFF;

    /**
     * Builds the composition table and the helper bit sets for the given version.
     * @param version Unicode version string passed to UCD.make
     */
    Stub(String version) {
        ucd = UCD.make(version);
        // Inclusive upper bound so that the last code point, U+10FFFF, is examined too.
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if (!ucd.isAssigned(i)) continue;
            if (ucd.isPUA(i)) continue;
            if (ucd.isTrailingJamo(i)) isSecond.set(i);
            byte dt = ucd.getDecompositionType(i);
            if (dt != CANONICAL) continue;
            if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
                try {
                    String s = ucd.getDecompositionMapping(i);
                    int len = UTF16.countCodePoint(s);
                    // Only two-code-point mappings can be recomposed pairwise.
                    if (len != 2) {
                        if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                        continue;
                    }
                    int a = UTF16.charAt(s, 0);
                    // Skip mappings whose first code point is itself a combining mark.
                    if (ucd.getCombiningClass(a) != 0) continue;
                    int b = UTF16.charAt(s, UTF16.getCharCount(a));
                    isSecond.set(b);
                    // have a recomposition, so set the bit
                    canonicalRecompose.set(i);
                    // set the compatibility recomposition bit
                    // ONLY if the component characters
                    // don't compatibility decompose
                    if (ucd.getDecompositionType(a) <= CANONICAL
                            && ucd.getDecompositionType(b) <= CANONICAL) {
                        compatibilityRecompose.set(i);
                    }
                    compTable.put(pairKey(a, b), new Integer(i));
                } catch (Exception e) {
                    throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
                }
            }
        }
    }

    /** Packs two code points into the single key used by compTable. */
    private static Long pairKey(int first, int second) {
        return new Long((((long) first) << 32) | second);
    }

    /** Returns the combining class that the UCD reports for the code point. */
    short getCanonicalClass(int cp) {
        return ucd.getCombiningClass(cp);
    }

    /** Returns whether the code point was recorded as a possible second element of a pair. */
    boolean isTrailing(int cp) {
        return isSecond.get(cp);
    }

    /**
     * Decides, from decomposition type alone, whether the code point changes under a form.
     * @param cp the code point
     * @param composition true for the composing forms
     * @param compatibility true for the compatibility forms
     */
    boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
        byte dt = ucd.getDecompositionType(cp);
        if (!composition) {
            if (compatibility) return dt >= CANONICAL;
            else return dt == CANONICAL;
        } else {
            // almost the same, except that we add back in the characters
            // that RECOMPOSE
            if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
            else return dt == CANONICAL && !canonicalRecompose.get(cp);
        }
    }

    /**
     * Appends the recursive decomposition of a code point to the buffer.
     * @param cp the code point to decompose
     * @param buffer buffer to append to (it is not cleared)
     * @param compatibility if true, compatibility mappings are followed as well as canonical ones
     */
    public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
        byte dt = ucd.getDecompositionType(cp);
        // we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
        boolean decomposes = dt == CANONICAL || (dt > CANONICAL && compatibility);
        if (!decomposes) {
            UTF16.append(buffer, cp);
            return;
        }
        String mapping = ucd.getDecompositionMapping(cp);
        int part;
        for (int i = 0; i < mapping.length(); i += UTF16.getCharCount(part)) {
            part = UTF16.charAt(mapping, i);
            getRecursiveDecomposition(part, buffer, compatibility);
        }
    }

    /**
     * Looks up the composite of a (starter, following) pair.
     * Hangul is tried first via UCD.composeHangul, which is expected to return 0xFFFF
     * (the same value as NOT_COMPOSITE) when the pair is not a Hangul composition.
     * @return the composite code point, or NOT_COMPOSITE if the pair does not compose
     */
    int getPairwiseComposition(int starterCh, int ch) {
        int hangulPoss = UCD.composeHangul(starterCh, ch);
        if (hangulPoss != NOT_COMPOSITE) return hangulPoss;
        Object obj = compTable.get(pairKey(starterCh, ch));
        if (obj == null) return NOT_COMPOSITE;
        return ((Integer)obj).intValue();
    }
}
/**
 * Normalization data (derived from the Unicode Character Database) used by this
 * normalizer; obtained from getData for the requested version.
 */
private Stub data;

/** Stub instances built so far, keyed by version string. */
private static HashMap versionCache = new HashMap();

/**
 * Returns the Stub for a version, building and caching it on first use.
 * @param version Unicode version string; an empty string selects UCD.latestVersion
 */
private static Stub getData (String version) {
    String key = (version.length() == 0) ? UCD.latestVersion : version;
    Object cached = versionCache.get(key);
    if (cached != null) {
        return (Stub) cached;
    }
    Stub fresh = new Stub(key);
    versionCache.put(key, fresh);
    return fresh;
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -15,6 +15,7 @@ package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import java.text.NumberFormat;
abstract public class PropertyLister implements UCD_Types {
@ -24,9 +25,10 @@ abstract public class PropertyLister implements UCD_Types {
protected UCD ucdData;
protected PrintStream output;
protected PrintWriter output;
protected boolean showOnConsole;
protected boolean usePropertyComment = true;
protected boolean breakByCategory = true;
protected int firstRealCp = -2;
protected int lastRealCp = -2;
protected boolean alwaysBreaks = false; // set to true if property only breaks
@ -51,7 +53,7 @@ abstract public class PropertyLister implements UCD_Types {
}
public String optionalComment(int cp) {
if (!usePropertyComment) return "";
if (!usePropertyComment || !breakByCategory) return "";
int cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
return ucdData.getCategoryID(cp);
@ -167,7 +169,7 @@ abstract public class PropertyLister implements UCD_Types {
if (s == INCLUDE && firstRealCp != -1) {
byte cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll) cat = Lu;
if (cat != firstRealCpCat) s = BREAK;
if (breakByCategory && cat != firstRealCpCat) s = BREAK;
}
switch(s) {
@ -208,9 +210,12 @@ abstract public class PropertyLister implements UCD_Types {
}
if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header);
NumberFormat nf = NumberFormat.getInstance();
nf.setMaximumFractionDigits(0);
output.println();
output.println("# Total code points: " + count);
output.println("# Total code points: " + nf.format(count));
output.println();
return count;
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2001/09/01 00:06:15 $
* $Revision: 1.3 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -145,7 +145,7 @@ public class TestData implements UCD_Types {
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
public static void doHeader(String fileName, PrintWriter output, int headerChoice) {
output.println("# " + fixFile(fileName));
output.println("#");
if (headerChoice == HEADER_SCRIPTS) {
@ -167,8 +167,8 @@ public class TestData implements UCD_Types {
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
ucd = UCD.make("310");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
ucd = UCD.make("3.1.0");
PrintWriter output = Utility.openPrintWriter(fileName);
doHeader(fileName, output, headerChoice);
for (int i = 0; i < 32; ++i) {
if ((bitMask & (1<<i)) == 0) continue;
@ -183,9 +183,9 @@ public class TestData implements UCD_Types {
/*
public static void listStrings(String file, int type, int subtype) throws IOException {
ucd = UCD.make("310");
UCD ucd30 = UCD.make("300");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
ucd = UCD.make("3.1.0");
UCD ucd30 = UCD.make("3.0.0");
PrintWriter output = new PrintStream(new FileOutputStream(GEN_DIR + file));
for (int i = 0; i < 0x10FFFF; ++i) {
if ((i & 0xFFF) == 0) System.out.println("# " + i);
@ -203,7 +203,7 @@ public class TestData implements UCD_Types {
*/
public static void generateCompExclusions() throws IOException {
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
PrintWriter output = Utility.openPrintWriter("CompositionExclusionsDelta.txt");
new CompLister(output).print();
output.close();
}
@ -212,10 +212,10 @@ public class TestData implements UCD_Types {
UCD oldUCD;
int oldLength = 0;
public CompLister(PrintStream output) {
public CompLister(PrintWriter output) {
this.output = output;
ucdData = UCD.make("310");
oldUCD = UCD.make("300");
ucdData = UCD.make("3.1.0");
oldUCD = UCD.make("3.0.0");
showOnConsole = true;
}
public String propertyName(int cp) {
@ -249,7 +249,7 @@ public class TestData implements UCD_Types {
//*/
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
PrintWriter output = Utility.openPrintWriter(file);
doHeader(file, output, headerChoice);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -42,6 +42,7 @@ public final class UCD implements UCD_Types {
*/
public static UCD make(String version) {
if (version == null || version.length() == 0) version = latestVersion;
if (version.indexOf('.') < 0) throw new IllegalArgumentException("Version must be of form 3.1.1");
UCD result = (UCD)versionCache.get(version);
if (result == null) {
result = new UCD();
@ -74,6 +75,7 @@ public final class UCD implements UCD_Types {
if (major < 2 && codePoint > 0xFFFF) return false;
return true; // Noncharacter
}
if (major >= 2 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
return false;
}
@ -439,6 +441,21 @@ public final class UCD implements UCD_Types {
return get(codePoint, false).script;
}
/**
 * Returns a script value for a whole string: the script of the last code point whose
 * script is not INHERITED_SCRIPT.
 * @param s the text to examine; may be null
 * @return COMMON_SCRIPT if s is null, empty, or contains only INHERITED_SCRIPT code points;
 *         otherwise the script of the last non-inherited code point
 */
public byte getScript(String s) {
    if (s == null || s.length() == 0) {
        return COMMON_SCRIPT;
    }
    byte lastSeen = COMMON_SCRIPT;
    int pos = 0;
    while (pos < s.length()) {
        int codePoint = UTF32.char32At(s, pos);
        byte current = getScript(codePoint);
        if (current != INHERITED_SCRIPT) {
            lastSeen = current;
        }
        // advance by the number of UTF-16 units this code point occupies
        pos += UTF32.count16(codePoint);
    }
    return lastSeen;
}
/**
 * Returns the age field of the record that get(codePoint, false) yields.
 * NOTE(review): the meaning of the boolean argument to get(...) is not visible here; confirm.
 * @param codePoint the code point to look up
 */
public byte getAge(int codePoint) {
return get(codePoint, false).age;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -15,8 +15,8 @@ package com.ibm.text.UCD;
public interface UCD_Types {
public static final String DATA_DIR = "C:\\DATA\\";
public static final String BIN_DIR = DATA_DIR + "\\BIN\\";
public static final String GEN_DIR = DATA_DIR + "\\GEN\\";
public static final String BIN_DIR = DATA_DIR + "BIN\\";
public static final String GEN_DIR = DATA_DIR + "GEN\\";
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.4 $
* $Date: 2001/09/19 23:33:15 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -20,6 +20,7 @@ import java.math.BigDecimal;
import java.util.*;
import java.io.*;
//import java.text.*;
import com.ibm.text.*;
import com.ibm.text.utility.*;
@ -331,6 +332,7 @@ public class VerifyUCD implements UCD_Types {
System.out.println("Checking Prohibited and Unassigned");
System.out.println();
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (mappedOut.get(cp)) continue;
boolean ucdUnassigned = !ucd.isAllocated(cp);
@ -339,33 +341,89 @@ public class VerifyUCD implements UCD_Types {
boolean idnProhibited = prohibited.get(cp);
if (ucdUnassigned && !idnUnassigned) {
showError("UCD Unassigned but not IDN Unassigned: ", cp);
showError("?UCD Unassigned but not IDN Unassigned", cp, "");
++errorCount;
} else if (!ucdUnassigned && idnUnassigned) {
showError("Not UCD Unassigned but IDN Unassigned: ", cp);
showError("?Not UCD Unassigned but IDN Unassigned", cp, "");
++errorCount;
}
if (idnProhibited && unassigned.get(cp)) {
showError("Both IDN Unassigned AND IDN Prohibited: ", cp);
showError("?Both IDN Unassigned AND IDN Prohibited", cp, "");
++errorCount;
}
if (guess && !idnProhibited) {
showError("UCD ?prohibited? but not IDN Prohibited: ", cp);
showError("?UCD ?prohibited? but not IDN Prohibited ", cp, "");
++errorCount;
} else if (!guess && idnProhibited) {
showError("Not UCD ?prohibited? but IDN Prohibited: ", cp);
showError("?Not UCD ?prohibited? but IDN Prohibited ", cp, "");
++errorCount;
}
if (cp == 0x3131) {
System.out.println("Debug: " + idnProhibited
+ ", " + idnUnassigned
+ ", " + nfkc.hasDecomposition(cp)
+ ", " + ucd.getCodeAndName(nfkc.normalize(cp))
+ ", " + ucd.getCodeAndName(nfc.normalize(cp)));
}
if (!idnProhibited && ! idnUnassigned && nfkc.hasDecomposition(cp)) {
String kc = nfkc.normalize(cp);
String c = nfc.normalize(cp);
if (kc.equals(c)) continue;
int cp2;
boolean excluded = false;
for (int j = 0; j < kc.length(); j += UTF16.getCharCount(cp2)) {
cp2 = UTF16.charAt(kc, j);
if (prohibited.get(cp2)) {
showError("Prohibited with NFKC, but output with NFC", cp, "");
excluded = true;
break;
}
}
if (!excluded) {
showError("Remapped to core abstract character with NFKC (but not NFC)", cp, ""); // , "\t=> " + ucd.getCodeAndName(kc));
}
}
}
System.out.println();
System.out.println("Total Errors: " + errorCount);
System.out.println("Writing IDNCheck.txt");
PrintWriter log = Utility.openPrintWriter("IDNCheck.txt");
log.println("IDN Check");
log.println("Total Errors: " + errorCount);
Iterator it = idnMap.keySet().iterator();
while (it.hasNext()) {
String description = (String) it.next();
Map map = (Map) idnMap.get(description);
log.println();
log.println(description);
log.println("Total: " + map.size());
log.println();
Iterator it2 = map.keySet().iterator();
while (it2.hasNext()) {
Object key = it2.next();
String line = (String) map.get(key);
log.println(" " + line);
}
}
log.close();
}
static void showError(String description, int cp) {
System.out.println(description + ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")");
/** Collected findings: description -> (Integer code point -> report line), the inner map being sorted. */
static Map idnMap = new HashMap();

/**
 * Records a finding for later reporting instead of printing it immediately.
 * The line is filed under its description, keyed by code point, so recording the same
 * description and code point again replaces the earlier line.
 * @param description heading under which the finding is grouped
 * @param cp the code point concerned
 * @param option extra text appended to the report line
 */
static void showError(String description, int cp, String option) {
    Map bucket = (Map) idnMap.get(description);
    if (bucket == null) {
        bucket = new TreeMap();
        idnMap.put(description, bucket);
    }
    Integer key = new Integer(cp);
    String line = ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")" + option;
    bucket.put(key, line);
}
@ -611,8 +669,7 @@ E0020-E007F; [TAGGING CHARACTERS]
if (reason.equals("Map out")) {
value = Utility.fromHex(parts[1]);
Utility.fixDot();
System.out.println("Note, Mapping Out: " + ucd.getCodeAndName(cp)
+ ", " + ucd.getCodeAndName(value) + ", " + ucd.getCategoryID(cp));
showError("Mapping Out: ", cp, "");
mappedOut.set(cp);
}
idnFold.put(key, value);
@ -1033,26 +1090,37 @@ E0020-E007F; [TAGGING CHARACTERS]
int sum = 0;
long start, end;
java.text.NumberFormat nf = java.text.NumberFormat.getPercentInstance();
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy0(i).length();
}
end = System.currentTimeMillis();
double base = end - start;
System.out.println("unsynchronized static char[]: " + nf.format((end - start)/base));
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy2(i).length();
}
end = System.currentTimeMillis();
System.out.println("synchronized: " + (end - start));
System.out.println("synchronized static char[]: " + nf.format((end - start)/base));
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy1(i).length();
}
end = System.currentTimeMillis();
System.out.println("char[] each time: " + (end - start));
System.out.println("char[] each time: " + nf.format((end - start)/base));
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy3(i).length();
}
end = System.currentTimeMillis();
System.out.println("String +: " + (end - start));
System.out.println("two valueofs: " + nf.format((end - start)/base));
System.out.println(sum);
}
@ -1074,6 +1142,12 @@ E0020-E007F; [TAGGING CHARACTERS]
}
}
// Timing candidate: builds a two-char String from the high and low 16 bits of a,
// reusing the shared temp2 array without any synchronization.
// NOTE(review): temp2 is declared elsewhere; presumably a static char[] of length 2. Confirm.
// The statement form is deliberately left as is, since it is what the timing loop measures.
static String dummy0(int a) {
temp2[0] = (char)(a >>> 16);
temp2[1] = (char)a;
return new String(temp2);
}
// Timing candidate: builds the same two-char String (high 16 bits, then low 16 bits of a)
// with String.valueOf plus string concatenation instead of a char array.
// The expression form is deliberately left as is, since it is what the timing loop measures.
static String dummy3(int a) {
return String.valueOf((char)(a >>> 16)) + (char)a;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $
* $Date: 2001/08/31 00:19:16 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:52 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -17,7 +17,7 @@ package com.ibm.text.utility;
// Simple stack mechanism, with push, pop and access
// =============================================================
public final class IntStack {
public final class IntStack implements Comparable {
private int[] values;
private int top = 0;
@ -51,4 +51,31 @@ public final class IntStack {
/** Returns true when the stack holds no elements (top is 0). */
public boolean isEmpty() {
return top == 0;
}
/**
 * Empties the stack by resetting top to 0. The backing array is neither shrunk nor
 * zeroed, so slots at or above top may still hold old values.
 */
public void clear() {
top = 0;
}
/**
 * Orders stacks lexicographically from the bottom element up; when one stack is a
 * prefix of the other, the shorter one sorts first.
 * Only the live elements (indices below top) of each stack are examined.
 * @param other the IntStack to compare with
 * @return a negative value, zero, or a positive value as this stack is less than,
 *         equal to, or greater than the other
 * @throws ClassCastException if other is not an IntStack
 */
public int compareTo(Object other) {
    IntStack that = (IntStack) other;
    // Compare only the prefix that both stacks actually contain.
    int common = top;
    if (that.top < common) common = that.top;
    for (int i = 0; i < common; ++i) {
        int mine = values[i];
        int theirs = that.values[i];
        // Explicit comparison: subtracting arbitrary ints can overflow and flip the sign.
        if (mine != theirs) return mine < theirs ? -1 : 1;
    }
    // Sizes are never negative, so this difference cannot overflow.
    return top - that.top;
}
/**
 * Two stacks are equal when they hold the same number of elements with the same values
 * in the same order. Any other object, including null, is unequal.
 * Looks at exactly the data hashCode uses (top and the elements below it).
 * @param other the object to compare with; may be null or of another type
 */
public boolean equals(Object other) {
    if (this == other) return true;
    if (!(other instanceof IntStack)) return false;
    IntStack that = (IntStack) other;
    if (top != that.top) return false;
    for (int i = 0; i < top; ++i) {
        if (values[i] != that.values[i]) return false;
    }
    return true;
}
/**
 * Hash over the element count and the live elements, bottom to top:
 * starts from top and folds each value in with multiplier 37.
 */
public int hashCode() {
    int hash = top;
    int index = 0;
    while (index < top) {
        hash = hash * 37 + values[index];
        ++index;
    }
    return hash;
}
}

View File

@ -5,15 +5,15 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Pair.java,v $
* $Date: 2001/08/31 00:19:16 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:52 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.utility;
public final class Pair implements java.lang.Comparable {
public final class Pair implements java.lang.Comparable, Cloneable {
public Comparable first, second;
@ -41,4 +41,12 @@ public final class Pair implements java.lang.Comparable {
if (trial != 0) return trial;
return second.compareTo(that.second);
}
/**
 * Returns a copy of this pair made by super.clone(); the copy refers to the same
 * first and second objects as the original.
 * @return the new Pair; never null
 */
public Object clone() {
    try {
        return super.clone();
    } catch (CloneNotSupportedException e) {
        // Cannot happen: this class implements Cloneable. Fail loudly rather than hand
        // callers a null that would surface later as an unrelated NullPointerException.
        throw new InternalError("Pair implements Cloneable but clone() failed: " + e);
    }
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java,v $
* $Date: 2001/08/31 00:19:16 $
* $Revision: 1.2 $
* $Date: 2001/09/19 23:33:52 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -72,6 +72,8 @@ public final class UTF8StreamWriter extends Writer {
int utf32 = buffer[cStart++];
if (utf32 == 0x0D) continue; // skip write
// special check for surrogates
if (highSurrogate != 0) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2001/09/06 01:29:03 $
* $Revision: 1.3 $
* $Date: 2001/09/19 23:33:52 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -408,12 +408,15 @@ public final class Utility { // COMMON UTILITIES
private static final String[] searchPath = {
"EXTRAS",
"3.1.2",
"3.2.0",
"3.1.1",
"3.1.0",
"3.0.1",
"3.0.0",
"2.1.9",
"2.1.8",
"2.1.5",
"2.1.2",
"2.0.0",
"1.1.0",
};