scuffed-code/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java

889 lines
34 KiB
Java
Raw Normal View History

2001-08-31 00:30:17 +00:00
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2004/11/12 23:17:15 $
* $Revision: 1.16 $
2001-08-31 00:30:17 +00:00
*
*******************************************************************************
*/
2001-08-30 20:50:18 +00:00
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import java.util.*;
import java.text.NumberFormat;
import java.io.*;
2001-08-31 00:30:17 +00:00
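/**
 * Simple program that reads the Unicode Character Database (UCD) text files
 * listed in {@link #labelList} and merges them into a single binary data file
 * (see {@link #writeJavaData()}). The XML output path is commented out.
 * @author Mark Davis
 */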
public final class ConvertUCD implements UCD_Types {
2002-03-20 00:21:43 +00:00
public static final boolean SHOW = false;
2001-08-30 20:50:18 +00:00
public static final boolean DEBUG = false;
static final boolean SHOW_SAMPLE = false;
2001-08-31 00:30:17 +00:00
int major;
int minor;
int update;
2001-08-31 00:30:17 +00:00
String version;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// varies by version
/*
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR20 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR21 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
*/
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
//public static final String blocksnamePlain = "Blocks.txt";
//public static final String blocksname31 = "Blocks-4d2.beta";
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/** First item is file name, rest are field names (skipping character).
* "OMIT" is special -- means don't record
*/
static String[][] labelList = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
2002-03-15 00:34:46 +00:00
//{"ExtraProperties", "xp"},
2001-08-31 00:30:17 +00:00
{"PropList", "binary"},
2001-08-30 20:50:18 +00:00
//{"ExtraProperties", "xp"},
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions", "ce"},
{"CaseFolding", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts", "sn"},
//{"Jamo", "jn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
//*/
};
static HashMap isHex = new HashMap();
static HashMap defaults = new HashMap();
static {
for (int j = 0; j < labelList.length; ++j) {
String[] labels = labelList[j];
for (int i = 1; i < labels.length; ++i) {
boolean hex = false;
String def = null;
//char appendChar = '\u0000';
// pull off "*": hex interpretation
if (labels[i].charAt(0) == '*') { // HEX value
hex = true;
labels[i] = labels[i].substring(1);
}
/*
// pull off "$": append duplicates
if (labels[i].charAt(0) == '$') { // HEX value
appendChar = labels[i].charAt(1);
labels[i] = labels[i].substring(2);
}
// pull off default values
int pos = labels[i].indexOf('-');
if (pos >= 0) {
def = labels[i].substring(pos+1);
labels[i] = labels[i].substring(0,pos);
}
*/
// store results
// we do this after all processing, so that the label is clean!!
if (hex) isHex.put(labels[i], "");
//if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
defaults.put(labels[i], def);
}
}
}
2001-08-30 20:50:18 +00:00
/*
static String[][] labelList31 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
2001-08-31 00:30:17 +00:00
{"PropList-3.1.0d5.beta", "binary"},
2001-08-30 20:50:18 +00:00
{"ExtraProperties", "xp"},
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-3.1.0d4.beta", "sn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
{"Jamo", "jn"},
//
};
/*
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"ExtraProperties", "xp"},
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
2001-08-31 00:30:17 +00:00
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
2001-08-30 20:50:18 +00:00
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-1d4", "sn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
{"Jamo", "jn"},
//
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
//"NamesList-3.1.0d1.beta"
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String[][] labelList30 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"CompositionExclusions", "ce"},
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
{"CaseFolding", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
/*
{"Jamo", "jn"},
2001-08-31 00:30:17 +00:00
{"PropList.alpha", "RANGE", "OMIT"},
2001-08-30 20:50:18 +00:00
//
};
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String[][] labelList11 = {
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String[][] labelList20 = {
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String[][] labelList21 = {
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
*/
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// handles
public static final String blocksname = "Blocks";
//public static final String[][] labelList;
public static final boolean NEWPROPS = true;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/*
static {
switch (major*10 + minor) {
case 31:
blocksname = blocksname31;
labelList = labelList31;
break;
case 30:
blocksname = blocksnamePlain;
labelList = labelList30;
break;
case 21:
blocksname = blocksnamePlain;
labelList = labelList21;
break;
case 20:
blocksname = blocksnamePlain;
labelList = labelList20;
break;
default:
blocksname = blocksnamePlain;
labelList = labelList11;
break;
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
*/
static final String dataFilePrefix = "UCD_Data";
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// MAIN!!
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
public static void main (String[] args) throws Exception {
2002-03-20 00:21:43 +00:00
System.out.println("Building binary version of UCD");
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
log = new PrintWriter(new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(GEN_DIR + "UCD-log.txt"),
"UTF8"),
32*1024));
log.write("\uFEFF"); // BOM
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
try {
for (int i = 0; i < args.length; ++i) {
String version = args[i];
2001-08-30 20:50:18 +00:00
if (version.length() == 0) version = UCD.latestVersion;
2001-08-31 00:30:17 +00:00
new ConvertUCD().toJava(version);
2001-08-30 20:50:18 +00:00
}
} finally {
log.close();
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/*
static void toXML() throws Exception {
// Blocks is special
// Unihan is special
// collect all the other .txt files in the directory
if (false) readBlocks();
if (true) for (int i = 0; i < labelList.length; ++i) {
readSemi(labelList[i]);
} else {
readSemi(labelList[0]); // TESTING ONLY
}
writeXML();
}
*/
2001-08-31 00:30:17 +00:00
void toJava(String version) throws Exception {
this.version = version;
String[] parts = new String[3];
Utility.split(version, '.', parts);
major = Integer.parseInt(parts[0]);
minor = Integer.parseInt(parts[1]);
update = Integer.parseInt(parts[2]);
2001-09-01 01:11:13 +00:00
System.out.println("Building " + version);
2001-08-30 20:50:18 +00:00
// Blocks is special
// Unihan is special
// collect all the other .txt files in the directory
if (false) readBlocks();
if (true) for (int i = 0; i < labelList.length; ++i) {
readSemi(labelList[i]);
} else {
readSemi(labelList[0]); // TESTING ONLY
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
Iterator it = charData.keySet().iterator();
while (it.hasNext()) {
Object key = it.next();
UData value = (UData) charData.get(key);
value.compact();
}
2002-03-20 00:21:43 +00:00
2004-03-11 19:04:00 +00:00
/*
2002-03-20 00:21:43 +00:00
UData ud;
ud = getEntry(0x5e);
System.out.println("SPOT-CHECK: 5e: " + ud);
2002-03-20 00:21:43 +00:00
ud = getEntry(0x130);
System.out.println("SPOT-CHECK: 130: " + ud);
ud = getEntry(0x1f6);
System.out.println("SPOT-CHECK: 1f6: " + ud);
2002-03-20 00:21:43 +00:00
ud = getEntry(0x2A6D6);
2001-08-30 20:50:18 +00:00
System.out.println("SPOT-CHECK: 2A6D6: " + ud);
2002-03-20 00:21:43 +00:00
2001-08-30 20:50:18 +00:00
ud = getEntry(0xFFFF);
System.out.println("SPOT-CHECK: FFFF: " + ud);
2004-03-11 19:04:00 +00:00
*/
2001-08-30 20:50:18 +00:00
writeJavaData();
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static PrintWriter log;
//static String directory = BASE_DIR;
//static Map appendDuplicates = new HashMap();
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/** First item in labels is file name, rest are field names (skipping character).
* "OMIT" is special -- means don't record
*/
2001-08-31 00:30:17 +00:00
List blockData = new LinkedList();
2001-08-31 00:30:17 +00:00
void readBlocks() throws Exception {
2001-08-30 20:50:18 +00:00
System.out.println("Reading 'Blocks'");
2002-10-05 01:28:58 +00:00
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1);
2001-08-30 20:50:18 +00:00
String line = "";
try {
String[] parts = new String[20];
for (int lineNumber = 1; ; ++lineNumber) {
line = input.readLine();
if (line == null) break;
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
//String original = line;
String comment = "";
int commentPos = line.indexOf('#');
if (commentPos >= 0) {
comment = line.substring(commentPos+1).trim();
line = line.substring(0, commentPos);
}
line = line.trim();
if (line.length() == 0) continue;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int count = Utility.split(line,';',parts);
if (count != 3) throw new ChainException("Bad count in Blocks", null);
blockData.add(new String[] {Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim()});
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} catch (Exception e) {
System.out.println("Exception at: " + line);
throw e;
} finally {
input.close();
}
}
2001-08-31 00:30:17 +00:00
Set properties = new TreeSet();
2001-08-31 00:30:17 +00:00
void readSemi(String[] labels) throws Exception {
2001-08-30 20:50:18 +00:00
System.out.println();
System.out.println("Reading '" + labels[0] + "'");
if (major < 3 || (major == 3 && minor < 1)) {
if (labels[0] == "PropList") {
System.out.println("SKIPPING old format of Proplist for " + version);
return;
}
}
String tempVersion = version;
if (version.equals(UCD.latestVersion)) tempVersion = "";
2002-10-05 01:28:58 +00:00
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, Utility.LATIN1);
2001-08-30 20:50:18 +00:00
if (input == null) {
System.out.println("COULDN'T OPEN: " + labels[0]);
return;
}
boolean showedSemi = false;
boolean showedShort = false;
String line = "";
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
try {
String[] parts = new String[20];
for (int lineNumber = 1; ; ++lineNumber) {
line = input.readLine();
if (line == null) break;
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
String original = line;
String comment = "";
int commentPos = line.indexOf('#');
if (commentPos >= 0) {
comment = line.substring(commentPos+1).trim();
line = line.substring(0, commentPos);
}
line = line.trim();
if (line.length() == 0) continue;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int count = Utility.split(line,';',parts);
2001-08-31 00:30:17 +00:00
2004-03-11 19:04:00 +00:00
if (false && parts[0].equals("2801")) {
2001-08-30 20:50:18 +00:00
System.out.println("debug?");
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// fix malformed or simple lists.
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if (count != labels.length) {
if (count == labels.length + 1 && parts[count-1].equals("")) {
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
showedSemi = true;
} else if (count == 1) { // fix simple list
++count;
parts[1] = "Y";
} else if (count < labels.length) {
if (!showedShort) System.out.println("Line shorter than labels: " + original);
showedShort = true;
for (int i = count; i < labels.length; ++i) {
parts[i] = "";
}
} else {
2001-08-31 00:30:17 +00:00
throw new ChainException("wrong count: {0}",
2001-08-30 20:50:18 +00:00
new Object[] {new Integer(line), new Integer(count)});
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// store char
// first field is always character OR range. May be UTF-32
int cpTop;
int cpStart;
int ddot = parts[0].indexOf(".");
if (ddot >= 0) {
cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
2002-03-15 00:34:46 +00:00
// System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
2001-08-30 20:50:18 +00:00
} else {
cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
cpTop = cpStart;
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// properties first
if (labels[1].equals("PROP")) {
String prop = parts[2].trim();
// FIX!!
boolean skipLetters = false;
if (prop.equals("Alphabetic")) {
prop = "Other_Alphabetic";
skipLetters = true;
}
// END FIX!!
properties.add(prop);
2002-03-15 00:34:46 +00:00
if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted
2001-08-30 20:50:18 +00:00
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
2001-08-31 00:30:17 +00:00
if (end == 0) end = cpStart;
2001-08-30 20:50:18 +00:00
for (int j = cpStart; j <= end; ++j) {
2004-03-11 19:04:00 +00:00
if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue;
2001-08-30 20:50:18 +00:00
if (skipLetters && getEntry(cpStart).isLetter()) continue;
appendCharProperties(j, prop);
}
}
} else { // not range!
String val = "";
String lastVal;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
for (int i = 1; i < labels.length; ++i) {
String key = labels[i];
lastVal = val;
if (isHex.get(key) != null) {
val = Utility.fromHex(parts[i]);
} else {
val = parts[i].trim();
}
if (key.equals("OMIT")) continue; // do after val, so lastVal is correct
if (key.equals("RANGE")) continue; // do after val, so lastVal is correct
if (val.equals("")) continue; // skip empty values, they mean default
for (int cps = cpStart; cps <= cpTop; ++cps) {
2004-03-11 19:04:00 +00:00
if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if (key.equals("binary")) {
appendCharProperties(cps, val);
} else if (key.equals("fc")) {
UData data = getEntry(cps);
String type = parts[i-1].trim();
if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
data.fullCaseFolding = val;
//System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
}
if (type.equals("S") || type.equals("C") || type.equals("L")) {
data.simpleCaseFolding = val;
//System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
}
if (type.equals("I")) {
data.simpleCaseFolding = val;
setBinaryProperty(cps, CaseFoldTurkishI);
2004-03-11 19:04:00 +00:00
if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
2002-03-20 00:21:43 +00:00
+ Utility.hex(cps) + ": " + Utility.hex(val));
}
} else if (labels[0].equals("SpecialCasing") // special handling for special casing
&& labels[4].equals("sc")
&& parts[4].trim().length() > 0) {
if (i < 4) {
if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
+ Utility.hex(key) + ":" + Utility.hex(val));
addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
2001-08-30 20:50:18 +00:00
}
} else {
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
UData data = getEntryIfExists(cps);
if (data == null || data.generalCategory == Cn) continue;
}
*/
addCharData(cps, key, val);
}
}
}
}
}
} catch (Exception e) {
System.out.println("Exception at: " + line + ", " + e.getMessage());
throw e;
} finally {
input.close();
}
//printValues("JOINING_TYPE", jtSet);
//printValues("JOINING_GROUP", jgSet);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static void printValues(String title, Set s) {
Iterator it = s.iterator();
System.out.println("public static String[] " + title + " = {");
while (it.hasNext()) {
String value = (String) it.next();
System.out.println(" \"" + value + "\",");
}
System.out.println("};");
it = s.iterator();
System.out.println("public static byte ");
int count = 0;
while (it.hasNext()) {
String value = (String) it.next();
System.out.println(" " + value.replace(' ', '-').toUpperCase() + " = " + (count++) + ",");
}
System.out.println(" LIMIT_" + title + " = " + count);
System.out.println(";");
}
2001-08-31 00:30:17 +00:00
Map charData = new TreeMap();
2001-08-31 00:30:17 +00:00
/*
2001-08-30 20:50:18 +00:00
static void writeXML() throws IOException {
System.out.println("Writing 'UCD-Main.xml'");
BufferedWriter output = new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml"),
"UTF8"),
32*1024);
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
try {
// write header
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
output.write("<UnicodeCharacterDatabase>\r\n");
output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
output.write(" <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n");
output.write(" <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n");
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write blocks
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
Iterator it = blockData.iterator();
while (it.hasNext()) {
String[] block = (String[]) it.next();
2001-08-31 00:30:17 +00:00
output.write(" <block start='" + Utility.quoteXML(block[0])
2001-08-30 20:50:18 +00:00
+ "' end='" + Utility.quoteXML(block[1])
+ "' name='" + Utility.quoteXML(block[2])
+ "'/>\r\n" );
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write char data
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
it = charData.keySet().iterator();
while (it.hasNext()) {
Integer cc = (Integer) it.next();
2002-04-24 02:38:53 +00:00
output.write(" <e c='" + Utility.quoteXML(cc.intValue()) + "'");
2001-08-30 20:50:18 +00:00
/*
UData data = (UData) charData.get(cc);
Iterator dataIt = data.keySet().iterator();
while (dataIt.hasNext()) {
String label = (String) dataIt.next();
if (label.equals("c")) continue; // already wrote it.
if (label.equals("fc")) {
String fc = getResolved(data, "fc");
String lc = getResolved(data, "lc");
if (!fc.equals(lc) && !lc.equals(cc)) log.println("FC " + fc.length() + ": " + toString(cc));
}
String value = Utility.quoteXML((String) data.get(label));
output.write(" " + label + "='" + value + "'");
}
*//*
2001-08-30 20:50:18 +00:00
output.write("/>\r\n");
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write footer
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
output.write("</UnicodeCharacterDatabase>\r\n");
} finally {
output.close();
}
}
*/
void writeJavaData() throws IOException {
2001-08-30 20:50:18 +00:00
Iterator it = charData.keySet().iterator();
int codePoint = -1;
System.out.println("Writing " + dataFilePrefix + version);
DataOutputStream dataOut = new DataOutputStream(
new BufferedOutputStream(
new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
128*1024));
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write header
dataOut.writeByte(BINARY_FORMAT);
dataOut.writeByte(major);
dataOut.writeByte(minor);
dataOut.writeByte(update);
long millis = System.currentTimeMillis();
dataOut.writeLong(millis);
dataOut.writeInt(charData.size());
System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
int count = 0;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write records
try {
// write char data
while (it.hasNext()) {
Object cc = (Object) it.next();
//codePoint = UTF32.char32At(cc,0);
if (DEBUG) System.out.println(Utility.hex(cc));
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
UData uData = (UData) charData.get(cc);
if (false && uData.name == null) {
System.out.println("Warning: NULL name\r\n" + uData);
System.out.println();
}
2004-03-11 19:04:00 +00:00
if (false && uData.codePoint == 0x2801) {
2001-08-30 20:50:18 +00:00
System.out.println("SPOT-CHECK: " + uData);
}
uData.writeBytes(dataOut);
count++;
if (DEBUG) System.out.println("Setting2");
}
System.out.println("Wrote Data " + count);
} catch (Exception e) {
throw new ChainException("Bad data write {0}", new Object [] {Utility.hex(codePoint)}, e);
} finally {
dataOut.close();
}
}
2001-08-31 00:30:17 +00:00
//static String[] xsSplit = new String[40];
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// Cache a little bit for speed
int getEntryCodePoint = -1;
UData getEntryUData = null;
2001-08-31 00:30:17 +00:00
UData getEntryIfExists(int cp) {
2001-08-30 20:50:18 +00:00
if (cp == getEntryCodePoint) return getEntryUData;
Integer cc = new Integer(cp);
UData charEntry = (UData) charData.get(cc);
if (charEntry == null) return null;
getEntryCodePoint = cp;
getEntryUData = charEntry;
return charEntry;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/* Get entry in table for cc
*/
UData getEntry(int cp) {
2001-08-30 20:50:18 +00:00
if (cp == getEntryCodePoint) return getEntryUData;
Integer cc = new Integer(cp);
UData charEntry = (UData) charData.get(cc);
if (charEntry == null) {
charEntry = new UData(cp);
charData.put(cc, charEntry);
//charEntry.put("c", cc);
}
getEntryCodePoint = cp;
getEntryUData = charEntry;
return charEntry;
}
/** Adds the character data. Signals duplicates with an exception
*/
void setBinaryProperty(int cp, int binProp) {
2001-08-30 20:50:18 +00:00
UData charEntry = getEntry(cp);
charEntry.binaryProperties |= (1L << binProp);
2001-08-30 20:50:18 +00:00
}
2001-08-31 00:30:17 +00:00
void appendCharProperties(int cp, String key) {
2001-08-30 20:50:18 +00:00
int ind;
//if (true || NEWPROPS) {
2002-03-15 00:34:46 +00:00
ind = Utility.lookup(key, UCD_Names.BP, true);
2001-08-30 20:50:18 +00:00
/*} else {
ind = Utility.lookup(key, UCD_Names.BP_OLD);
}
*/
//charEntry.binaryProperties |= (1 << ind);
setBinaryProperty(cp, ind);
}
2001-08-31 00:30:17 +00:00
Set jtSet = new TreeSet();
Set jgSet = new TreeSet();
2003-02-25 23:38:23 +00:00
2001-08-30 20:50:18 +00:00
/** Adds the character data. Signals duplicates with an exception
*/
void addCharData(int cp, String key, String value) {
2001-08-30 20:50:18 +00:00
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
UData charEntry = getEntry(cp);
//if (cp < 10) System.out.println(" " + charEntry);
2003-02-25 23:38:23 +00:00
if (SHOW_SAMPLE && cp == 0x221) {
System.out.println("Sample: " + cp + ", " + key + ", " + value);
System.out.println(charEntry);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if (key.equals("bm")) {
if (value.equals("Y")) charEntry.binaryProperties |= 1;
} else if (key.equals("ce")) {
charEntry.binaryProperties |= 2;
} else if (key.equals("on")) {
if (charEntry.name.charAt(0) == '<') {
charEntry.name = '<' + value + '>';
}
} else if (key.equals("dm")) {
charEntry.decompositionType = CANONICAL;
if (value.charAt(0) == '<') {
int pos = value.indexOf('>');
String dType = value.substring(1,pos);
if (major < 2) if (dType.charAt(0) == '+') dType = dType.substring(1);
value = value.substring(pos+1);
setField(charEntry, "dt", dType);
}
// FIX OLD
if (major < 2) {
int oldStyle = value.indexOf('<');
if (oldStyle > 0) {
value = value.substring(0,oldStyle);
}
oldStyle = value.indexOf('{');
if (oldStyle > 0) {
value = value.substring(0,oldStyle);
}
}
setField(charEntry, key, Utility.fromHex(value));
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// fix the numeric fields to be more sensible
} else if (key.equals("dd")) {
if (charEntry.numericType < UCD_Types.DECIMAL) {
charEntry.numericType = UCD_Types.DECIMAL;
}
setField(charEntry, "nv", value);
} else if (key.equals("dv")) {
if (charEntry.numericType < UCD_Types.DIGIT) {
charEntry.numericType = UCD_Types.DIGIT;
}
setField(charEntry, "nv", value);
} else if (key.equals("nv")) {
if (charEntry.numericType < UCD_Types.NUMERIC) {
charEntry.numericType = UCD_Types.NUMERIC;
}
setField(charEntry, "nv", value);
/*} else if (key.equals("jt")) {
jtSet.add(value);
} else if (key.equals("jg")) {
jgSet.add(value);
*/
} else {
setField(charEntry, key, value);
}
2003-02-25 23:38:23 +00:00
if (SHOW_SAMPLE && cp == 0x221) {
System.out.println("Sample Result:");
System.out.println(charEntry);
}
2001-08-30 20:50:18 +00:00
}
2001-08-31 00:30:17 +00:00
public void setField(UData uData, String fieldName, String fieldValue) {
2001-08-30 20:50:18 +00:00
try {
if (fieldName.equals("n")) {
uData.name = fieldValue;
} else if (fieldName.equals("dm")) {
uData.decompositionMapping = fieldValue;
} else if (fieldName.equals("bg")) {
uData.bidiMirror = fieldValue;
} else if (fieldName.equals("uc")) {
uData.simpleUppercase = fieldValue;
} else if (fieldName.equals("lc")) {
uData.simpleLowercase = fieldValue;
} else if (fieldName.equals("tc")) {
uData.simpleTitlecase = fieldValue;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("su")) {
uData.fullUppercase = fieldValue;
} else if (fieldName.equals("sl")) {
2002-03-20 00:21:43 +00:00
if (DEBUG) System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData);
2001-08-30 20:50:18 +00:00
uData.fullLowercase = fieldValue;
} else if (fieldName.equals("st")) {
uData.fullTitlecase = fieldValue;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("sc")) {
2002-03-20 00:21:43 +00:00
if (uData.specialCasing.length() > 0) {
uData.specialCasing += ";";
}
uData.specialCasing += fieldValue;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("xp")) {
uData.binaryProperties |= 1L << Utility.lookup(fieldValue, UCD_Names.BP, true);
2001-08-30 20:50:18 +00:00
//UCD_Names.BP_OLD
} else if (fieldName.equals("gc")) {
uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true);
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("bc")) {
uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true);
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("dt")) {
if (major < 2) {
if (fieldValue.equals("no-break")) fieldValue = "noBreak";
else if (fieldValue.equals("circled")) fieldValue = "circle";
else if (fieldValue.equals("sup")) fieldValue = "super";
else if (fieldValue.equals("break")) fieldValue = "compat";
else if (fieldValue.equals("font variant")) fieldValue = "font";
else if (fieldValue.equals("no-join")) fieldValue = "compat";
else if (fieldValue.equals("join")) fieldValue = "compat";
}
uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.LONG_DECOMPOSITION_TYPE, true);
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("nt")) {
uData.numericType = Utility.lookup(fieldValue, UCD_Names.LONG_NUMERIC_TYPE, true);
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("ea")) {
uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EAST_ASIAN_WIDTH, true);
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("lb")) {
uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LINE_BREAK, true);
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("sn")) {
uData.script = Utility.lookup(fieldValue, UCD_Names.LONG_SCRIPT, true);
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("jt")) {
2002-03-15 00:34:46 +00:00
uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE, true);
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("jg")) {
byte temp = (byte)Utility.find(fieldValue, UCD_Names.OLD_JOINING_GROUP, true);
if (temp != -1) uData.joiningGroup = temp;
else uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.JOINING_GROUP, true);
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("nv")) {
if (major < 2) {
if (fieldValue.equals("-")) return;
}
2003-03-12 16:01:26 +00:00
uData.numericValue = Utility.doubleFrom(fieldValue);
2001-08-30 20:50:18 +00:00
} else if (fieldName.equals("cc")) {
uData.combiningClass = (byte)Utility.intFrom(fieldValue);
} else if (fieldName.equals("bp")) {
uData.binaryProperties = (byte)Utility.longFrom(fieldValue);
2001-08-30 20:50:18 +00:00
} else {
throw new IllegalArgumentException("Unknown fieldName");
}
} catch (Exception e) {
throw new ChainException(
"Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
}