scuffed-code/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java

280 lines
12 KiB
Java
Raw Normal View History

package com.ibm.text.UCD;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.UnicodeDataFile;
public class MakeUnicodeFiles {
static boolean DEBUG = true;
public static void main() throws IOException {
generateFile("Scripts","z");
}
static class OrderedMap {
HashMap map = new HashMap();
ArrayList keys = new ArrayList();
void put(Object o, Object t) {
map.put(o,t);
keys.add(o);
}
List keyset() {
return keys;
}
}
static class PrintStyle {
boolean longForm = false;
boolean noLabel = false;
boolean makeUppercase = false;
boolean makeFirstLetterLowercase = false;
String skipValue = null;
String skipUnassigned = null;
boolean orderByRangeStart = false;
boolean valueList = false;
PrintStyle setLongForm(boolean value) {
longForm = value;
return this;
}
PrintStyle setSkipUnassigned(String value) {
skipUnassigned = value;
return this;
}
PrintStyle setNoLabel(boolean value) {
noLabel = value;
return this;
}
PrintStyle setMakeUppercase(boolean value) {
makeUppercase = value;
return this;
}
PrintStyle setMakeFirstLetterLowercase(boolean value) {
makeFirstLetterLowercase = value;
return this;
}
PrintStyle setSkipValue(String value) {
skipValue = value;
return this;
}
PrintStyle setOrderByRangeStart(boolean value) {
orderByRangeStart = value;
return this;
}
PrintStyle setValueList(boolean value) {
valueList = value;
return this;
}
}
static PrintStyle DEFAULT_PRINT_STYLE = new PrintStyle();
static Comparator skeletonComparator = new UnicodeProperty.SkeletonComparator();
static Map printStyles = new TreeMap(/*skeletonComparator*/);
static {
printStyles.put("Script", new PrintStyle().setLongForm(true)
.setMakeUppercase(true).setSkipUnassigned("Common"));
printStyles.put("Age", new PrintStyle().setNoLabel(true));
printStyles.put("Numeric_Type", new PrintStyle().setLongForm(true)
.setMakeFirstLetterLowercase(true).setSkipUnassigned("none"));
printStyles.put("General_Category", new PrintStyle().setNoLabel(true)
//.setSkipUnassigned(true)
);
printStyles.put("Line_Break", new PrintStyle().setSkipUnassigned("Unknown"));
printStyles.put("Joining_Type", new PrintStyle().setSkipValue("Non_Joining"));
printStyles.put("Joining_Group", new PrintStyle().setSkipValue("No_Joining_Group")
.setMakeUppercase(true));
printStyles.put("East_Asian_Width", new PrintStyle().setSkipUnassigned("Neutral"));
printStyles.put("Decomposition_Type", new PrintStyle().setLongForm(true)
.setSkipValue("None").setMakeFirstLetterLowercase(true));
printStyles.put("Bidi_Class", new PrintStyle().setSkipUnassigned("Left_To_Right"));
printStyles.put("Block", new PrintStyle().setNoLabel(true)
.setValueList(true));
printStyles.put("Age", new PrintStyle().setSkipValue("unassigned"));
printStyles.put("Canonical_Combining_Class", new PrintStyle().setSkipValue("0"));
printStyles.put("Hangul_Syllable_Type", new PrintStyle().setSkipValue("NA"));
}
//PropertyAliases
//PropertyValueAliases
//CompositionExclusions
//SpecialCasing
//NormalizationTest
//add("CaseFolding", new String[] {"CaseFolding"});
static Map contents = new TreeMap();
static void add(String name, String[] properties) {
contents.put(name, properties);
}
static {
add("Blocks", new String[] {"Block"});
add("DerivedAge", new String[] {"Age"});
add("Scripts", new String[] {"Script"});
add("HangulSyllableType", new String[] {"HangulSyllableType"});
if (false) add("DerivedNormalizationProps", new String[] {
"FNC", "Full_Composition_Exclusion",
"NFD_QuickCheck", "NFC_QuickCheck", "NFKD_QuickCheck", "NFKC_QuickCheck",
"Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC"});
add("DerivedBidiClass", new String[] {"BidiClass"});
add("DerivedBinaryProperties", new String[] {"BidiMirrored"});
add("DerivedCombiningClass", new String[] {"CanonicalCombiningClass"});
add("DerivedDecompositionType", new String[] {"DecompositionType"});
add("DerivedEastAsianWidth", new String[] {"EastAsianWidth"});
add("DerivedGeneralCategory", new String[] {"GeneralCategory"});
add("DerivedJoiningGroup", new String[] {"JoiningGroup"});
add("DerivedJoiningType", new String[] {"JoiningType"});
add("DerivedLineBreak", new String[] {"LineBreak"});
add("DerivedNumericType", new String[] {"NumericType"});
add("DerivedNumericValues", new String[] {"NumericValue"});
add("PropList", new String[] {
"White_Space", "Bidi_Control", "Join_Control",
"Dash", "Hyphen", "Quotation_Mark",
"Terminal_Punctuation", "Other_Math",
"Hex_Digit", "ASCII_Hex_Digit",
"Other_Alphabetic",
"Ideographic",
"Diacritic", "Extender",
"Other_Lowercase", "Other_Uppercase",
"Noncharacter_Code_Point",
"Other_Grapheme_Extend",
"Grapheme_Link",
"IDS_Binary_Operator", "IDS_Trinary_Operator",
"Radical", "Unified_Ideograph",
"Other_Default_Ignorable_Code_Point",
"Deprecated", "Soft_Dotted",
"Logical_Order_Exception",
"Other_ID_Start"
});
add("DerivedCoreProperties", new String[] {
"Math", "Alphabetic", "Lowercase", "Uppercase",
"ID_Start", "ID_Continue",
"XID_Start", "XID_Continue",
"Default_Ignorable_Code_Point",
"Grapheme_Extend", "Grapheme_Base"
});
}
public static void generateFile(String atOrAfter, String atOrBefore) throws IOException {
Iterator it = contents.keySet().iterator();
while (it.hasNext()) {
String propname = (String) it.next();
if (propname.compareTo(atOrAfter) < 0) continue;
if (propname.compareTo(atOrBefore) > 0) continue;
generateFile(propname);
}
}
public static void generateFile(String filename) throws IOException {
String[] propList = (String[]) contents.get(filename);
UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedDataTest/", filename);
PrintWriter pw = udf.out; // bf2.openUTF8Writer(UCD_Types.GEN_DIR, "Test" + filename + ".txt");
UnicodeProperty.Factory toolFactory
= ToolUnicodePropertySource.make(Default.ucdVersion());
BagFormatter bf2 = new BagFormatter(toolFactory);
UnicodeSet unassigned = toolFactory.getSet("gc=cn")
.addAll(toolFactory.getSet("gc=cs"));
//System.out.println(unassigned.toPattern(true));
// .removeAll(toolFactory.getSet("noncharactercodepoint=true"));
String separator = bf2.getLineSeparator()
+ "# ================================================"
+ bf2.getLineSeparator() + bf2.getLineSeparator();
for (int i = 0; i < propList.length; ++i) {
UnicodeProperty prop = toolFactory.getProperty(propList[i]);
System.out.println(prop.getName());
pw.print(separator);
PrintStyle ps = (PrintStyle) printStyles.get(prop.getName());
if (ps == null) {
ps = DEFAULT_PRINT_STYLE;
System.out.println("Using default style!");
}
if (ps.noLabel) bf2.setLabelSource(null);
if (ps.valueList) {
bf2.setValueSource(new UnicodeProperty.FilteredProperty(prop, new ReplaceFilter()))
.setNameSource(null)
.setShowCount(false)
.showSetNames(pw,new UnicodeSet(0,0x10FFFF));
} else if (prop.getType() <= prop.EXTENDED_BINARY) {
UnicodeSet s = prop.getSet("True");
bf2.setValueSource(prop.getName());
bf2.showSetNames(pw, s);
} else {
bf2.setValueSource(prop);
Collection aliases = prop.getAvailableValueAliases();
if (ps.orderByRangeStart) {
System.out.println("Reordering");
TreeSet temp2 = new TreeSet(new RangeStartComparator(prop));
temp2.addAll(aliases);
aliases = temp2;
}
Iterator it = aliases.iterator();
while (it.hasNext()) {
String value = (String)it.next();
UnicodeSet s = prop.getSet(value);
System.out.println(value + "\t" + prop.getShortestValueAlias(value) + "\t" + ps.skipValue);
System.out.println(s.toPattern(true));
if (skeletonComparator.compare(value, ps.skipValue) == 0) continue;
if (skeletonComparator.compare(value, ps.skipUnassigned) == 0) {
s.removeAll(unassigned);
}
if (s.size() == 0) continue;
//if (unassigned.containsAll(s)) continue; // skip if all unassigned
//if (s.contains(0xD0000)) continue; // skip unassigned
pw.print(separator);
if (!ps.longForm) value = prop.getShortestValueAlias(value);
if (ps.makeUppercase) value = value.toUpperCase(Locale.ENGLISH);
if (ps.makeFirstLetterLowercase) {
// NOTE: this is ok since we are only working in ASCII
value = value.substring(0,1).toLowerCase(Locale.ENGLISH)
+ value.substring(1);
}
bf2.setValueSource(value);
bf2.showSetNames(pw, s);
}
}
}
udf.close();
}
static class RangeStartComparator implements Comparator {
UnicodeProperty prop;
CompareProperties.UnicodeSetComparator comp = new CompareProperties.UnicodeSetComparator();
RangeStartComparator(UnicodeProperty prop) {
this.prop = prop;
}
public int compare(Object o1, Object o2) {
UnicodeSet s1 = prop.getSet((String)o1);
UnicodeSet s2 = prop.getSet((String)o2);
if (true) System.out.println("comparing " + o1 + ", " + o2
+ s1.toPattern(true) + "?" + s2.toPattern(true)
+ ", " + comp.compare(s1, s2));
return comp.compare(s1, s2);
}
}
public static class ReplaceFilter extends UnicodeProperty.StringFilter {
public String remap(String original) {
return original.replace('_',' ');
}
}
}