459c96f0b1
X-SVN-Rev: 14494
280 lines
12 KiB
Java
280 lines
12 KiB
Java
package com.ibm.text.UCD;
|
|
|
|
import java.io.IOException;
|
|
import java.io.PrintWriter;
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.Comparator;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Locale;
|
|
import java.util.Map;
|
|
import java.util.TreeMap;
|
|
import java.util.TreeSet;
|
|
|
|
import com.ibm.icu.dev.test.util.BagFormatter;
|
|
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
|
import com.ibm.icu.text.UnicodeSet;
|
|
import com.ibm.text.utility.UnicodeDataFile;
|
|
|
|
public class MakeUnicodeFiles {
|
|
|
|
static boolean DEBUG = true;
|
|
|
|
public static void main() throws IOException {
|
|
generateFile("Scripts","z");
|
|
}
|
|
|
|
static class OrderedMap {
|
|
HashMap map = new HashMap();
|
|
ArrayList keys = new ArrayList();
|
|
void put(Object o, Object t) {
|
|
map.put(o,t);
|
|
keys.add(o);
|
|
}
|
|
List keyset() {
|
|
return keys;
|
|
}
|
|
}
|
|
|
|
static class PrintStyle {
|
|
boolean longForm = false;
|
|
boolean noLabel = false;
|
|
boolean makeUppercase = false;
|
|
boolean makeFirstLetterLowercase = false;
|
|
String skipValue = null;
|
|
String skipUnassigned = null;
|
|
boolean orderByRangeStart = false;
|
|
boolean valueList = false;
|
|
|
|
PrintStyle setLongForm(boolean value) {
|
|
longForm = value;
|
|
return this;
|
|
}
|
|
PrintStyle setSkipUnassigned(String value) {
|
|
skipUnassigned = value;
|
|
return this;
|
|
}
|
|
PrintStyle setNoLabel(boolean value) {
|
|
noLabel = value;
|
|
return this;
|
|
}
|
|
PrintStyle setMakeUppercase(boolean value) {
|
|
makeUppercase = value;
|
|
return this;
|
|
}
|
|
PrintStyle setMakeFirstLetterLowercase(boolean value) {
|
|
makeFirstLetterLowercase = value;
|
|
return this;
|
|
}
|
|
PrintStyle setSkipValue(String value) {
|
|
skipValue = value;
|
|
return this;
|
|
}
|
|
PrintStyle setOrderByRangeStart(boolean value) {
|
|
orderByRangeStart = value;
|
|
return this;
|
|
}
|
|
PrintStyle setValueList(boolean value) {
|
|
valueList = value;
|
|
return this;
|
|
}
|
|
}
|
|
static PrintStyle DEFAULT_PRINT_STYLE = new PrintStyle();
|
|
static Comparator skeletonComparator = new UnicodeProperty.SkeletonComparator();
|
|
static Map printStyles = new TreeMap(/*skeletonComparator*/);
|
|
static {
|
|
printStyles.put("Script", new PrintStyle().setLongForm(true)
|
|
.setMakeUppercase(true).setSkipUnassigned("Common"));
|
|
printStyles.put("Age", new PrintStyle().setNoLabel(true));
|
|
printStyles.put("Numeric_Type", new PrintStyle().setLongForm(true)
|
|
.setMakeFirstLetterLowercase(true).setSkipUnassigned("none"));
|
|
printStyles.put("General_Category", new PrintStyle().setNoLabel(true)
|
|
//.setSkipUnassigned(true)
|
|
);
|
|
printStyles.put("Line_Break", new PrintStyle().setSkipUnassigned("Unknown"));
|
|
printStyles.put("Joining_Type", new PrintStyle().setSkipValue("Non_Joining"));
|
|
printStyles.put("Joining_Group", new PrintStyle().setSkipValue("No_Joining_Group")
|
|
.setMakeUppercase(true));
|
|
printStyles.put("East_Asian_Width", new PrintStyle().setSkipUnassigned("Neutral"));
|
|
printStyles.put("Decomposition_Type", new PrintStyle().setLongForm(true)
|
|
.setSkipValue("None").setMakeFirstLetterLowercase(true));
|
|
printStyles.put("Bidi_Class", new PrintStyle().setSkipUnassigned("Left_To_Right"));
|
|
printStyles.put("Block", new PrintStyle().setNoLabel(true)
|
|
.setValueList(true));
|
|
printStyles.put("Age", new PrintStyle().setSkipValue("unassigned"));
|
|
printStyles.put("Canonical_Combining_Class", new PrintStyle().setSkipValue("0"));
|
|
printStyles.put("Hangul_Syllable_Type", new PrintStyle().setSkipValue("NA"));
|
|
|
|
}
|
|
//PropertyAliases
|
|
//PropertyValueAliases
|
|
//CompositionExclusions
|
|
//SpecialCasing
|
|
//NormalizationTest
|
|
//add("CaseFolding", new String[] {"CaseFolding"});
|
|
static Map contents = new TreeMap();
|
|
static void add(String name, String[] properties) {
|
|
contents.put(name, properties);
|
|
}
|
|
static {
|
|
add("Blocks", new String[] {"Block"});
|
|
add("DerivedAge", new String[] {"Age"});
|
|
add("Scripts", new String[] {"Script"});
|
|
add("HangulSyllableType", new String[] {"HangulSyllableType"});
|
|
if (false) add("DerivedNormalizationProps", new String[] {
|
|
"FNC", "Full_Composition_Exclusion",
|
|
"NFD_QuickCheck", "NFC_QuickCheck", "NFKD_QuickCheck", "NFKC_QuickCheck",
|
|
"Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC"});
|
|
|
|
add("DerivedBidiClass", new String[] {"BidiClass"});
|
|
add("DerivedBinaryProperties", new String[] {"BidiMirrored"});
|
|
add("DerivedCombiningClass", new String[] {"CanonicalCombiningClass"});
|
|
add("DerivedDecompositionType", new String[] {"DecompositionType"});
|
|
add("DerivedEastAsianWidth", new String[] {"EastAsianWidth"});
|
|
add("DerivedGeneralCategory", new String[] {"GeneralCategory"});
|
|
add("DerivedJoiningGroup", new String[] {"JoiningGroup"});
|
|
add("DerivedJoiningType", new String[] {"JoiningType"});
|
|
add("DerivedLineBreak", new String[] {"LineBreak"});
|
|
add("DerivedNumericType", new String[] {"NumericType"});
|
|
add("DerivedNumericValues", new String[] {"NumericValue"});
|
|
add("PropList", new String[] {
|
|
"White_Space", "Bidi_Control", "Join_Control",
|
|
"Dash", "Hyphen", "Quotation_Mark",
|
|
"Terminal_Punctuation", "Other_Math",
|
|
"Hex_Digit", "ASCII_Hex_Digit",
|
|
"Other_Alphabetic",
|
|
"Ideographic",
|
|
"Diacritic", "Extender",
|
|
"Other_Lowercase", "Other_Uppercase",
|
|
"Noncharacter_Code_Point",
|
|
"Other_Grapheme_Extend",
|
|
"Grapheme_Link",
|
|
"IDS_Binary_Operator", "IDS_Trinary_Operator",
|
|
"Radical", "Unified_Ideograph",
|
|
"Other_Default_Ignorable_Code_Point",
|
|
"Deprecated", "Soft_Dotted",
|
|
"Logical_Order_Exception",
|
|
"Other_ID_Start"
|
|
});
|
|
add("DerivedCoreProperties", new String[] {
|
|
"Math", "Alphabetic", "Lowercase", "Uppercase",
|
|
"ID_Start", "ID_Continue",
|
|
"XID_Start", "XID_Continue",
|
|
"Default_Ignorable_Code_Point",
|
|
"Grapheme_Extend", "Grapheme_Base"
|
|
});
|
|
}
|
|
|
|
public static void generateFile(String atOrAfter, String atOrBefore) throws IOException {
|
|
Iterator it = contents.keySet().iterator();
|
|
while (it.hasNext()) {
|
|
String propname = (String) it.next();
|
|
if (propname.compareTo(atOrAfter) < 0) continue;
|
|
if (propname.compareTo(atOrBefore) > 0) continue;
|
|
generateFile(propname);
|
|
}
|
|
}
|
|
|
|
public static void generateFile(String filename) throws IOException {
|
|
String[] propList = (String[]) contents.get(filename);
|
|
UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedDataTest/", filename);
|
|
PrintWriter pw = udf.out; // bf2.openUTF8Writer(UCD_Types.GEN_DIR, "Test" + filename + ".txt");
|
|
UnicodeProperty.Factory toolFactory
|
|
= ToolUnicodePropertySource.make(Default.ucdVersion());
|
|
BagFormatter bf2 = new BagFormatter(toolFactory);
|
|
UnicodeSet unassigned = toolFactory.getSet("gc=cn")
|
|
.addAll(toolFactory.getSet("gc=cs"));
|
|
//System.out.println(unassigned.toPattern(true));
|
|
// .removeAll(toolFactory.getSet("noncharactercodepoint=true"));
|
|
String separator = bf2.getLineSeparator()
|
|
+ "# ================================================"
|
|
+ bf2.getLineSeparator() + bf2.getLineSeparator();
|
|
|
|
for (int i = 0; i < propList.length; ++i) {
|
|
UnicodeProperty prop = toolFactory.getProperty(propList[i]);
|
|
System.out.println(prop.getName());
|
|
pw.print(separator);
|
|
PrintStyle ps = (PrintStyle) printStyles.get(prop.getName());
|
|
if (ps == null) {
|
|
ps = DEFAULT_PRINT_STYLE;
|
|
System.out.println("Using default style!");
|
|
}
|
|
if (ps.noLabel) bf2.setLabelSource(null);
|
|
|
|
if (ps.valueList) {
|
|
bf2.setValueSource(new UnicodeProperty.FilteredProperty(prop, new ReplaceFilter()))
|
|
.setNameSource(null)
|
|
.setShowCount(false)
|
|
.showSetNames(pw,new UnicodeSet(0,0x10FFFF));
|
|
} else if (prop.getType() <= prop.EXTENDED_BINARY) {
|
|
UnicodeSet s = prop.getSet("True");
|
|
bf2.setValueSource(prop.getName());
|
|
bf2.showSetNames(pw, s);
|
|
} else {
|
|
bf2.setValueSource(prop);
|
|
Collection aliases = prop.getAvailableValueAliases();
|
|
if (ps.orderByRangeStart) {
|
|
System.out.println("Reordering");
|
|
TreeSet temp2 = new TreeSet(new RangeStartComparator(prop));
|
|
temp2.addAll(aliases);
|
|
aliases = temp2;
|
|
}
|
|
Iterator it = aliases.iterator();
|
|
while (it.hasNext()) {
|
|
String value = (String)it.next();
|
|
UnicodeSet s = prop.getSet(value);
|
|
|
|
System.out.println(value + "\t" + prop.getShortestValueAlias(value) + "\t" + ps.skipValue);
|
|
System.out.println(s.toPattern(true));
|
|
|
|
if (skeletonComparator.compare(value, ps.skipValue) == 0) continue;
|
|
if (skeletonComparator.compare(value, ps.skipUnassigned) == 0) {
|
|
s.removeAll(unassigned);
|
|
}
|
|
|
|
if (s.size() == 0) continue;
|
|
//if (unassigned.containsAll(s)) continue; // skip if all unassigned
|
|
//if (s.contains(0xD0000)) continue; // skip unassigned
|
|
pw.print(separator);
|
|
if (!ps.longForm) value = prop.getShortestValueAlias(value);
|
|
if (ps.makeUppercase) value = value.toUpperCase(Locale.ENGLISH);
|
|
if (ps.makeFirstLetterLowercase) {
|
|
// NOTE: this is ok since we are only working in ASCII
|
|
value = value.substring(0,1).toLowerCase(Locale.ENGLISH)
|
|
+ value.substring(1);
|
|
}
|
|
bf2.setValueSource(value);
|
|
bf2.showSetNames(pw, s);
|
|
}
|
|
}
|
|
}
|
|
udf.close();
|
|
}
|
|
static class RangeStartComparator implements Comparator {
|
|
UnicodeProperty prop;
|
|
CompareProperties.UnicodeSetComparator comp = new CompareProperties.UnicodeSetComparator();
|
|
RangeStartComparator(UnicodeProperty prop) {
|
|
this.prop = prop;
|
|
}
|
|
public int compare(Object o1, Object o2) {
|
|
UnicodeSet s1 = prop.getSet((String)o1);
|
|
UnicodeSet s2 = prop.getSet((String)o2);
|
|
if (true) System.out.println("comparing " + o1 + ", " + o2
|
|
+ s1.toPattern(true) + "?" + s2.toPattern(true)
|
|
+ ", " + comp.compare(s1, s2));
|
|
return comp.compare(s1, s2);
|
|
}
|
|
|
|
}
|
|
|
|
public static class ReplaceFilter extends UnicodeProperty.StringFilter {
|
|
public String remap(String original) {
|
|
return original.replace('_',' ');
|
|
}
|
|
}
|
|
|
|
|
|
|
|
} |