/** ******************************************************************************* * Copyright (C) 1996-2001, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $ * $Date: 2005/03/26 05:40:04 $ * $Revision: 1.17 $ * ******************************************************************************* */ package com.ibm.text.UCD; import java.util.*; import java.io.*; import com.ibm.icu.text.UTF16; import com.ibm.text.utility.*; public class GenerateCaseFolding implements UCD_Types { public static boolean DEBUG = false; public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting public static boolean NF_CLOSURE = false; // picks short value for SIMPLE if in FULL, changes weighting static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1 // PICK_SHORT & NF_CLOSURE = false for old style /*public static void main(String[] args) throws java.io.IOException { makeCaseFold(arg[0]); //getAge(); } */ static PrintWriter log; public static void makeCaseFold(boolean normalized) throws java.io.IOException { PICK_SHORT = NF_CLOSURE = normalized; log = Utility.openPrintWriter("CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX); System.out.println("Writing Log: " + "CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true)); System.out.println("Making Full Data"); Map fullData = getCaseFolding(true, NF_CLOSURE, ""); Utility.fixDot(); System.out.println("Making Simple Data"); Map simpleData = getCaseFolding(false, NF_CLOSURE, ""); // write the data System.out.println("Making Turkish Full Data"); Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr"); Utility.fixDot(); System.out.println("Making Simple Data"); Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr"); // write the data Utility.fixDot(); System.out.println("Writing"); String filename = "CaseFolding"; if (normalized) filename += "-Normalized"; String directory = "DerivedData/"; UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, filename); PrintWriter out = fc.out; /* PrintWriter out = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream(directory + fileRoot + GenerateData.getFileSuffix()), "UTF8"), 4*1024)); */ for (int ch = 0; ch <= 0x10FFFF; ++ch) { Utility.dot(ch); if (!charsUsed.get(ch)) continue; String rFull = (String)fullData.get(UTF32.valueOf32(ch)); String rSimple = (String)simpleData.get(UTF32.valueOf32(ch)); String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch)); String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch)); if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue; if (rFull != null && rFull.equals(rSimple) || (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) { String type = "C"; if (ch == 0x49) { drawLine(out, ch, "C", "i"); drawLine(out, ch, "T", "\u0131"); } else if (ch == 0x130) { drawLine(out, ch, "F", "i\u0307"); drawLine(out, ch, "T", "i"); } else if (ch == 0x131) { // do nothing //drawLine(out, ch, "I", "i"); } else { drawLine(out, ch, type, rFull); } } else { if (rFull != null) { drawLine(out, ch, "F", rFull); } if (rSimple != null) { drawLine(out, ch, "S", rSimple); } } if (rFullTurkish != null && !rFullTurkish.equals(rFull)) { drawLine(out, ch, "T", rFullTurkish); } if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) { drawLine(out, ch, "t", rSimpleTurkish); } } fc.close(); log.close(); } /* Goal is following (with no entries for 0131 or 0069) 0049; C; 0069; # LATIN CAPITAL LETTER I 0049; T; 0131; # LATIN CAPITAL LETTER I 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ static void drawLine(PrintWriter out, int ch, String type, String result) { String comment = ""; if (COMMENT_DIFFS) { String lower = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER); if (!lower.equals(result)) { String upper = Default.ucd().getCase(UTF16.valueOf(ch), FULL, UPPER); String lower2 = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER); if (lower.equals(lower2)) { comment = "[Diff " + Utility.hex(lower, " ") + "] "; } else { Utility.fixDot(); System.out.println("PROBLEM WITH: " + Default.ucd().getCodeAndName(ch)); comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] "; } } } out.println(Utility.hex(ch) + "; " + type + "; " + Utility.hex(result, " ") + "; # " + comment + Default.ucd().getName(ch)); } static int probeCh = 0x01f0; static String shower = UTF16.valueOf(probeCh); static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException { Map data = new TreeMap(); Map repChar = new TreeMap(); //String option = ""; // get the equivalence classes for (int ch = 0; ch <= 0x10FFFF; ++ch) { Utility.dot(ch); //if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch)); if (!Default.ucd().isRepresented(ch)) continue; getClosure(ch, data, full, nfClose, condition); } // get the representative characters Iterator it = data.keySet().iterator(); while (it.hasNext()) { String s = (String) it.next(); Set set = (Set) data.get(s); show = set.contains(shower); if (show) { Utility.fixDot(); System.out.println(toString(set)); } // Pick the best available representative String rep = null; int repGood = 0; String dup = null; Iterator it2 = set.iterator(); while (it2.hasNext()) { String s2 = (String)it2.next(); int s2Good = goodness(s2, full, condition); if (s2Good > repGood) { rep = s2; repGood = s2Good; dup = null; } else if (s2Good == repGood) { dup = s2; } } if (rep == null) { Utility.fixDot(); System.err.println("No representative for: " + toString(set)); } else if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) { String message = ""; if ((repGood & NFC_FORMAT) == 0) { message += " [NOT NFC FORMAT]"; } if ((repGood & ISLOWER) == 0) { message += " [NOT LOWERCASE]"; } Utility.fixDot(); log.println("Non-Optimal Representative " + message); log.println(" Rep:\t" + Default.ucd().getCodeAndName(rep)); log.println(" Set:\t" + toString(set,true, true)); } log.println(); log.println(); log.println(rep + "\t#" + Default.ucd().getName(rep)); // Add it for all the elements of the set it2 = set.iterator(); while (it2.hasNext()) { String s2 = (String)it2.next(); if (s2.equals(rep)) continue; log.println(s2 + "\t#" + Default.ucd().getName(s2)); if (UTF16.countCodePoint(s2) == 1) { repChar.put(UTF32.getCodePointSubstring(s2,0), rep); charsUsed.set(UTF16.charAt(s2, 0)); } } } return repChar; } static BitSet charsUsed = new BitSet(); static boolean show = false; static final int NFC_FORMAT = 64; static final int ISLOWER = 128; static int goodness(String s, boolean full, String condition) { if (s == null) return 0; int result = 32-s.length(); if (!PICK_SHORT) { result = s.length(); } if (!full) result <<= 8; String low = lower(upper(s, full, condition), full, condition); if (s.equals(low)) result |= ISLOWER; else if (PICK_SHORT && Default.nfd().normalize(s).equals(Default.nfd().normalize(low))) result |= ISLOWER; if (s.equals(Default.nfc().normalize(s))) result |= NFC_FORMAT; if (show) { Utility.fixDot(); System.out.println(Utility.hex(result) + ", " + Default.ucd().getCodeAndName(s)); } return result; } /* static HashSet temp = new HashSet(); static void normalize(HashSet set) { temp.clear(); temp.addAll(set); set.clear(); Iterator it = temp.iterator(); while (it.hasNext()) { String s = (String) it.next(); String s2 = KC.normalize(s); set.add(s); data2.put(s,set); if (!s.equals(s2)) { set.add(s2); data2.put(s2,set); System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2)); } } } */ /* String String lower1 = Default.ucd.getLowercase(ch); String lower2 = Default.ucd.toLowercase(ch,option); char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0); //String lower1 = String.valueOf(Default.ucd.getLowercase(ch)); //String lower = Default.ucd.toLowercase(ch2,option); String upper = Default.ucd.toUppercase(ch2,option); String lowerUpper = Default.ucd.toLowercase(upper,option); //String title = Default.ucd.toTitlecase(ch2,option); //String lowerTitle = Default.ucd.toLowercase(upper,option); if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { // output.println(Utility.hex(ch) + "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E") + "; " + Utility.hex(lowerUpper," ") + ";\t#" + Default.ucd.getName(ch) ); //if (!lowerUpper.equals(lower)) { // output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower)); //} //if (!lowerUpper.equals(lowerTitle)) { // output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle)); //} } */ static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) { String charStr = UTF32.valueOf32(ch); String lowerStr = lower(charStr, full, condition); String titleStr = title(charStr, full, condition); String upperStr = upper(charStr, full, condition); if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return; if (DEBUG) System.err.println("Closure for " + Utility.hex(ch)); // make new set Set set = new TreeSet(); set.add(charStr); data.put(charStr, set); // add cases to get started add(set, lowerStr, data); add(set, upperStr, data); add(set, titleStr, data); // close it main: while (true) { Iterator it = set.iterator(); while (it.hasNext()) { String s = (String) it.next(); // do funny stuff since we can't modify set while iterating // We don't do this because if the source is not normalized, we don't want to normalize if (nfClose) { if (add(set, Default.nfd().normalize(s), data)) continue main; if (add(set, Default.nfc().normalize(s), data)) continue main; if (add(set, Default.nfkd().normalize(s), data)) continue main; if (add(set, Default.nfkc().normalize(s), data)) continue main; } if (add(set, lower(s, full, condition), data)) continue main; if (add(set, title(s, full, condition), data)) continue main; if (add(set, upper(s, full, condition), data)) continue main; } break; } } static String lower(String s, boolean full, String condition) { String result = lower2(s,full, condition); return result.replace('\u03C2', '\u03C3'); // HACK for lower } // These functions are no longer necessary, since Default.ucd is parameterized, // but it's not worth changing static String lower2(String s, boolean full, String condition) { /*if (!full) { if (s.length() != 1) return s; return Default.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER); } */ return Default.ucd().getCase(s, full ? FULL : SIMPLE, LOWER, condition); } static String upper(String s, boolean full, String condition) { /* if (!full) { if (s.length() != 1) return s; return Default.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER); } */ return Default.ucd().getCase(s, full ? FULL : SIMPLE, UPPER, condition); } static String title(String s, boolean full, String condition) { /*if (!full) { if (s.length() != 1) return s; return Default.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE); } */ return Default.ucd().getCase(s, full ? FULL : SIMPLE, TITLE, condition); } static boolean add(Set set, String s, Map data) { if (set.contains(s)) return false; set.add(s); if (DEBUG) System.err.println("adding: " + toString(set)); Set other = (Set) data.get(s); if (other != null && other != set) { // merge // make all the items in set point to merged set Iterator it = other.iterator(); while (it.hasNext()) { data.put(it.next(), set); } set.addAll(other); } if (DEBUG) System.err.println("done adding: " + toString(set)); return true; } static String toString(Set set) { return toString(set, false, false); } static String toString(Set set, boolean name, boolean crtab) { String result = "{"; Iterator it2 = set.iterator(); boolean first = true; while (it2.hasNext()) { String s2 = (String) it2.next(); if (!first) { if (crtab) { result += ";\r\n\t"; } else { result += "; "; } } first = false; if (name) { result += Default.ucd().getCodeAndName(s2); } else { result += Utility.hex(s2, " "); } } return result + "}"; } static boolean specialNormalizationDiffers(int ch) { if (ch == 0x00DF) return true; // es-zed return !Default.nfkd().isNormalized(ch); } static String specialNormalization(String s) { if (s.equals("\u00DF")) return "ss"; return Default.nfkd().normalize(s); } static boolean isExcluded(int ch) { // if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A.. if (0x20A8 <= ch && ch <= 0x217B) return true; // skip Rupee.. byte type = Default.ucd().getDecompositionType(ch); if (type == COMPAT_SQUARE) return true; //if (type == COMPAT_UNSPECIFIED) return true; return false; } static void generateSpecialCasing(boolean normalize) throws IOException { Map sorted = new TreeMap(); String suffix2 = ""; if (normalize) suffix2 = "-Normalized"; PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions" + suffix2 + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX); for (int ch = 0; ch <= 0x10FFFF; ++ch) { Utility.dot(ch); if (!Default.ucd().isRepresented(ch)) continue; if (!specialNormalizationDiffers(ch)) continue; String lower = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, LOWER)); String upper = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, UPPER)); String title = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, TITLE)); String chstr = UTF16.valueOf(ch); String decomp = specialNormalization(chstr); String flower = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, LOWER)); String fupper = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, UPPER)); String ftitle = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, TITLE)); String base = decomp; String blower = specialNormalization(lower); String bupper = specialNormalization(upper); String btitle = specialNormalization(title); if (true) { flower = Default.nfc().normalize(flower); fupper = Default.nfc().normalize(fupper); ftitle = Default.nfc().normalize(ftitle); base = Default.nfc().normalize(base); blower = Default.nfc().normalize(blower); bupper = Default.nfc().normalize(bupper); btitle = Default.nfc().normalize(btitle); } if (ch == CHECK_CHAR) { System.out.println("Code: " + Default.ucd().getCodeAndName(ch)); System.out.println("Decomp: " + Default.ucd().getCodeAndName(decomp)); System.out.println("Base: " + Default.ucd().getCodeAndName(base)); System.out.println("SLower: " + Default.ucd().getCodeAndName(lower)); System.out.println("FLower: " + Default.ucd().getCodeAndName(flower)); System.out.println("BLower: " + Default.ucd().getCodeAndName(blower)); System.out.println("STitle: " + Default.ucd().getCodeAndName(title)); System.out.println("FTitle: " + Default.ucd().getCodeAndName(ftitle)); System.out.println("BTitle: " + Default.ucd().getCodeAndName(btitle)); System.out.println("SUpper: " + Default.ucd().getCodeAndName(upper)); System.out.println("FUpper: " + Default.ucd().getCodeAndName(fupper)); System.out.println("BUpper: " + Default.ucd().getCodeAndName(bupper)); } // presumably if there is a single code point, it would already be in the simple mappings if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1 && UTF16.countCodePoint(title) == 1) { if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Default.ucd().getCodeAndName(ch)); continue; } // if there is no change from the base, skip if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) { if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Default.ucd().getCodeAndName(ch)); continue; } // fix special cases // if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue; if (flower.equals(blower)) flower = lower; if (fupper.equals(bupper)) fupper = upper; if (ftitle.equals(btitle)) ftitle = title; // if there are no changes from the original, or the expanded original, skip if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) { if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Default.ucd().getCodeAndName(ch)); continue; } String name = Default.ucd().getName(ch); int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1 : ch == 0x130 ? 2 : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4 : name.indexOf("LIGATURE") >= 0 ? 3 : name.indexOf("GEGRAMMENI") < 0 ? 5 : UTF16.countCodePoint(ftitle) == 1 ? 6 : UTF16.countCodePoint(fupper) == 2 ? 7 : 8; if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch)); // HACK boolean denormalize = !normalize && order != 6 && order != 7; String mapping = Utility.hex(ch) + "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd().normalize(flower) : flower) + "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd().normalize(ftitle) : ftitle) + "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd().normalize(fupper) : fupper) + "; # " + Default.ucd().getName(ch); // special exclusions if (isExcluded(ch)) { log.println("# " + mapping); } else { int x = ch; if (ch == 0x01F0) x = 0x03B1; // HACK to reorder the same sorted.put(new Integer((order << 24) | x), mapping); } } log.close(); System.out.println("Writing"); //String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true); //PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2); PrintWriter out = udf.out; /* String[] batName = {""}; String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName); out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false)); out.println(UnicodeDataFile.generateDateLine()); out.println("#"); Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out); */ Iterator it = sorted.keySet().iterator(); int lastOrder = -1; while (it.hasNext()) { Integer key = (Integer) it.next(); String line = (String) sorted.get(key); int order = key.intValue() >> 24; if (order != lastOrder) { lastOrder = order; out.println(); boolean skipLine = false; switch(order) { case 1: out.println("# The German es-zed is special--the normal mapping is to SS."); out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase())"); break; case 2: out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below."); break; case 3: out.println("# Ligatures"); break; case 4: skipLine = true; break; case 5: out.println("# No corresponding uppercase precomposed character"); break; case 6: Utility.appendFile("SpecialCasingIota.txt", Utility.UTF8, out); break; case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break; case 8: skipLine = true; break; } if (!skipLine) out.println(); } out.println(line); } //Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out); udf.close(); //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } }