2001-08-31 00:30:17 +00:00
|
|
|
/**
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
|
|
|
* others. All Rights Reserved. *
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
2004-02-06 18:32:05 +00:00
|
|
|
* $Date: 2004/02/06 18:30:22 $
|
|
|
|
* $Revision: 1.14 $
|
2001-08-31 00:30:17 +00:00
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
package com.ibm.text.UCD;
|
|
|
|
|
|
|
|
import java.util.*;
|
|
|
|
import java.io.*;
|
2002-03-15 01:57:01 +00:00
|
|
|
import com.ibm.icu.text.UTF16;
|
2001-08-30 20:50:18 +00:00
|
|
|
|
|
|
|
import com.ibm.text.utility.*;
|
|
|
|
|
|
|
|
public class GenerateCaseFolding implements UCD_Types {
|
|
|
|
public static boolean DEBUG = false;
|
2001-12-13 23:36:29 +00:00
|
|
|
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
|
|
|
|
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
|
|
|
|
public static boolean NF_CLOSURE = false; // ON to also add the NFD/NFC/NFKD/NFKC forms of each string when building case closures (passed to getCaseFolding as nfClose)
|
2002-03-20 00:21:43 +00:00
|
|
|
static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1
|
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
// PICK_SHORT & NF_CLOSURE = false for old style
|
|
|
|
|
|
|
|
|
|
|
|
/*public static void main(String[] args) throws java.io.IOException {
|
|
|
|
makeCaseFold(arg[0]);
|
2001-08-30 20:50:18 +00:00
|
|
|
//getAge();
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
// Log writer: opened and closed by makeCaseFold, written to by getCaseFolding. Null until makeCaseFold assigns it.
static PrintWriter log;
|
|
|
|
|
|
|
|
public static void makeCaseFold(boolean normalized) throws java.io.IOException {
|
|
|
|
PICK_SHORT = NF_CLOSURE = normalized;
|
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
Default.setUCD();
|
2002-07-30 09:57:18 +00:00
|
|
|
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
|
2001-12-13 23:36:29 +00:00
|
|
|
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
|
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
System.out.println("Making Full Data");
|
2003-02-25 23:38:23 +00:00
|
|
|
Map fullData = getCaseFolding(true, NF_CLOSURE, "");
|
2001-12-13 23:36:29 +00:00
|
|
|
Utility.fixDot();
|
2003-02-25 23:38:23 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
System.out.println("Making Simple Data");
|
2003-02-25 23:38:23 +00:00
|
|
|
Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
|
|
|
|
// write the data
|
|
|
|
|
|
|
|
System.out.println("Making Turkish Full Data");
|
|
|
|
Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
|
|
|
|
Utility.fixDot();
|
|
|
|
|
|
|
|
System.out.println("Making Simple Data");
|
|
|
|
Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");
|
2001-08-30 20:50:18 +00:00
|
|
|
// write the data
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
Utility.fixDot();
|
2001-08-30 20:50:18 +00:00
|
|
|
System.out.println("Writing");
|
2001-12-13 23:36:29 +00:00
|
|
|
String filename = "CaseFolding";
|
|
|
|
if (normalized) filename += "-Normalized";
|
|
|
|
String directory = "DerivedData/";
|
2002-03-15 00:34:46 +00:00
|
|
|
String newFile = directory + filename + GenerateData.getFileSuffix(true);
|
2002-07-30 09:57:18 +00:00
|
|
|
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
2003-02-25 23:38:23 +00:00
|
|
|
String[] batName = {""};
|
|
|
|
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName);
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
|
2002-03-15 00:34:46 +00:00
|
|
|
out.println(GenerateData.generateDateLine());
|
2001-12-13 23:36:29 +00:00
|
|
|
out.println("#");
|
2002-10-05 01:28:58 +00:00
|
|
|
Utility.appendFile("CaseFoldingHeader.txt", Utility.LATIN1, out);
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
/*
|
2001-08-30 20:50:18 +00:00
|
|
|
PrintWriter out = new PrintWriter(
|
|
|
|
new BufferedWriter(
|
|
|
|
new OutputStreamWriter(
|
2001-12-13 23:36:29 +00:00
|
|
|
new FileOutputStream(directory + fileRoot + GenerateData.getFileSuffix()),
|
2001-08-30 20:50:18 +00:00
|
|
|
"UTF8"),
|
|
|
|
4*1024));
|
2001-12-13 23:36:29 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
|
|
|
Utility.dot(ch);
|
2001-08-30 20:50:18 +00:00
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
if (!charsUsed.get(ch)) continue;
|
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
|
|
|
|
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
|
2003-02-25 23:38:23 +00:00
|
|
|
String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch));
|
|
|
|
String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch));
|
|
|
|
if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;
|
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
if (rFull != null && rFull.equals(rSimple)
|
|
|
|
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
|
2001-08-30 20:50:18 +00:00
|
|
|
String type = "C";
|
2002-03-22 22:08:53 +00:00
|
|
|
if (ch == 0x49) {
|
|
|
|
drawLine(out, ch, "C", "i");
|
|
|
|
drawLine(out, ch, "T", "\u0131");
|
|
|
|
} else if (ch == 0x130) {
|
2002-03-20 00:21:43 +00:00
|
|
|
drawLine(out, ch, "F", "i\u0307");
|
2002-03-22 22:08:53 +00:00
|
|
|
drawLine(out, ch, "T", "i");
|
2002-03-20 00:21:43 +00:00
|
|
|
} else if (ch == 0x131) {
|
2002-03-22 22:08:53 +00:00
|
|
|
// do nothing
|
|
|
|
//drawLine(out, ch, "I", "i");
|
2002-03-20 00:21:43 +00:00
|
|
|
} else {
|
|
|
|
drawLine(out, ch, type, rFull);
|
|
|
|
}
|
2001-08-30 20:50:18 +00:00
|
|
|
} else {
|
|
|
|
if (rFull != null) {
|
|
|
|
drawLine(out, ch, "F", rFull);
|
|
|
|
}
|
|
|
|
if (rSimple != null) {
|
|
|
|
drawLine(out, ch, "S", rSimple);
|
|
|
|
}
|
|
|
|
}
|
2003-02-25 23:38:23 +00:00
|
|
|
if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
|
|
|
|
drawLine(out, ch, "T", rFullTurkish);
|
|
|
|
}
|
|
|
|
if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
|
|
|
|
drawLine(out, ch, "t", rSimpleTurkish);
|
|
|
|
}
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
|
|
|
out.close();
|
2001-12-13 23:36:29 +00:00
|
|
|
log.close();
|
2003-02-25 23:38:23 +00:00
|
|
|
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2002-03-22 22:08:53 +00:00
|
|
|
|
|
|
|
/* Goal is following (with no entries for 0131 or 0069)
|
|
|
|
|
|
|
|
0049; C; 0069; # LATIN CAPITAL LETTER I
|
|
|
|
0049; T; 0131; # LATIN CAPITAL LETTER I
|
|
|
|
|
|
|
|
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
|
|
|
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
|
|
|
*/
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
static void drawLine(PrintWriter out, int ch, String type, String result) {
|
2001-12-13 23:36:29 +00:00
|
|
|
String comment = "";
|
|
|
|
if (COMMENT_DIFFS) {
|
2002-04-23 01:59:16 +00:00
|
|
|
String lower = Default.ucd.getCase(UTF16.valueOf(ch), FULL, LOWER);
|
2001-12-13 23:36:29 +00:00
|
|
|
if (!lower.equals(result)) {
|
2002-04-23 01:59:16 +00:00
|
|
|
String upper = Default.ucd.getCase(UTF16.valueOf(ch), FULL, UPPER);
|
|
|
|
String lower2 = Default.ucd.getCase(UTF16.valueOf(ch), FULL, LOWER);
|
2001-12-13 23:36:29 +00:00
|
|
|
if (lower.equals(lower2)) {
|
|
|
|
comment = "[Diff " + Utility.hex(lower, " ") + "] ";
|
|
|
|
} else {
|
|
|
|
Utility.fixDot();
|
2002-04-23 01:59:16 +00:00
|
|
|
System.out.println("PROBLEM WITH: " + Default.ucd.getCodeAndName(ch));
|
2001-12-13 23:36:29 +00:00
|
|
|
comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] ";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-08-31 00:30:17 +00:00
|
|
|
out.println(Utility.hex(ch)
|
2001-12-13 23:36:29 +00:00
|
|
|
+ "; " + type
|
|
|
|
+ "; " + Utility.hex(result, " ")
|
2002-04-23 01:59:16 +00:00
|
|
|
+ "; # " + comment + Default.ucd.getName(ch));
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
// Debugging probe: the code point whose string form (shower) is looked for in each equivalence set.
static int probeCh = 0x01f0;
|
|
|
|
// String form of probeCh; getCaseFolding turns on 'show' tracing for any set that contains it.
static String shower = UTF16.valueOf(probeCh);
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
|
2001-08-30 20:50:18 +00:00
|
|
|
Map data = new TreeMap();
|
|
|
|
Map repChar = new TreeMap();
|
|
|
|
//String option = "";
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
// get the equivalence classes
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
|
|
|
Utility.dot(ch);
|
|
|
|
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
2002-04-23 01:59:16 +00:00
|
|
|
if (!Default.ucd.isRepresented(ch)) continue;
|
2003-02-25 23:38:23 +00:00
|
|
|
getClosure(ch, data, full, nfClose, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
// get the representative characters
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
Iterator it = data.keySet().iterator();
|
|
|
|
while (it.hasNext()) {
|
|
|
|
String s = (String) it.next();
|
|
|
|
Set set = (Set) data.get(s);
|
2001-12-13 23:36:29 +00:00
|
|
|
show = set.contains(shower);
|
|
|
|
if (show) {
|
|
|
|
Utility.fixDot();
|
|
|
|
System.out.println(toString(set));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pick the best available representative
|
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
String rep = null;
|
|
|
|
int repGood = 0;
|
|
|
|
String dup = null;
|
|
|
|
Iterator it2 = set.iterator();
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
String s2 = (String)it2.next();
|
2003-02-25 23:38:23 +00:00
|
|
|
int s2Good = goodness(s2, full, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
if (s2Good > repGood) {
|
|
|
|
rep = s2;
|
|
|
|
repGood = s2Good;
|
|
|
|
dup = null;
|
|
|
|
} else if (s2Good == repGood) {
|
|
|
|
dup = s2;
|
|
|
|
}
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
if (rep == null) {
|
|
|
|
Utility.fixDot();
|
|
|
|
System.err.println("No representative for: " + toString(set));
|
|
|
|
} else if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) {
|
|
|
|
String message = "";
|
|
|
|
if ((repGood & NFC_FORMAT) == 0) {
|
|
|
|
message += " [NOT NFC FORMAT]";
|
|
|
|
}
|
|
|
|
if ((repGood & ISLOWER) == 0) {
|
|
|
|
message += " [NOT LOWERCASE]";
|
|
|
|
}
|
|
|
|
Utility.fixDot();
|
|
|
|
log.println("Non-Optimal Representative " + message);
|
2002-04-23 01:59:16 +00:00
|
|
|
log.println(" Rep:\t" + Default.ucd.getCodeAndName(rep));
|
2001-12-13 23:36:29 +00:00
|
|
|
log.println(" Set:\t" + toString(set,true, true));
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
log.println();
|
|
|
|
log.println();
|
|
|
|
log.println(rep + "\t#" + Default.ucd.getName(rep));
|
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
// Add it for all the elements of the set
|
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
it2 = set.iterator();
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
String s2 = (String)it2.next();
|
2003-02-25 23:38:23 +00:00
|
|
|
if (s2.equals(rep)) continue;
|
|
|
|
|
|
|
|
log.println(s2 + "\t#" + Default.ucd.getName(s2));
|
|
|
|
|
|
|
|
if (UTF16.countCodePoint(s2) == 1) {
|
2001-12-13 23:36:29 +00:00
|
|
|
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
|
|
|
|
charsUsed.set(UTF16.charAt(s2, 0));
|
|
|
|
}
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return repChar;
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
// Code points that received a mapping in getCaseFolding; makeCaseFold only emits lines for these bits.
// It is never cleared in this file, so bits accumulate across calls.
static BitSet charsUsed = new BitSet();
|
|
|
|
// Debug trace switch: set per equivalence set by getCaseFolding, read by goodness() to print each score.
static boolean show = false;
|
|
|
|
// goodness() flag bit: the candidate string is unchanged by NFC normalization.
static final int NFC_FORMAT = 64;
|
|
|
|
// goodness() flag bit: the candidate string equals lower(upper(s)) (or is NFD-equivalent to it when PICK_SHORT is on).
static final int ISLOWER = 128;
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static int goodness(String s, boolean full, String condition) {
|
2001-08-30 20:50:18 +00:00
|
|
|
if (s == null) return 0;
|
2001-12-13 23:36:29 +00:00
|
|
|
int result = 32-s.length();
|
|
|
|
if (!PICK_SHORT) {
|
|
|
|
result = s.length();
|
|
|
|
}
|
|
|
|
if (!full) result <<= 8;
|
2003-02-25 23:38:23 +00:00
|
|
|
String low = lower(upper(s, full, condition), full, condition);
|
2001-12-13 23:36:29 +00:00
|
|
|
if (s.equals(low)) result |= ISLOWER;
|
2002-04-23 01:59:16 +00:00
|
|
|
else if (PICK_SHORT && Default.nfd.normalize(s).equals(Default.nfd.normalize(low))) result |= ISLOWER;
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
if (s.equals(Default.nfc.normalize(s))) result |= NFC_FORMAT;
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
if (show) {
|
|
|
|
Utility.fixDot();
|
2002-04-23 01:59:16 +00:00
|
|
|
System.out.println(Utility.hex(result) + ", " + Default.ucd.getCodeAndName(s));
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
2001-08-30 20:50:18 +00:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
/*
|
|
|
|
static HashSet temp = new HashSet();
|
|
|
|
static void normalize(HashSet set) {
|
|
|
|
temp.clear();
|
|
|
|
temp.addAll(set);
|
|
|
|
set.clear();
|
|
|
|
Iterator it = temp.iterator();
|
|
|
|
while (it.hasNext()) {
|
|
|
|
String s = (String) it.next();
|
|
|
|
String s2 = KC.normalize(s);
|
|
|
|
set.add(s);
|
|
|
|
data2.put(s,set);
|
|
|
|
if (!s.equals(s2)) {
|
|
|
|
set.add(s2);
|
|
|
|
data2.put(s2,set);
|
|
|
|
System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*/
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
/*
|
2001-08-31 00:30:17 +00:00
|
|
|
String
|
2002-04-23 01:59:16 +00:00
|
|
|
String lower1 = Default.ucd.getLowercase(ch);
|
|
|
|
String lower2 = Default.ucd.toLowercase(ch,option);
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0);
|
|
|
|
//String lower1 = String.valueOf(Default.ucd.getLowercase(ch));
|
|
|
|
//String lower = Default.ucd.toLowercase(ch2,option);
|
|
|
|
String upper = Default.ucd.toUppercase(ch2,option);
|
|
|
|
String lowerUpper = Default.ucd.toLowercase(upper,option);
|
|
|
|
//String title = Default.ucd.toTitlecase(ch2,option);
|
|
|
|
//String lowerTitle = Default.ucd.toLowercase(upper,option);
|
2001-08-31 00:30:17 +00:00
|
|
|
|
|
|
|
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
|
|
|
|
output.println(Utility.hex(ch)
|
2001-08-30 20:50:18 +00:00
|
|
|
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
|
|
|
|
+ "; " + Utility.hex(lowerUpper," ")
|
2002-04-23 01:59:16 +00:00
|
|
|
+ ";\t#" + Default.ucd.getName(ch)
|
2001-08-30 20:50:18 +00:00
|
|
|
);
|
|
|
|
//if (!lowerUpper.equals(lower)) {
|
2002-04-23 01:59:16 +00:00
|
|
|
// output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower));
|
2001-08-30 20:50:18 +00:00
|
|
|
//}
|
|
|
|
//if (!lowerUpper.equals(lowerTitle)) {
|
2002-04-23 01:59:16 +00:00
|
|
|
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle));
|
2001-08-30 20:50:18 +00:00
|
|
|
//}
|
|
|
|
}
|
|
|
|
*/
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
|
2001-08-30 20:50:18 +00:00
|
|
|
String charStr = UTF32.valueOf32(ch);
|
2003-02-25 23:38:23 +00:00
|
|
|
String lowerStr = lower(charStr, full, condition);
|
|
|
|
String titleStr = title(charStr, full, condition);
|
|
|
|
String upperStr = upper(charStr, full, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
|
|
|
|
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
// make new set
|
|
|
|
Set set = new TreeSet();
|
|
|
|
set.add(charStr);
|
|
|
|
data.put(charStr, set);
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
// add cases to get started
|
|
|
|
add(set, lowerStr, data);
|
|
|
|
add(set, upperStr, data);
|
|
|
|
add(set, titleStr, data);
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
// close it
|
|
|
|
main:
|
|
|
|
while (true) {
|
|
|
|
Iterator it = set.iterator();
|
|
|
|
while (it.hasNext()) {
|
|
|
|
String s = (String) it.next();
|
|
|
|
// do funny stuff since we can't modify set while iterating
|
2001-12-13 23:36:29 +00:00
|
|
|
// We don't do this because if the source is not normalized, we don't want to normalize
|
|
|
|
if (nfClose) {
|
2002-04-23 01:59:16 +00:00
|
|
|
if (add(set, Default.nfd.normalize(s), data)) continue main;
|
|
|
|
if (add(set, Default.nfc.normalize(s), data)) continue main;
|
|
|
|
if (add(set, Default.nfkd.normalize(s), data)) continue main;
|
|
|
|
if (add(set, Default.nfkc.normalize(s), data)) continue main;
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
2003-02-25 23:38:23 +00:00
|
|
|
if (add(set, lower(s, full, condition), data)) continue main;
|
|
|
|
if (add(set, title(s, full, condition), data)) continue main;
|
|
|
|
if (add(set, upper(s, full, condition), data)) continue main;
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static String lower(String s, boolean full, String condition) {
|
|
|
|
String result = lower2(s,full, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
return result.replace('\u03C2', '\u03C3'); // HACK for lower
|
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
// These functions are no longer necessary, since Default.ucd is parameterized,
|
2001-08-30 20:50:18 +00:00
|
|
|
// but it's not worth changing
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static String lower2(String s, boolean full, String condition) {
|
2001-12-13 23:36:29 +00:00
|
|
|
/*if (!full) {
|
2001-08-30 20:50:18 +00:00
|
|
|
if (s.length() != 1) return s;
|
2002-04-23 01:59:16 +00:00
|
|
|
return Default.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
*/
|
2003-02-25 23:38:23 +00:00
|
|
|
return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static String upper(String s, boolean full, String condition) {
|
2001-12-13 23:36:29 +00:00
|
|
|
/* if (!full) {
|
2001-08-30 20:50:18 +00:00
|
|
|
if (s.length() != 1) return s;
|
2002-04-23 01:59:16 +00:00
|
|
|
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
*/
|
2003-02-25 23:38:23 +00:00
|
|
|
return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
static String title(String s, boolean full, String condition) {
|
2001-12-13 23:36:29 +00:00
|
|
|
/*if (!full) {
|
2001-08-30 20:50:18 +00:00
|
|
|
if (s.length() != 1) return s;
|
2002-04-23 01:59:16 +00:00
|
|
|
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
*/
|
2003-02-25 23:38:23 +00:00
|
|
|
return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE, condition);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
static boolean add(Set set, String s, Map data) {
|
|
|
|
if (set.contains(s)) return false;
|
|
|
|
set.add(s);
|
|
|
|
if (DEBUG) System.err.println("adding: " + toString(set));
|
|
|
|
Set other = (Set) data.get(s);
|
|
|
|
if (other != null && other != set) { // merge
|
|
|
|
// make all the items in set point to merged set
|
|
|
|
Iterator it = other.iterator();
|
|
|
|
while (it.hasNext()) {
|
|
|
|
data.put(it.next(), set);
|
|
|
|
}
|
|
|
|
set.addAll(other);
|
|
|
|
}
|
|
|
|
if (DEBUG) System.err.println("done adding: " + toString(set));
|
|
|
|
return true;
|
|
|
|
}
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-08-30 20:50:18 +00:00
|
|
|
static String toString(Set set) {
|
2001-12-13 23:36:29 +00:00
|
|
|
return toString(set, false, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
static String toString(Set set, boolean name, boolean crtab) {
|
2001-08-30 20:50:18 +00:00
|
|
|
String result = "{";
|
|
|
|
Iterator it2 = set.iterator();
|
|
|
|
boolean first = true;
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
String s2 = (String) it2.next();
|
2001-12-13 23:36:29 +00:00
|
|
|
if (!first) {
|
|
|
|
if (crtab) {
|
|
|
|
result += ";\r\n\t";
|
|
|
|
} else {
|
|
|
|
result += "; ";
|
|
|
|
}
|
|
|
|
}
|
2001-08-30 20:50:18 +00:00
|
|
|
first = false;
|
2001-12-13 23:36:29 +00:00
|
|
|
if (name) {
|
2002-04-23 01:59:16 +00:00
|
|
|
result += Default.ucd.getCodeAndName(s2);
|
2001-12-13 23:36:29 +00:00
|
|
|
} else {
|
|
|
|
result += Utility.hex(s2, " ");
|
|
|
|
}
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
|
|
|
return result + "}";
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
static boolean specialNormalizationDiffers(int ch) {
|
|
|
|
if (ch == 0x00DF) return true; // es-zed
|
2002-05-31 01:41:04 +00:00
|
|
|
return !Default.nfkd.isNormalized(ch);
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static String specialNormalization(String s) {
|
|
|
|
if (s.equals("\u00DF")) return "ss";
|
2002-04-23 01:59:16 +00:00
|
|
|
return Default.nfkd.normalize(s);
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static boolean isExcluded(int ch) {
|
2002-03-20 00:21:43 +00:00
|
|
|
// if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
|
2001-12-13 23:36:29 +00:00
|
|
|
if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij
|
|
|
|
if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI
|
|
|
|
if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A..
|
|
|
|
if (0x20A8 <= ch && ch <= 0x217B) return true; // skip Rupee..
|
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
byte type = Default.ucd.getDecompositionType(ch);
|
2001-12-13 23:36:29 +00:00
|
|
|
if (type == COMPAT_SQUARE) return true;
|
|
|
|
//if (type == COMPAT_UNSPECIFIED) return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2002-03-15 00:34:46 +00:00
|
|
|
static void generateSpecialCasing(boolean normalize) throws IOException {
|
2002-04-23 01:59:16 +00:00
|
|
|
Default.setUCD();
|
2001-12-13 23:36:29 +00:00
|
|
|
Map sorted = new TreeMap();
|
|
|
|
|
2002-03-15 00:34:46 +00:00
|
|
|
String suffix2 = "";
|
|
|
|
if (normalize) suffix2 = "-Normalized";
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
|
|
|
|
+ suffix2 + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
|
|
|
Utility.dot(ch);
|
2002-04-23 01:59:16 +00:00
|
|
|
if (!Default.ucd.isRepresented(ch)) continue;
|
2001-12-13 23:36:29 +00:00
|
|
|
if (!specialNormalizationDiffers(ch)) continue;
|
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
String lower = Default.nfc.normalize(Default.ucd.getCase(ch, SIMPLE, LOWER));
|
|
|
|
String upper = Default.nfc.normalize(Default.ucd.getCase(ch, SIMPLE, UPPER));
|
|
|
|
String title = Default.nfc.normalize(Default.ucd.getCase(ch, SIMPLE, TITLE));
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
String chstr = UTF16.valueOf(ch);
|
|
|
|
|
|
|
|
String decomp = specialNormalization(chstr);
|
2002-04-23 01:59:16 +00:00
|
|
|
String flower = Default.nfc.normalize(Default.ucd.getCase(decomp, SIMPLE, LOWER));
|
|
|
|
String fupper = Default.nfc.normalize(Default.ucd.getCase(decomp, SIMPLE, UPPER));
|
|
|
|
String ftitle = Default.nfc.normalize(Default.ucd.getCase(decomp, SIMPLE, TITLE));
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2002-03-15 00:34:46 +00:00
|
|
|
String base = decomp;
|
|
|
|
String blower = specialNormalization(lower);
|
|
|
|
String bupper = specialNormalization(upper);
|
|
|
|
String btitle = specialNormalization(title);
|
|
|
|
|
|
|
|
if (true) {
|
2002-04-23 01:59:16 +00:00
|
|
|
flower = Default.nfc.normalize(flower);
|
|
|
|
fupper = Default.nfc.normalize(fupper);
|
|
|
|
ftitle = Default.nfc.normalize(ftitle);
|
|
|
|
base = Default.nfc.normalize(base);
|
|
|
|
blower = Default.nfc.normalize(blower);
|
|
|
|
bupper = Default.nfc.normalize(bupper);
|
|
|
|
btitle = Default.nfc.normalize(btitle);
|
2002-03-15 00:34:46 +00:00
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2002-03-20 00:21:43 +00:00
|
|
|
if (ch == CHECK_CHAR) {
|
2002-04-23 01:59:16 +00:00
|
|
|
System.out.println("Code: " + Default.ucd.getCodeAndName(ch));
|
|
|
|
System.out.println("Decomp: " + Default.ucd.getCodeAndName(decomp));
|
|
|
|
System.out.println("Base: " + Default.ucd.getCodeAndName(base));
|
|
|
|
System.out.println("SLower: " + Default.ucd.getCodeAndName(lower));
|
|
|
|
System.out.println("FLower: " + Default.ucd.getCodeAndName(flower));
|
|
|
|
System.out.println("BLower: " + Default.ucd.getCodeAndName(blower));
|
|
|
|
System.out.println("STitle: " + Default.ucd.getCodeAndName(title));
|
|
|
|
System.out.println("FTitle: " + Default.ucd.getCodeAndName(ftitle));
|
|
|
|
System.out.println("BTitle: " + Default.ucd.getCodeAndName(btitle));
|
|
|
|
System.out.println("SUpper: " + Default.ucd.getCodeAndName(upper));
|
|
|
|
System.out.println("FUpper: " + Default.ucd.getCodeAndName(fupper));
|
|
|
|
System.out.println("BUpper: " + Default.ucd.getCodeAndName(bupper));
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// presumably if there is a single code point, it would already be in the simple mappings
|
|
|
|
|
|
|
|
if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
|
2002-03-20 00:21:43 +00:00
|
|
|
&& UTF16.countCodePoint(title) == 1) {
|
2002-04-23 01:59:16 +00:00
|
|
|
if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Default.ucd.getCodeAndName(ch));
|
2002-03-20 00:21:43 +00:00
|
|
|
continue;
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
// if there is no change from the base, skip
|
|
|
|
|
2002-03-20 00:21:43 +00:00
|
|
|
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
|
2002-04-23 01:59:16 +00:00
|
|
|
if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Default.ucd.getCodeAndName(ch));
|
2002-03-20 00:21:43 +00:00
|
|
|
continue;
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
// fix special cases
|
|
|
|
// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
|
|
|
|
if (flower.equals(blower)) flower = lower;
|
|
|
|
if (fupper.equals(bupper)) fupper = upper;
|
|
|
|
if (ftitle.equals(btitle)) ftitle = title;
|
|
|
|
|
|
|
|
// if there are no changes from the original, or the expanded original, skip
|
|
|
|
|
2002-03-20 00:21:43 +00:00
|
|
|
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
|
2002-04-23 01:59:16 +00:00
|
|
|
if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Default.ucd.getCodeAndName(ch));
|
2002-03-20 00:21:43 +00:00
|
|
|
continue;
|
|
|
|
}
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
String name = Default.ucd.getName(ch);
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
|
2002-03-20 00:21:43 +00:00
|
|
|
: ch == 0x130 ? 2
|
|
|
|
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
|
|
|
|
: name.indexOf("LIGATURE") >= 0 ? 3
|
|
|
|
: name.indexOf("GEGRAMMENI") < 0 ? 5
|
|
|
|
: UTF16.countCodePoint(ftitle) == 1 ? 6
|
|
|
|
: UTF16.countCodePoint(fupper) == 2 ? 7
|
|
|
|
: 8;
|
|
|
|
|
2002-04-23 01:59:16 +00:00
|
|
|
if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Default.ucd.getCodeAndName(ch));
|
2001-12-13 23:36:29 +00:00
|
|
|
|
2002-03-15 00:34:46 +00:00
|
|
|
// HACK
|
2002-03-20 00:21:43 +00:00
|
|
|
boolean denormalize = !normalize && order != 6 && order != 7;
|
2002-03-15 00:34:46 +00:00
|
|
|
|
|
|
|
String mapping = Utility.hex(ch)
|
2002-04-23 01:59:16 +00:00
|
|
|
+ "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd.normalize(flower) : flower)
|
|
|
|
+ "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd.normalize(ftitle) : ftitle)
|
|
|
|
+ "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd.normalize(fupper) : fupper)
|
|
|
|
+ "; # " + Default.ucd.getName(ch);
|
2001-12-13 23:36:29 +00:00
|
|
|
|
|
|
|
// special exclusions
|
|
|
|
if (isExcluded(ch)) {
|
|
|
|
log.println("# " + mapping);
|
|
|
|
} else {
|
2002-03-15 00:34:46 +00:00
|
|
|
int x = ch;
|
|
|
|
if (ch == 0x01F0) x = 0x03B1; // HACK to reorder the same
|
|
|
|
sorted.put(new Integer((order << 24) | x), mapping);
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
log.close();
|
|
|
|
|
|
|
|
System.out.println("Writing");
|
2002-03-15 00:34:46 +00:00
|
|
|
String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
|
2002-07-30 09:57:18 +00:00
|
|
|
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
2003-02-25 23:38:23 +00:00
|
|
|
String[] batName = {""};
|
|
|
|
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true), batName);
|
2002-03-15 00:34:46 +00:00
|
|
|
out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
|
|
|
|
out.println(GenerateData.generateDateLine());
|
|
|
|
out.println("#");
|
2002-10-05 01:28:58 +00:00
|
|
|
Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out);
|
2001-08-31 00:30:17 +00:00
|
|
|
|
2001-12-13 23:36:29 +00:00
|
|
|
Iterator it = sorted.keySet().iterator();
|
|
|
|
int lastOrder = -1;
|
|
|
|
while (it.hasNext()) {
|
|
|
|
Integer key = (Integer) it.next();
|
|
|
|
String line = (String) sorted.get(key);
|
|
|
|
int order = key.intValue() >> 24;
|
|
|
|
if (order != lastOrder) {
|
|
|
|
lastOrder = order;
|
|
|
|
out.println();
|
|
|
|
boolean skipLine = false;
|
|
|
|
switch(order) {
|
|
|
|
case 1:
|
|
|
|
out.println("# The German es-zed is special--the normal mapping is to SS.");
|
|
|
|
out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
|
|
|
|
break;
|
2002-03-20 00:21:43 +00:00
|
|
|
case 2:
|
|
|
|
out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
|
|
|
|
break;
|
|
|
|
case 3: out.println("# Ligatures"); break;
|
|
|
|
case 4: skipLine = true; break;
|
|
|
|
case 5: out.println("# No corresponding uppercase precomposed character"); break;
|
2002-10-05 01:28:58 +00:00
|
|
|
case 6: Utility.appendFile("SpecialCasingIota.txt", Utility.UTF8, out); break;
|
2004-02-06 18:32:05 +00:00
|
|
|
case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
|
2002-03-20 00:21:43 +00:00
|
|
|
case 8: skipLine = true; break;
|
2001-12-13 23:36:29 +00:00
|
|
|
}
|
|
|
|
if (!skipLine) out.println();
|
|
|
|
}
|
|
|
|
out.println(line);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
2002-10-05 01:28:58 +00:00
|
|
|
Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
|
2001-12-13 23:36:29 +00:00
|
|
|
out.close();
|
2003-02-25 23:38:23 +00:00
|
|
|
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
2001-08-30 20:50:18 +00:00
|
|
|
}
|
|
|
|
}
|