2005-05-27 21:43:46 +00:00
|
|
|
/**
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
|
|
|
* others. All Rights Reserved. *
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
|
2005-06-24 23:51:52 +00:00
|
|
|
* $Date: 2005/06/24 23:51:52 $
|
|
|
|
* $Revision: 1.3 $
|
2005-05-27 21:43:46 +00:00
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
package com.ibm.text.UCD;
|
|
|
|
|
|
|
|
import java.io.BufferedReader;
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.PrintWriter;
|
2005-06-21 21:28:31 +00:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Arrays;
|
2005-05-27 21:43:46 +00:00
|
|
|
import java.util.Comparator;
|
|
|
|
import java.util.HashMap;
|
2005-06-21 21:28:31 +00:00
|
|
|
import java.util.HashSet;
|
2005-05-27 21:43:46 +00:00
|
|
|
import java.util.Iterator;
|
2005-06-21 21:28:31 +00:00
|
|
|
import java.util.List;
|
2005-05-27 21:43:46 +00:00
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Set;
|
|
|
|
import java.util.TreeMap;
|
|
|
|
import java.util.TreeSet;
|
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
import javax.transaction.xa.Xid;
|
|
|
|
|
2005-06-24 23:51:52 +00:00
|
|
|
import com.ibm.icu.dev.demo.translit.InfoDialog;
|
2005-05-27 21:43:46 +00:00
|
|
|
import com.ibm.icu.dev.test.util.ArrayComparator;
|
|
|
|
import com.ibm.icu.dev.test.util.BagFormatter;
|
2005-06-24 23:51:52 +00:00
|
|
|
import com.ibm.icu.dev.test.util.CollectionUtilities;
|
|
|
|
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
|
2005-06-21 21:28:31 +00:00
|
|
|
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
2005-05-27 21:43:46 +00:00
|
|
|
import com.ibm.icu.dev.test.util.UnicodeMap;
|
2005-06-21 21:28:31 +00:00
|
|
|
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
2005-05-27 21:43:46 +00:00
|
|
|
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
2005-06-21 21:28:31 +00:00
|
|
|
import com.ibm.icu.lang.UScript;
|
2005-05-27 21:43:46 +00:00
|
|
|
import com.ibm.icu.text.UTF16;
|
|
|
|
import com.ibm.icu.text.UnicodeSet;
|
2005-06-21 21:28:31 +00:00
|
|
|
import com.ibm.icu.text.UnicodeSetIterator;
|
2005-05-27 21:43:46 +00:00
|
|
|
import com.ibm.text.utility.Utility;
|
2005-06-21 21:28:31 +00:00
|
|
|
import com.ibm.text.utility.XEquivalenceClass;
|
2005-06-24 23:51:52 +00:00
|
|
|
import com.sun.corba.se.connection.GetEndPointInfoAgainException;
|
2005-05-27 21:43:46 +00:00
|
|
|
|
|
|
|
public class GenerateConfusables {
|
2005-06-24 23:51:52 +00:00
|
|
|
public static boolean EXCLUDE_CONFUSABLE_COMPAT = true;
|
2005-06-21 21:28:31 +00:00
|
|
|
|
|
|
|
public static void main(String[] args) throws IOException {
|
|
|
|
Set arg2 = new HashSet(Arrays.asList(args));
|
|
|
|
try {
|
|
|
|
if (arg2.contains("-b")) generateIDN();
|
|
|
|
if (arg2.contains("-c")) generateConfusables();
|
|
|
|
if (arg2.contains("-d")) generateDecompFile();
|
|
|
|
if (arg2.contains("-s")) generateSource();
|
|
|
|
} catch (Exception e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
} finally {
|
|
|
|
System.out.println("Done");
|
|
|
|
}
|
|
|
|
}
|
2005-05-27 21:43:46 +00:00
|
|
|
static PrintWriter log;
|
|
|
|
static final String ARROW = "\u2192";
|
2005-06-24 23:51:52 +00:00
|
|
|
static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
|
2005-06-21 21:28:31 +00:00
|
|
|
static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn")
|
|
|
|
.addAll(ups.getSet("gc=Co"))
|
|
|
|
.addAll(ups.getSet("gc=Cs"));
|
|
|
|
static UnicodeSet skipSet = ups.getSet("gc=Cc")
|
|
|
|
.addAll(ups.getSet("gc=Cf"))
|
|
|
|
.addAll(UNASSIGNED);
|
|
|
|
static UnicodeSet whiteSpace = ups.getSet("Whitespace=TRUE");
|
2005-06-24 23:51:52 +00:00
|
|
|
static UnicodeSet lowercase = ups.getSet("gc=Ll");
|
2005-06-21 21:28:31 +00:00
|
|
|
static UnicodeSet _skipNFKD;
|
|
|
|
|
|
|
|
static Map gatheredNFKD = new TreeMap();
|
|
|
|
static UnicodeMap nfcMap = new UnicodeMap();
|
|
|
|
|
|
|
|
static String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\source\\";
|
|
|
|
static String outdir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
|
|
|
|
|
|
|
|
static Comparator codepointComparator = new UTF16.StringComparator();
|
|
|
|
|
|
|
|
static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
|
|
|
|
"\\u3400-\\u4DB5" +
|
|
|
|
"\\u4E00-\\u9FA5" +
|
|
|
|
"\\uA000-\\uA48C" +
|
|
|
|
"\\uAC00-\\uD7A3" +
|
|
|
|
"\\u1100-\\u11FF" +
|
|
|
|
"\\uFB00-\\uFEFC" +
|
|
|
|
"\\u2460-\\u24FF" +
|
|
|
|
"\\u3251-\\u33FF" +
|
|
|
|
"\\u4DC0-\\u4DFF" +
|
2005-06-24 23:51:52 +00:00
|
|
|
"\\u3165-\\u318E" +
|
|
|
|
"\\uA490-\\uA4C6" +
|
|
|
|
"\\U00010140-\\U00010174" +
|
2005-06-21 21:28:31 +00:00
|
|
|
"\\U0001D300-\\U0001D356" +
|
|
|
|
"\\U0001D000-\\U0001D1DD" +
|
|
|
|
"\\U00020000-\\U0002A6D6" +
|
|
|
|
"\\U0001D400-\\U0001D7FF" +
|
|
|
|
"[:script=Canadian_Aboriginal:]" +
|
|
|
|
"[:script=ETHIOPIC:]" +
|
|
|
|
"[:script=Tagalog:]" +
|
|
|
|
"[:script=Hanunoo:]" +
|
|
|
|
"[:script=Buhid:]" +
|
|
|
|
"[:script=Tagbanwa:]" +
|
|
|
|
"[:script=Deseret:]" +
|
|
|
|
"[:script=Shavian:]" +
|
|
|
|
"[:script=Ogham:]" +
|
|
|
|
"[:script=Old Italic:]" +
|
|
|
|
"[:script=Runic:]" +
|
|
|
|
"[:script=Gothic:]" +
|
|
|
|
"[:script=Ugaritic:]" +
|
|
|
|
"[:script=Linear B:]" +
|
|
|
|
"[:script=Cypriot:]" +
|
|
|
|
"[:script=Coptic:]" +
|
|
|
|
"[:script=Syriac:]" +
|
|
|
|
"[:script=Glagolitic:]" +
|
|
|
|
"[:script=Glagolitic:]" +
|
|
|
|
"[:script=Old Persian:]" +
|
|
|
|
"[:script=Kharoshthi:]" +
|
|
|
|
"[:script=Osmanya:]" +
|
|
|
|
"[:default ignorable code point:]" +
|
|
|
|
"]");
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @throws IOException
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private static void generateIDN() throws IOException {
|
2005-06-24 23:51:52 +00:00
|
|
|
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
|
|
|
|
info.printIDNStuff();
|
|
|
|
}
|
|
|
|
|
|
|
|
private static class IdentifierInfo {
|
|
|
|
static private IdentifierInfo info;
|
2005-06-21 21:28:31 +00:00
|
|
|
|
2005-06-24 23:51:52 +00:00
|
|
|
static IdentifierInfo getIdentifierInfo() {
|
|
|
|
try {
|
|
|
|
if (info == null) info = new IdentifierInfo();
|
|
|
|
return info;
|
|
|
|
} catch (Exception e) {
|
|
|
|
throw (RuntimeException) new IllegalArgumentException("Unable to access data").initCause(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private boolean mergeRanges = true;
|
|
|
|
|
|
|
|
private UnicodeSet removalSet, remainingOutputSet, inputSet_strict, inputSet_lenient, nonstarting;
|
|
|
|
UnicodeSet propNFKCSet, notInXID, xidPlus;
|
|
|
|
|
|
|
|
private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(),
|
|
|
|
reviews, removals2, lowerIsBetter;
|
|
|
|
|
|
|
|
private IdentifierInfo() throws IOException {
|
|
|
|
propNFKCSet = ups.getSet("NFKC_QuickCheck=N")
|
|
|
|
.complement();
|
|
|
|
UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE");
|
|
|
|
|
|
|
|
loadFileData();
|
|
|
|
xidPlus = new UnicodeSet(propXIDContinueSet).addAll(
|
|
|
|
additions.getSet(null).complement()).retainAll(propNFKCSet);
|
|
|
|
|
|
|
|
getIdentifierSet();
|
|
|
|
notInXID = new UnicodeSet(IDNOutputSet)
|
|
|
|
.removeAll(xidPlus);
|
|
|
|
removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
|
|
|
|
removalSet = removals.getSet(null).complement();
|
|
|
|
|
|
|
|
remainingOutputSet = new UnicodeSet(IDNOutputSet)
|
|
|
|
.removeAll(removalSet);
|
|
|
|
|
|
|
|
UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet)
|
|
|
|
.removeAll(removalSet).removeAll(remainingOutputSet);
|
|
|
|
UnicodeSet remainingInputSet = new UnicodeSet();
|
|
|
|
UnicodeSet specialRemove = new UnicodeSet();
|
|
|
|
// remove any others that don't normalize/case fold to something in
|
|
|
|
// the output set
|
|
|
|
for (UnicodeSetIterator usi = new UnicodeSetIterator(
|
|
|
|
remainingInputSet1); usi.next();) {
|
|
|
|
String nss = Default.nfkc().normalize(usi.getString());
|
|
|
|
String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD);
|
|
|
|
String cf2 = Default.nfkc().normalize(cf);
|
|
|
|
if (remainingOutputSet.containsAll(cf2))
|
|
|
|
remainingInputSet.add(usi.codepoint);
|
|
|
|
else
|
|
|
|
specialRemove.add(usi.codepoint);
|
|
|
|
}
|
|
|
|
// filter out the items that are case foldings of items in output
|
|
|
|
inputSet_strict = new UnicodeSet();
|
|
|
|
for (UnicodeSetIterator usi = new UnicodeSetIterator(
|
|
|
|
remainingInputSet); usi.next();) {
|
|
|
|
String ss = usi.getString();
|
|
|
|
String nss = Default.nfkc().normalize(ss);
|
|
|
|
String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD);
|
|
|
|
if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) {
|
|
|
|
System.out.println("check");
|
|
|
|
}
|
|
|
|
//> > 2126 ; retained-input-only-CF # (?) OHM SIGN
|
|
|
|
//> > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN
|
|
|
|
|
|
|
|
if (!remainingOutputSet.containsAll(nss)
|
|
|
|
&& remainingOutputSet.containsAll(cf))
|
|
|
|
inputSet_strict.add(ss);
|
|
|
|
}
|
|
|
|
// hack
|
|
|
|
inputSet_strict.remove(0x03F4).remove(0x2126).remove(0x212B);
|
|
|
|
inputSet_lenient = new UnicodeSet(remainingInputSet)
|
|
|
|
.removeAll(inputSet_strict);
|
|
|
|
nonstarting = new UnicodeSet(remainingOutputSet).addAll(
|
|
|
|
remainingInputSet).retainAll(new UnicodeSet("[:M:]"));
|
|
|
|
reviews = new UnicodeMap().putAll(removals);
|
|
|
|
reviews.putAll(remainingOutputSet, "output");
|
|
|
|
reviews.putAll(inputSet_strict, "input");
|
|
|
|
reviews.putAll(inputSet_lenient, "input-lenient");
|
|
|
|
reviews.putAll(specialRemove, PROHIBITED + "output-disallowed");
|
|
|
|
|
|
|
|
lowerIsBetter = new UnicodeMap();
|
|
|
|
|
|
|
|
lowerIsBetter.putAll(propNFKCSet, MARK_NFC); // nfkc is better than the alternative
|
|
|
|
lowerIsBetter.putAll(inputSet_lenient, MARK_INPUT_LENIENT);
|
|
|
|
lowerIsBetter.putAll(inputSet_strict, MARK_INPUT_STRICT);
|
|
|
|
lowerIsBetter.putAll(remainingOutputSet, MARK_OUTPUT);
|
|
|
|
lowerIsBetter.setMissing(MARK_NOT_NFC);
|
|
|
|
|
|
|
|
lowerIsBetter.lock();
|
|
|
|
// add special values:
|
|
|
|
//lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0));
|
|
|
|
|
|
|
|
UnicodeMap nonstartingmap = new UnicodeMap().putAll(nonstarting,
|
|
|
|
"nonstarting");
|
|
|
|
UnicodeMap.Composer composer = new UnicodeMap.Composer() {
|
|
|
|
public Object compose(int codePoint, Object a, Object b) {
|
|
|
|
if (a == null)
|
|
|
|
return b;
|
|
|
|
else if (b == null)
|
|
|
|
return a;
|
|
|
|
else
|
|
|
|
return a.toString() + "-" + b.toString();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
reviews.composeWith(nonstartingmap, composer);
|
|
|
|
reviews.putAll(new UnicodeSet(IDNInputSet).complement(), "");
|
|
|
|
UnicodeMap.Composer composer2 = new UnicodeMap.Composer() {
|
|
|
|
public Object compose(int codePoint, Object a, Object b) {
|
|
|
|
if (b == null)
|
|
|
|
return a;
|
|
|
|
return "remap-to-" + Utility.hex(b.toString());
|
|
|
|
}
|
|
|
|
};
|
|
|
|
reviews.composeWith(remap, composer2);
|
|
|
|
removals2 = new UnicodeMap().putAll(removals);
|
|
|
|
removals2.putAll(ups.getSet("XID_Continue=TRUE").complement(),
|
|
|
|
PROHIBITED + NOT_IN_XID);
|
|
|
|
removals2.setMissing("future?");
|
|
|
|
|
|
|
|
additions.lock();
|
|
|
|
remap.lock();
|
|
|
|
removals.lock();
|
|
|
|
reviews.lock();
|
|
|
|
removals2.lock();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private void loadFileData() throws IOException {
|
|
|
|
// get the word chars
|
|
|
|
BufferedReader br = BagFormatter.openUTF8Reader(indir,
|
|
|
|
"wordchars.txt");
|
|
|
|
String line = null;
|
|
|
|
try {
|
|
|
|
while (true) {
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
if (line == null)
|
|
|
|
break;
|
|
|
|
if (line.length() == 0)
|
2005-06-21 21:28:31 +00:00
|
|
|
continue;
|
2005-06-24 23:51:52 +00:00
|
|
|
String[] pieces = Utility.split(line, ';');
|
|
|
|
int code = Integer.parseInt(pieces[0].trim(), 16);
|
|
|
|
if (pieces[1].trim().equals("remap-to")) {
|
|
|
|
remap.put(code, UTF16.valueOf(Integer.parseInt(
|
|
|
|
pieces[2].trim(), 16)));
|
|
|
|
} else {
|
|
|
|
if (XIDContinueSet.contains(code)) {
|
|
|
|
System.out.println("Already in XID continue: "
|
|
|
|
+ line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
additions.put(code, "addition");
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
} catch (Exception e) {
|
|
|
|
throw (RuntimeException) new RuntimeException(
|
|
|
|
"Failure on line " + line).initCause(e);
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
br.close();
|
|
|
|
|
|
|
|
// get all the removals.
|
|
|
|
br = BagFormatter.openUTF8Reader(indir, "removals.txt");
|
|
|
|
UnicodeSet allocated = ups.getSet("generalcategory=cn").complement();
|
|
|
|
|
|
|
|
UnicodeSet sources = new UnicodeSet();
|
|
|
|
line = null;
|
|
|
|
try {
|
|
|
|
while (true) {
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
if (line == null)
|
|
|
|
break;
|
|
|
|
if (line.length() == 0)
|
|
|
|
continue;
|
|
|
|
sources.clear();
|
|
|
|
String[] pieces = Utility.split(line, ';');
|
|
|
|
if (pieces.length < 2) {
|
|
|
|
System.out.println("Missing line " + line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
String codelist = pieces[0].trim();
|
|
|
|
String reasons = pieces[1].trim();
|
|
|
|
if (pieces[0].startsWith("[")) {
|
|
|
|
sources = new UnicodeSet(codelist).retainAll(allocated);
|
|
|
|
} else {
|
|
|
|
String[] codes = Utility.split(codelist, ' ');
|
|
|
|
for (int i = 0; i < codes.length; ++i) {
|
|
|
|
if (codes[i].length() == 0)
|
|
|
|
continue;
|
|
|
|
String[] range = codes[i].split("\\.\\.");
|
|
|
|
int start = Integer.parseInt(range[0], 16);
|
|
|
|
int end = start;
|
|
|
|
if (range.length > 1)
|
|
|
|
end = Integer.parseInt(range[1], 16);
|
|
|
|
sources.add(start, end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
removals.putAll(sources, PROHIBITED + reasons);
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
throw (RuntimeException) new RuntimeException(
|
|
|
|
"Failure on line " + line).initCause(e);
|
|
|
|
}
|
|
|
|
br.close();
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
|
|
|
|
2005-06-24 23:51:52 +00:00
|
|
|
void printIDNStuff() throws IOException {
|
|
|
|
PrintWriter out;
|
|
|
|
printIDModifications();
|
|
|
|
writeIDChars();
|
|
|
|
writeIDReview();
|
|
|
|
generateDecompFile();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private void writeIDReview() throws IOException {
|
|
|
|
BagFormatter bf = new BagFormatter();
|
|
|
|
bf.setUnicodePropertyFactory(ups);
|
|
|
|
bf.setLabelSource(null);
|
|
|
|
bf.setShowLiteral(bf.toHTMLControl);
|
|
|
|
bf.setMergeRanges(true);
|
|
|
|
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
|
|
|
|
//reviews.putAll(UNASSIGNED, "");
|
|
|
|
out.print("\uFEFF");
|
|
|
|
out.println("# Review List for IDN");
|
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
|
|
|
out.println("");
|
|
|
|
|
|
|
|
UnicodeSet fullSet = reviews.getSet("").complement();
|
|
|
|
|
|
|
|
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
|
|
|
|
}).set(reviews).setMain("Reviews", "GCB",
|
|
|
|
UnicodeProperty.ENUMERATED, "1.0"));
|
|
|
|
//bf.setMergeRanges(false);
|
|
|
|
|
|
|
|
FakeBreak fakeBreak = new FakeBreak();
|
|
|
|
bf.setRangeBreakSource(fakeBreak);
|
|
|
|
out.println("");
|
|
|
|
out.println("# Characters allowed in IDNA");
|
|
|
|
out.println("");
|
|
|
|
bf.showSetNames(out, new UnicodeSet(fullSet)); // .removeAll(bigSets)
|
|
|
|
//bf.setMergeRanges(true);
|
|
|
|
// out.println("");
|
|
|
|
// out.println("# Large Ranges");
|
|
|
|
// out.println("");
|
|
|
|
// bf.showSetNames(out, new UnicodeSet(fullSet).retainAll(bigSets));
|
|
|
|
out.println("");
|
|
|
|
out.println("# Characters disallowed in IDNA");
|
|
|
|
out
|
|
|
|
.println("# The IDNA spec doesn't allow any of these characters,");
|
|
|
|
out
|
|
|
|
.println("# so don't report any of them as being missing from the above list.");
|
|
|
|
out
|
|
|
|
.println("# Some possible future additions, once IDNA updates to Unicode 4.1, are given.");
|
|
|
|
out.println("");
|
|
|
|
//bf.setRangeBreakSource(UnicodeLabel.NULL);
|
|
|
|
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
|
|
|
|
}).set(removals2).setMain("Removals", "GCB",
|
|
|
|
UnicodeProperty.ENUMERATED, "1.0"));
|
|
|
|
//bf.setValueSource(UnicodeLabel.NULL);
|
|
|
|
bf.showSetNames(out, new UnicodeSet(IDNInputSet).complement()
|
|
|
|
.removeAll(UNASSIGNED));
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private void writeIDChars() throws IOException {
|
|
|
|
BagFormatter bf = new BagFormatter();
|
|
|
|
bf.setUnicodePropertyFactory(ups);
|
|
|
|
bf.setLabelSource(null);
|
|
|
|
bf.setShowLiteral(bf.toHTMLControl);
|
|
|
|
bf.setMergeRanges(true);
|
|
|
|
|
|
|
|
UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
|
|
|
|
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
|
|
|
|
|
|
|
|
out.println("# Recommended Identifier Profiles for IDN");
|
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
out.println("# Output Characters");
|
|
|
|
out.println("");
|
|
|
|
bf.setValueSource("output");
|
|
|
|
bf.showSetNames(out, remainingOutputSet);
|
|
|
|
showExtras(bf, remainingOutputSet, letters);
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
out.println("# Input Characters");
|
|
|
|
out.println("");
|
|
|
|
bf.setValueSource("input");
|
|
|
|
bf.showSetNames(out, inputSet_strict);
|
|
|
|
showExtras(bf, inputSet_strict, letters);
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
out.println("# Input Characters (lenient)");
|
|
|
|
out.println("");
|
|
|
|
bf.setValueSource("input-lenient");
|
|
|
|
bf.showSetNames(out, inputSet_lenient);
|
|
|
|
showExtras(bf, inputSet_lenient, letters);
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
out
|
|
|
|
.println("# Not allowed at start of identifier");
|
|
|
|
out.println("");
|
|
|
|
bf.setValueSource("nonstarting");
|
|
|
|
bf.showSetNames(out, nonstarting);
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
|
|
|
|
showRemapped(out,
|
|
|
|
"Characters remapped on input (in GUIs)", remap);
|
|
|
|
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) {
|
|
|
|
UnicodeSet extra = new UnicodeSet(source).removeAll(letters);
|
|
|
|
if (extra.size() != 0) {
|
|
|
|
UnicodeSet fixed = new UnicodeSet();
|
|
|
|
for (UnicodeSetIterator it = new UnicodeSetIterator(extra); it.next();) {
|
|
|
|
if (!letters.containsAll(Default.nfkd().normalize(it.getString()))) {
|
|
|
|
fixed.add(it.codepoint);
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
System.out.println(bf.showSetNames(fixed));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private void printIDModifications() throws IOException {
|
|
|
|
BagFormatter bf = new BagFormatter();
|
|
|
|
bf.setUnicodePropertyFactory(ups);
|
|
|
|
bf.setLabelSource(null);
|
|
|
|
bf.setShowLiteral(bf.toHTMLControl);
|
|
|
|
bf.setMergeRanges(true);
|
|
|
|
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir,
|
|
|
|
"xidmodifications.txt");
|
|
|
|
|
|
|
|
out.println("# Security Profile for General Identifiers");
|
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
|
|
|
out.println("");
|
|
|
|
|
|
|
|
out.println("# Characters restricted");
|
|
|
|
out.println("");
|
|
|
|
/*
|
|
|
|
* for (Iterator it = values.iterator(); it.hasNext();) { String
|
|
|
|
* reason1 = (String)it.next(); bf.setValueSource(reason1);
|
|
|
|
* out.println(""); bf.showSetNames(out, removals.getSet(reason1)); }
|
|
|
|
*/
|
|
|
|
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
|
|
|
|
}).set(removals).setMain("Removals", "GCB",
|
|
|
|
UnicodeProperty.ENUMERATED, "1.0"));
|
|
|
|
bf.showSetNames(out, removalSet);
|
|
|
|
|
|
|
|
out.println("");
|
|
|
|
out.println("# Characters added");
|
|
|
|
out.println("");
|
|
|
|
bf.setValueSource("addition");
|
|
|
|
bf.showSetNames(out, additions.getSet(null).complement());
|
|
|
|
|
|
|
|
showRemapped(out, "Characters remapped on input", remap);
|
|
|
|
|
|
|
|
out.close();
|
|
|
|
|
|
|
|
UnicodeMap someRemovals = new UnicodeMap();
|
|
|
|
UnicodeMap.Composer myComposer = new UnicodeMap.Composer() {
|
|
|
|
public Object compose(int codePoint, Object a, Object b) {
|
|
|
|
if (b == null) return null;
|
|
|
|
String x = (String)b;
|
|
|
|
if (!IDNOutputSet.contains(codePoint)) {
|
|
|
|
return "~IDNA";
|
|
|
|
}
|
|
|
|
if (!xidPlus.contains(codePoint)) {
|
|
|
|
return "~Unicode Identifier";
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
if (x.startsWith(PROHIBITED)) x = x.substring(PROHIBITED.length());
|
|
|
|
//if (!propNFKCSet.contains(codePoint)) x += "*";
|
|
|
|
if (lowercase.contains(codePoint)) {
|
|
|
|
String upper = Default.ucd().getCase(codePoint, UCD.FULL, UCD.UPPER);
|
|
|
|
if (upper.equals(UTF16.valueOf(codePoint))
|
|
|
|
&& x.equals("technical symbol (phonetic)")) x = "technical symbol (phonetic with no uppercase)";
|
|
|
|
}
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
someRemovals.composeWith(removals, myComposer);
|
|
|
|
//someRemovals = removals;
|
|
|
|
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
|
|
|
|
out.println("# Characters restricted in domain names");
|
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
|
|
|
out.println("#");
|
|
|
|
out.println("# This file contains a draft list of characters for use in");
|
|
|
|
out.println("# UTR #36: Unicode Security Considerations");
|
|
|
|
out.println("# http://unicode.org/draft/reports/tr36/tr36.html");
|
|
|
|
out.println("# According to the recommendations in that document, these characters");
|
|
|
|
out.println("# would be restricted in domain names: people would only be able to use them");
|
|
|
|
out.println("# by using lenient security settings.");
|
|
|
|
out.println("#");
|
|
|
|
out.println("# If you have any feedback on this list, please use the submission form at:");
|
|
|
|
out.println("# http://unicode.org/reporting.html.");
|
|
|
|
out.println("#");
|
|
|
|
out.println("# Notes:");
|
|
|
|
out.println("# - Characters are listed along with a reason for their removal.");
|
|
|
|
out.println("# - Characters listed as ~IDNA are excluded at this point in domain names,");
|
|
|
|
out.println("# in many cases because the international domain name specification does not contain");
|
|
|
|
out.println("# characters beyond Unicode 3.2. At this point in time, feedback on those characters");
|
|
|
|
out.println("# is not relevant.");
|
|
|
|
out.println("# - Characters listed as ~Unicode Identifiers are restricted because they");
|
|
|
|
out.println("# do not fit the specification of identifiers given in");
|
|
|
|
out.println("# UAX #31: Identifier and Pattern Syntax");
|
|
|
|
out.println("# http://unicode.org/reports/tr31/");
|
|
|
|
out.println("# - The files in this directory are 'live', and may change at any time.");
|
|
|
|
out.println("# Please include the above Revision number in your feedback.");
|
|
|
|
|
|
|
|
bf.setRangeBreakSource(new FakeBreak2());
|
|
|
|
if (true) {
|
|
|
|
Set values = new TreeSet(someRemovals.getAvailableValues());
|
|
|
|
for (Iterator it = values.iterator(); it.hasNext();) {
|
|
|
|
String reason1 = (String) it.next();
|
|
|
|
bf.setValueSource(reason1);
|
|
|
|
out.println("");
|
|
|
|
bf.showSetNames(out, someRemovals.getSet(reason1));
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
} else {
|
|
|
|
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
|
|
|
|
}).set(someRemovals).setMain("Removals", "GCB",
|
|
|
|
UnicodeProperty.ENUMERATED, "1.0"));
|
|
|
|
bf.showSetNames(out, someRemovals.getSet(null).complement());
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
out.close();
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-06-24 23:51:52 +00:00
|
|
|
static final String PROHIBITED = "restricted ; ";
|
|
|
|
static final String NOT_IN_XID = "not in XID+";
|
2005-06-21 21:28:31 +00:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private static void generateDecompFile() throws IOException {
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "decomps.txt");
|
|
|
|
UnicodeProperty dt = ups.getProperty("Decomposition_Type");
|
|
|
|
for (Iterator it = dt.getAvailableValues().iterator(); it.hasNext();) {
|
|
|
|
String value = (String) it.next();
|
|
|
|
if (value.equalsIgnoreCase("none") || value.equalsIgnoreCase("canonical")) continue;
|
|
|
|
UnicodeSet s = dt.getSet(value);
|
|
|
|
out.println("");
|
|
|
|
out.println("# Decomposition_Type = " + value);
|
|
|
|
out.println("");
|
|
|
|
for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) {
|
|
|
|
String source = usi.getString();
|
|
|
|
String target = Default.nfkc().normalize(source);
|
|
|
|
writeSourceTargetLine(out, source, null, target, value);
|
|
|
|
}
|
|
|
|
//bf.showSetNames(out, s);
|
|
|
|
out.flush();
|
|
|
|
}
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
|
|
|
|
static class FakeBreak extends UnicodeLabel {
|
2005-06-24 23:51:52 +00:00
|
|
|
UnicodeSet nobreakSet = setsToAbbreviate;
|
|
|
|
public String getValue(int codepoint, boolean isShort) {
|
|
|
|
return nobreakSet.contains(codepoint) ? ""
|
|
|
|
: (codepoint & 1) == 0 ? "O"
|
|
|
|
: "E";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static class FakeBreak2 extends UnicodeLabel {
|
|
|
|
UnicodeSet nobreakSet = new UnicodeSet(setsToAbbreviate)
|
|
|
|
.addAll(new UnicodeSet(IDNOutputSet).complement())
|
|
|
|
.addAll(new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus).complement());
|
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
public String getValue(int codepoint, boolean isShort) {
|
|
|
|
return nobreakSet.contains(codepoint) ? ""
|
|
|
|
: (codepoint & 1) == 0 ? "O"
|
|
|
|
: "E";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private static void showRemapped(PrintWriter out, String title, UnicodeMap remap) {
|
|
|
|
out.println("");
|
|
|
|
out.println("# " + title);
|
|
|
|
out.println("");
|
|
|
|
int count = 0;
|
|
|
|
for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.getSet(null).complement()); usi.next();) {
|
|
|
|
writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null);
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
out.println("");
|
|
|
|
out.println("# Total code points: " + count);
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static UnicodeSet XIDContinueSet = new UnicodeSet("[:XID_Continue:]");
|
|
|
|
private static UnicodeSet IDNOutputSet, IDNInputSet, _preferredIDSet;
|
|
|
|
|
|
|
|
static UnicodeSet getIdentifierSet() {
|
|
|
|
if (_preferredIDSet == null) {
|
|
|
|
IDNOutputSet = new UnicodeSet();
|
|
|
|
IDNInputSet = new UnicodeSet();
|
|
|
|
IDNOutputSet.add('-'); // HACK
|
|
|
|
IDNInputSet.add('-');
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
|
|
Utility.dot(cp);
|
|
|
|
int cat = Default.ucd().getCategory(cp);
|
|
|
|
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
|
|
|
// get IDNA
|
|
|
|
int idnaType = GenerateStringPrep.getIDNAType(cp);
|
|
|
|
if (idnaType == GenerateStringPrep.OK) IDNOutputSet.add(cp);
|
|
|
|
if (idnaType != GenerateStringPrep.ILLEGAL) IDNInputSet.add(cp);
|
|
|
|
}
|
|
|
|
_preferredIDSet = new UnicodeSet(IDNOutputSet).addAll(XIDContinueSet);
|
|
|
|
}
|
|
|
|
_preferredIDSet.add(0x2018).add(0x2019);
|
|
|
|
return _preferredIDSet;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static UnicodeSet getSkipNFKD() {
|
|
|
|
if (_skipNFKD == null) {
|
|
|
|
_skipNFKD = new UnicodeSet();
|
|
|
|
UnicodeSet idSet = getIdentifierSet();
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
|
|
Utility.dot(cp);
|
|
|
|
int cat = Default.ucd().getCategory(cp);
|
|
|
|
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
|
|
|
int decompType = Default.ucd().getDecompositionType(cp);
|
|
|
|
if (decompType == UCD.COMPAT_CIRCLE
|
|
|
|
|| decompType == UCD.COMPAT_SUPER
|
|
|
|
|| decompType == UCD.COMPAT_SUB
|
|
|
|
|| decompType == UCD.COMPAT_VERTICAL
|
|
|
|
|| decompType == UCD.COMPAT_SMALL
|
|
|
|
|| decompType == UCD.COMPAT_SQUARE
|
|
|
|
|| decompType == UCD.COMPAT_FRACTION) {
|
|
|
|
_skipNFKD.add(cp);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
String mapped = Default.nfkd().normalize(cp);
|
|
|
|
if (mapped.equals(UTF16.valueOf(cp))) continue;
|
|
|
|
if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp);
|
|
|
|
else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp);
|
|
|
|
if (decompType == UCD.CANONICAL) nfcMap.put(cp, Default.nfd().normalize(cp));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
nfcMap.setMissing("");
|
|
|
|
return _skipNFKD;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isMixedScript(String source) {
|
|
|
|
int lastScript = UScript.INVALID_CODE;
|
|
|
|
int cp;
|
|
|
|
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
|
|
|
|
cp = UTF16.charAt(source, i);
|
|
|
|
int script = UScript.getScript(cp);
|
|
|
|
if (script == UScript.COMMON || script == UScript.INHERITED) {
|
|
|
|
if (XIDContinueSet.contains(cp)) continue; // skip if not identifier
|
|
|
|
script = UScript.COMMON;
|
|
|
|
}
|
|
|
|
if (lastScript == UScript.INVALID_CODE) lastScript = script;
|
|
|
|
else if (script != lastScript) return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2005-05-27 21:43:46 +00:00
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private static void generateConfusables() throws IOException {
|
|
|
|
log = BagFormatter.openUTF8Writer(outdir, "log.txt");
|
|
|
|
//fixMichel(indir, outdir);
|
|
|
|
generateConfusables(indir, outdir);
|
|
|
|
log.close();
|
|
|
|
if (false) for (Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext();) {
|
|
|
|
String source = (String)it.next();
|
|
|
|
System.out.println(Default.ucd().getCodeAndName(source)
|
|
|
|
+ " => " + Default.ucd().getCodeAndName((String)gatheredNFKD.get(source)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* static class Data2 {
|
2005-05-27 21:43:46 +00:00
|
|
|
String source;
|
|
|
|
String target;
|
|
|
|
int count;
|
|
|
|
Data2(String target, int count) {
|
|
|
|
this.target = target;
|
|
|
|
this.count = count;
|
|
|
|
}
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
*/
|
|
|
|
/* static class Data implements Comparable {
|
2005-05-27 21:43:46 +00:00
|
|
|
String source;
|
|
|
|
String target;
|
|
|
|
String type;
|
|
|
|
Data(String source, String target, String type) {
|
|
|
|
this.source = source;
|
|
|
|
this.target = target;
|
|
|
|
this.type = type;
|
|
|
|
}
|
|
|
|
public int compareTo(Object o) {
|
|
|
|
int result;
|
|
|
|
Data that = (Data)o;
|
|
|
|
if (0 != (result = target.compareTo(that.target))) return result;
|
|
|
|
if (0 != (result = source.compareTo(that.source))) return result;
|
|
|
|
if (0 != (result = type.compareTo(that.type))) return result;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
*/
|
2005-05-27 21:43:46 +00:00
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static void writeSourceTargetLine(PrintWriter out, String source, String tag, String target, String reason) {
|
|
|
|
out.print(
|
|
|
|
Utility.hex(source)
|
|
|
|
+ " ;\t" + Utility.hex(target)
|
|
|
|
+ (tag == null ? "" : " ;\t" + tag)
|
|
|
|
//+ " ;\t" + (preferredID.contains(source) ? "ID" : "")
|
|
|
|
+ "\t# "
|
|
|
|
+ "( " + source + " " + ARROW + " " + target + ") "
|
|
|
|
+ Default.ucd().getName(source) + " " + ARROW + " "
|
|
|
|
+ Default.ucd().getName(target)
|
|
|
|
);
|
|
|
|
if (reason != null) out.print("\t# " + reason);
|
|
|
|
out.println();
|
|
|
|
}
|
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
// Characters matching the UnicodeSet pattern [:Cc:] (general category Cc, i.e. control characters).
static UnicodeSet controls = new UnicodeSet("[:Cc:]");
|
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
static class MyEquivalenceClass extends XEquivalenceClass {
|
|
|
|
public MyEquivalenceClass() {
|
2005-06-24 23:51:52 +00:00
|
|
|
super("NONE");
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
|
|
|
public boolean addCheck(String a, String b, String reason) {
|
|
|
|
// quick check for illegal containment, before changing object
|
|
|
|
if (checkForBad(a, b, reason) || checkForBad(b, a, reason)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
super.add(a, b, reason);
|
|
|
|
// full check for any resulting illegal containment.
|
|
|
|
// illegal if for any x, y, x is a proper superstring of y
|
|
|
|
Set equivalences = getEquivalences(a);
|
|
|
|
for (Iterator it = equivalences.iterator(); it.hasNext();) {
|
|
|
|
String x = (String)it.next();
|
|
|
|
if (!UTF16.hasMoreCodePointsThan(x,1)) continue;
|
|
|
|
for (Iterator it2 = equivalences.iterator(); it2.hasNext();) {
|
|
|
|
String y = (String)it2.next();
|
|
|
|
if (x.equals(y)) continue;
|
|
|
|
if (x.indexOf(y) >= 0) throw new RuntimeException("Illegal containment: "
|
|
|
|
+ Default.ucd().getCodeAndName(x) + " contains "
|
|
|
|
+ Default.ucd().getCodeAndName(y) + " because "
|
|
|
|
+ Default.ucd().getCodeAndName(a) + " ~ "
|
|
|
|
+ Default.ucd().getCodeAndName(b) + " because of "
|
|
|
|
+ reason);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private boolean checkForBad(String a, String b, String reason) {
|
|
|
|
Set equivalences = getEquivalences(b);
|
|
|
|
for (Iterator it = equivalences.iterator(); it.hasNext();) {
|
|
|
|
String b2 = (String)it.next();
|
|
|
|
if (a.equals(b2)) continue;
|
|
|
|
if (b2.indexOf(a) >= 0 || a.indexOf(b2) >= 0) {
|
|
|
|
log.println("Illegal containment: "
|
|
|
|
+ Default.ucd().getCodeAndName(a)
|
|
|
|
+ " overlaps "
|
|
|
|
+ Default.ucd().getCodeAndName(b2)
|
|
|
|
+ "\r\n\tfrom "
|
|
|
|
+ Default.ucd().getCodeAndName(b)
|
|
|
|
+ "\r\n\twith reason "
|
|
|
|
+ reason + " plus "
|
|
|
|
+ getReasons(b2, b));
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
public XEquivalenceClass add(Object a1, Object b1, String reason) {
|
|
|
|
String a = (String)a1;
|
|
|
|
String b = (String)b1;
|
|
|
|
try {
|
|
|
|
addCheck(a, b, reason);
|
|
|
|
return this;
|
|
|
|
} catch (RuntimeException e) {
|
|
|
|
throw (RuntimeException) new RuntimeException("Failure adding "
|
|
|
|
+ Default.ucd().getCodeAndName(a) + "; "
|
|
|
|
+ Default.ucd().getCodeAndName(b)
|
|
|
|
+ "; " + reason).initCause(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
* Only NFKD if the result doesn't cross from ID set to nonID set, and space is not added
|
|
|
|
*/
|
|
|
|
// private String specialNFKD(String item) {
|
|
|
|
// UnicodeSet skipSet = getSkipNFKD();
|
|
|
|
// StringBuffer result = new StringBuffer();
|
|
|
|
// int cp;
|
|
|
|
// for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
|
|
|
|
// cp = UTF16.charAt(item, i);
|
|
|
|
// if (skipSet.contains(cp)) {
|
|
|
|
// UTF16.append(result, cp);
|
|
|
|
// continue;
|
|
|
|
// }
|
|
|
|
// String cps = UTF16.valueOf(cp);
|
|
|
|
// String mapped = Default.nfkd().normalize(cps);
|
|
|
|
// if (cps.equals(mapped)) {
|
|
|
|
// UTF16.append(result, cp);
|
|
|
|
// continue;
|
|
|
|
// }
|
|
|
|
// result.append(mapped);
|
|
|
|
// gatheredNFKD.put(cps, mapped);
|
|
|
|
// }
|
|
|
|
// return result.toString();
|
|
|
|
// }
|
|
|
|
|
|
|
|
public void close(String reason) {
|
|
|
|
boolean addedItem;
|
|
|
|
StringBuffer reasons = new StringBuffer();
|
|
|
|
do {
|
|
|
|
addedItem = false;
|
2005-06-24 23:51:52 +00:00
|
|
|
Set cloneForSafety = getOrderedExplicitItems();
|
2005-06-21 21:28:31 +00:00
|
|
|
for (Iterator it = cloneForSafety.iterator(); it.hasNext();) {
|
|
|
|
String item = (String) it.next();
|
|
|
|
if (!UTF16.hasMoreCodePointsThan(item,1)) continue; // just for speed
|
|
|
|
reasons.setLength(0);
|
|
|
|
String mapped = mapString(item, reasons);
|
|
|
|
if (!isEquivalent(item, mapped)) {
|
|
|
|
if (addCheck(item, mapped, reasons.toString())) {
|
|
|
|
// System.out.println("Closing: " + Default.ucd().getCodeAndName(item) + " => " + Default.ucd().getCodeAndName(mapped));
|
|
|
|
addedItem = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} while (addedItem);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private String mapString(String item, StringBuffer reasons) {
|
|
|
|
if (false && item.startsWith("\u03D2")) {
|
|
|
|
System.out.println("foo");
|
|
|
|
}
|
|
|
|
StringBuffer result = new StringBuffer();
|
|
|
|
int cp;
|
|
|
|
for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
|
|
|
|
cp = UTF16.charAt(item, i);
|
|
|
|
String cps = UTF16.valueOf(cp);
|
2005-06-24 23:51:52 +00:00
|
|
|
String mapped = getParadigm(cps);
|
2005-06-21 21:28:31 +00:00
|
|
|
if (mapped.indexOf(cps) >= 0) result.append(cps);
|
|
|
|
else {
|
|
|
|
result.append(mapped);
|
|
|
|
reasons.append("[" + getReasons(cps, mapped) + "]");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result.toString();
|
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
|
|
|
|
public String getParadigm(Object item) {
|
|
|
|
return (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
public Set getOrderedExplicitItems() {
|
|
|
|
Set cloneForSafety = new TreeSet(codepointComparator);
|
|
|
|
cloneForSafety.addAll(getExplicitItems());
|
|
|
|
return cloneForSafety;
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
public void writeSource(PrintWriter out) {
|
2005-06-24 23:51:52 +00:00
|
|
|
Set items = getOrderedExplicitItems();
|
2005-06-21 21:28:31 +00:00
|
|
|
for (Iterator it = items.iterator(); it.hasNext();) {
|
|
|
|
String item = (String) it.next();
|
2005-06-24 23:51:52 +00:00
|
|
|
String paradigm = (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
|
2005-06-21 21:28:31 +00:00
|
|
|
if (item.equals(paradigm)) continue;
|
|
|
|
writeSourceTargetLine(out, item, null, paradigm, null);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
static class DataSet {
|
2005-06-21 21:28:31 +00:00
|
|
|
MyEquivalenceClass dataMixedLowercase = new MyEquivalenceClass();
|
|
|
|
MyEquivalenceClass dataMixedAnycase = new MyEquivalenceClass();
|
|
|
|
MyEquivalenceClass dataSingleLowercase = new MyEquivalenceClass();
|
|
|
|
MyEquivalenceClass dataSingleAnycase = new MyEquivalenceClass();
|
|
|
|
|
|
|
|
public DataSet add(String source, String target, String type, int lineCount, String errorLine) {
|
2005-05-27 21:43:46 +00:00
|
|
|
if (skipSet.containsAll(source) || skipSet.containsAll(target)) return this;
|
2005-06-21 21:28:31 +00:00
|
|
|
String nsource = Default.nfd().normalize(source);
|
|
|
|
String ntarget = Default.nfd().normalize(target);
|
2005-05-27 21:43:46 +00:00
|
|
|
|
|
|
|
// if it is just a compatibility match, return
|
2005-06-21 21:28:31 +00:00
|
|
|
//if (nsource.equals(ntarget)) return this;
|
|
|
|
if (type.indexOf("skip") >= 0) return this;
|
|
|
|
if (target.indexOf('\u203D') >= 0) return this;
|
2005-05-27 21:43:46 +00:00
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
type = getReasonFromFilename(type);
|
2005-05-27 21:43:46 +00:00
|
|
|
|
|
|
|
// if it is base + combining sequence => base2 + same combining sequence, do just the base
|
|
|
|
int nsourceFirst = UTF16.charAt(nsource,0);
|
|
|
|
String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst));
|
|
|
|
int ntargetFirst = UTF16.charAt(ntarget,0);
|
|
|
|
String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst));
|
2005-06-21 21:28:31 +00:00
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) {
|
|
|
|
source = UTF16.valueOf(nsourceFirst);
|
|
|
|
target = UTF16.valueOf(ntargetFirst);
|
|
|
|
type += "-base";
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
type += ":" + lineCount;
|
2005-05-27 21:43:46 +00:00
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
String combined = source + target;
|
|
|
|
boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
|
|
|
|
boolean isMixed = isMixedScript(combined);
|
|
|
|
dataMixedAnycase.add(source, target, type);
|
|
|
|
if (isLowercase) dataMixedLowercase.add(source, target, type);
|
|
|
|
if (!isMixed) dataSingleAnycase.add(source, target, type);
|
|
|
|
if (!isMixed && isLowercase) dataSingleLowercase.add(source, target, type);
|
|
|
|
return this;
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
|
|
|
|
/* *//**
|
2005-05-27 21:43:46 +00:00
|
|
|
* @param errorLine TODO
|
|
|
|
*
|
2005-06-21 21:28:31 +00:00
|
|
|
*//*
|
2005-05-27 21:43:46 +00:00
|
|
|
private DataSet add(Data newData, String errorLine) {
|
|
|
|
if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) {
|
|
|
|
System.out.println("Problem with " + errorLine);
|
|
|
|
System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
|
|
|
|
}
|
|
|
|
String[] key = {newData.source, newData.target};
|
|
|
|
Data old = (Data) dataMap.get(key);
|
|
|
|
if (old == null) {
|
|
|
|
dataSet.add(newData);
|
|
|
|
dataMap.put(key, newData);
|
|
|
|
}else {
|
|
|
|
old.type = old.type + "/" + newData.type;
|
|
|
|
}
|
|
|
|
return this;
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
*/ // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
|
2005-05-27 21:43:46 +00:00
|
|
|
static final int NORMAL = 0, FOLDING = 1, OLD = 2;
|
|
|
|
|
|
|
|
public DataSet addFile(String directory, String filename) throws IOException {
|
2005-06-21 21:28:31 +00:00
|
|
|
String line = null;
|
|
|
|
int count = 0;
|
|
|
|
try {
|
|
|
|
BufferedReader in = BagFormatter.openUTF8Reader(directory, filename);
|
|
|
|
int kind = NORMAL;
|
|
|
|
if (filename.indexOf("Folding") >= 0) kind = FOLDING;
|
|
|
|
else if (false && filename.indexOf("-old") >= 0) kind = OLD;
|
|
|
|
while (true) {
|
|
|
|
count++;
|
|
|
|
line = Utility.readDataLine(in);
|
|
|
|
if (line == null) break;
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
String[] pieces = Utility.split(line,';');
|
|
|
|
if (pieces.length < 2) {
|
|
|
|
System.out.println("Error on: " + line);
|
|
|
|
continue;
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
String type = filename;
|
|
|
|
if (kind==FOLDING) {
|
|
|
|
String source = Utility.fromHex(pieces[0].trim(),true);
|
|
|
|
String target = Utility.fromHex(pieces[1].trim(),true);
|
|
|
|
String nsource = Default.nfkd().normalize(source);
|
|
|
|
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
|
|
|
|
if (!first.equals(target)) {
|
|
|
|
add(source, target, type, count, line);
|
|
|
|
}
|
|
|
|
} else if (kind == OLD) {
|
|
|
|
String target = pieces[0].trim();
|
|
|
|
for (int i = 1; i < pieces.length; ++i) {
|
|
|
|
add(pieces[i].trim(), target, type, count, line);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
String source = Utility.fromHex(pieces[0].trim(),true);
|
|
|
|
String target = Utility.fromHex(pieces[1].trim(),true);
|
|
|
|
//if (pieces.length > 2) type = pieces[2].trim();
|
|
|
|
add(source, target, type, count, line);
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
in.close();
|
|
|
|
return this;
|
|
|
|
} catch (Exception e) {
|
|
|
|
throw (RuntimeException) new RuntimeException("Failure with file: "
|
|
|
|
+ directory + filename + " on line: " + count
|
|
|
|
+ ": " + line).initCause(e);
|
|
|
|
}
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
|
|
|
|
public void writeSource(String directory, String filename) throws IOException {
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
|
|
|
out.println("# Source File for IDN Confusables");
|
2005-06-24 23:51:52 +00:00
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
2005-06-21 21:28:31 +00:00
|
|
|
out.println("");
|
|
|
|
dataMixedAnycase.writeSource(out);
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
|
|
|
|
public void write(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
|
2005-05-27 21:43:46 +00:00
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
2005-06-21 21:28:31 +00:00
|
|
|
out.print('\uFEFF');
|
|
|
|
out.println("# Recommended confusable mapping for IDN");
|
2005-06-24 23:51:52 +00:00
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
2005-06-21 21:28:31 +00:00
|
|
|
out.println("");
|
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
if (appendFile) {
|
|
|
|
String[] replacements = {"%date%", Default.getDate()};
|
|
|
|
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
|
|
|
|
Utility.UTF8_WINDOWS, out, replacements);
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
writeData(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
|
|
|
|
writeData(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
|
|
|
|
writeData(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
|
|
|
|
writeData(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
|
2005-05-27 21:43:46 +00:00
|
|
|
out.close();
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
/**
|
|
|
|
* @param skipNFKEquivs TODO
|
|
|
|
*
|
|
|
|
*/
|
2005-06-24 23:51:52 +00:00
|
|
|
private void writeData(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
|
|
|
|
Set items = data.getOrderedExplicitItems();
|
2005-06-21 21:28:31 +00:00
|
|
|
out.println();
|
|
|
|
out.println("# " + title);
|
|
|
|
out.println();
|
|
|
|
int count = 0;
|
|
|
|
UnicodeSet preferredID = getIdentifierSet();
|
|
|
|
for (Iterator it = items.iterator(); it.hasNext();) {
|
|
|
|
String source = (String) it.next();
|
|
|
|
if (UTF16.hasMoreCodePointsThan(source,1)) continue;
|
2005-06-24 23:51:52 +00:00
|
|
|
String target = data.getParadigm(source);
|
2005-06-21 21:28:31 +00:00
|
|
|
if (source.equals(target)) continue;
|
|
|
|
if (skipNFKEquivs) {
|
|
|
|
if (!Default.nfkd().normalize(source).equals(source)) continue;
|
|
|
|
}
|
|
|
|
String reason = fixReason(data.getReasons(source, target));
|
|
|
|
writeSourceTargetLine(out, source, tag, target, reason);
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
out.println();
|
|
|
|
out.println("# total for (" + tag + "): " + count);
|
|
|
|
out.println();
|
|
|
|
}
|
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
2005-06-21 21:28:31 +00:00
|
|
|
private String fixReason(List reasons) {
|
|
|
|
List first = (List)reasons.get(0);
|
|
|
|
String result = "";
|
|
|
|
for (int i = 0; i < first.size(); ++i) {
|
|
|
|
if (i != 0) result += " ";
|
|
|
|
Object item = first.get(i);
|
|
|
|
if (item instanceof String) {
|
|
|
|
result += item;
|
|
|
|
} else {
|
|
|
|
String temp = "";
|
|
|
|
for (Iterator it = ((Set)item).iterator(); it.hasNext();) {
|
|
|
|
if (temp.length() != 0) temp += "|";
|
|
|
|
temp += it.next();
|
|
|
|
}
|
|
|
|
result += "{" + temp + "}";
|
|
|
|
}
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
return result.toString();
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
|
|
|
|
public void addAll(DataSet ds) {
|
|
|
|
dataMixedAnycase.addAll(ds.dataMixedAnycase);
|
|
|
|
dataMixedLowercase.addAll(ds.dataMixedLowercase);
|
|
|
|
dataSingleAnycase.addAll(ds.dataSingleAnycase);
|
|
|
|
dataSingleLowercase.addAll(ds.dataSingleLowercase);
|
|
|
|
}
|
|
|
|
/* *//**
|
|
|
|
*
|
|
|
|
*//*
|
2005-05-27 21:43:46 +00:00
|
|
|
public DataSet clean() {
|
|
|
|
// remove all skips
|
|
|
|
DataSet tempSet = new DataSet();
|
|
|
|
Map m = new HashMap();
|
|
|
|
for (Iterator it = dataSet.iterator(); it.hasNext();) {
|
|
|
|
Data d = (Data) it.next();
|
|
|
|
if (d.type.indexOf("skip") >= 0) continue;
|
|
|
|
String newTarget = Default.nfkd().normalize(d.target);
|
|
|
|
String newSource = Default.nfkd().normalize(d.source);
|
|
|
|
String type = d.type;
|
|
|
|
if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
|
|
|
|
type += "-nf";
|
|
|
|
log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
|
|
|
|
log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// swap order
|
|
|
|
if (preferSecondAsSource(newSource, newTarget)) {
|
|
|
|
String temp = newTarget;
|
|
|
|
newTarget = newSource;
|
|
|
|
newSource = temp;
|
|
|
|
}
|
|
|
|
|
|
|
|
Data already = (Data) m.get(newSource);
|
|
|
|
if (already != null && !newTarget.equals(already.target)) {
|
|
|
|
log.println("X " + getCodeCharName(newSource) + " " + ARROW);
|
|
|
|
log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
|
|
|
|
log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
|
|
|
|
if (preferSecondAsSource(already.target, newTarget)) {
|
|
|
|
// just fix new guy
|
|
|
|
type += "[" + newSource + "]" + already.type;
|
|
|
|
newSource = newTarget;
|
|
|
|
newTarget = already.target;
|
|
|
|
} else {
|
|
|
|
// need to fix new guy, AND fix old guy.
|
|
|
|
tempSet.remove(already);
|
|
|
|
type += "[" + newSource + "]" + already.type;
|
|
|
|
newSource = already.target;
|
|
|
|
already.type += "[" + already.target + "]" + type;
|
|
|
|
already.target = newTarget;
|
|
|
|
tempSet.add(already, "");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Data newData = new Data(newSource, newTarget, type);
|
|
|
|
m.put(newSource, newData);
|
|
|
|
tempSet.add(newData, "");
|
|
|
|
}
|
|
|
|
// now recursively apply
|
|
|
|
DataSet s = new DataSet();
|
|
|
|
for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
|
|
|
|
Data d = (Data) it.next();
|
|
|
|
int cp = 0;
|
|
|
|
StringBuffer result = new StringBuffer();
|
|
|
|
for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
|
|
|
|
cp = UTF16.charAt(d.target, i);
|
|
|
|
String src = UTF16.valueOf(cp);
|
|
|
|
while (true) {
|
|
|
|
Data rep = (Data) m.get(src);
|
|
|
|
if (rep == null) break;
|
|
|
|
src = rep.target;
|
|
|
|
}
|
|
|
|
result.append(src);
|
|
|
|
}
|
|
|
|
String newTarget = result.toString();
|
|
|
|
newTarget = Default.nfkd().normalize(newTarget);
|
|
|
|
s.add(d.source, newTarget, d.type + (newTarget.equals(newTarget) ? "" : "-rec"), "");
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
*//**
|
2005-05-27 21:43:46 +00:00
|
|
|
*
|
2005-06-21 21:28:31 +00:00
|
|
|
*//*
|
2005-05-27 21:43:46 +00:00
|
|
|
private void remove(Data already) {
|
|
|
|
String[] key = {already.source, already.target};
|
|
|
|
dataMap.remove(key);
|
|
|
|
dataSet.remove(already);
|
2005-06-21 21:28:31 +00:00
|
|
|
}*/
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
public void close(String reason) {
|
|
|
|
dataMixedAnycase.close(reason);
|
|
|
|
dataMixedLowercase.close(reason);
|
|
|
|
dataSingleAnycase.close(reason);
|
|
|
|
dataSingleLowercase.close(reason);
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) {
|
|
|
|
int count = 0;
|
|
|
|
for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.getSet(null).complement()); it.next(); ) {
|
|
|
|
add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine);
|
|
|
|
}
|
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
|
|
|
|
static class MyFilter implements XEquivalenceClass.Filter {
|
|
|
|
UnicodeSet output;
|
|
|
|
public boolean matches(Object o) {
|
|
|
|
return output.containsAll((String)o);
|
|
|
|
}
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @throws IOException
|
|
|
|
*
|
|
|
|
*/
|
2005-06-24 23:51:52 +00:00
|
|
|
public void writeSummary(String outdir, String string, boolean outputOnly) throws IOException {
|
2005-06-21 21:28:31 +00:00
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir, string);
|
|
|
|
out.print('\uFEFF');
|
|
|
|
out.println("# Summary: Recommended confusable mapping for IDN");
|
2005-06-24 23:51:52 +00:00
|
|
|
out.println("# $Revision: 1.3 $");
|
|
|
|
out.println("# $Date: 2005/06/24 23:51:52 $");
|
2005-06-21 21:28:31 +00:00
|
|
|
out.println("");
|
|
|
|
MyEquivalenceClass data = dataMixedAnycase;
|
2005-06-24 23:51:52 +00:00
|
|
|
Set items = data.getOrderedExplicitItems();
|
|
|
|
for (Iterator it = items.iterator(); it.hasNext();) {
|
|
|
|
System.out.println(Default.ucd().getCodeAndName((String)it.next()));
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
int count = 0;
|
|
|
|
UnicodeSet preferredID = getIdentifierSet();
|
|
|
|
String lastTarget = "";
|
2005-06-24 23:51:52 +00:00
|
|
|
UnicodeSet outputAllowed = IdentifierInfo.getIdentifierInfo().remainingOutputSet;
|
|
|
|
Set itemsSeen = new HashSet();
|
|
|
|
Set equivalents = new TreeSet(betterTargetIsLess);
|
2005-06-21 21:28:31 +00:00
|
|
|
for (Iterator it = items.iterator(); it.hasNext();) {
|
2005-06-24 23:51:52 +00:00
|
|
|
String target = (String) it.next();
|
|
|
|
if (itemsSeen.contains(target)) continue;
|
|
|
|
out.println();
|
|
|
|
out.println(getStatus(target) + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + Default.ucd().getName(target));
|
|
|
|
//if (UTF16.hasMoreCodePointsThan(source,1)) continue;
|
|
|
|
equivalents.clear();
|
|
|
|
equivalents.addAll(data.getEquivalences(target));
|
|
|
|
for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
|
|
|
|
String source = (String) it2.next();
|
|
|
|
if (source.equals(target)) continue;
|
|
|
|
//boolean compatEqual = Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target));
|
|
|
|
//if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue;
|
|
|
|
String reason = fixReason(data.getReasons(source, target));
|
|
|
|
//if (!outputAllowed.containsAll(source)) continue;
|
|
|
|
// if (compatEqual) {
|
|
|
|
// out.print("\u21D0");
|
|
|
|
// } else {
|
|
|
|
// out.print("\u2190");
|
|
|
|
// }
|
|
|
|
out.println("\u2190" + getStatus(source) + "\t" + "(\u200E " + source + " \u200E)\t" + Utility.hex(source) + "\t " + Default.ucd().getName(source)
|
|
|
|
+ "\t# " + reason);
|
|
|
|
count++;
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
out.println();
|
|
|
|
out.println("# total : " + count);
|
|
|
|
out.println();
|
|
|
|
|
|
|
|
out.close();
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private String getStatus(String source) {
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
int val = betterTargetIsLess.getValue(source);
|
|
|
|
if (val == MARK_NOT_NFC.intValue()) return "[x]";
|
|
|
|
if (val == MARK_NFC.intValue()) return "[x]";
|
|
|
|
if (val == MARK_INPUT_LENIENT.intValue()) return "[L]";
|
|
|
|
if (val == MARK_INPUT_STRICT.intValue()) return "[I]";
|
|
|
|
if (val == MARK_OUTPUT.intValue()) return "[O]";
|
|
|
|
return "?";
|
|
|
|
}
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
|
|
|
/**
|
|
|
|
* @throws IOException
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private static void fixMichel(String indir, String outdir) throws IOException {
|
|
|
|
BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
|
|
|
|
while (true) {
|
|
|
|
String line = Utility.readDataLine(in);
|
|
|
|
if (line == null) break;
|
|
|
|
String[] pieces = Utility.split(line,'\t');
|
|
|
|
if (pieces.length < 2) {
|
|
|
|
out.println(line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
String source = Utility.fromHex(pieces[0].trim());
|
|
|
|
if (Default.nfkd().isNormalized(source)) {
|
|
|
|
out.println(line);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
in.close();
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
2005-06-21 21:28:31 +00:00
|
|
|
|
|
|
|
private static void generateSource() throws IOException {
|
|
|
|
File dir = new File(indir);
|
|
|
|
String[] names = dir.list();
|
|
|
|
Set sources = new TreeSet(new ArrayComparator(
|
|
|
|
new Comparator[] {codepointComparator, codepointComparator}));
|
|
|
|
|
|
|
|
int[] count = new int[1];
|
|
|
|
for (int i = 0; i < names.length; ++i) {
|
|
|
|
if (new File(indir + names[i]).isDirectory()) continue;
|
|
|
|
if (!names[i].startsWith("confusables")) continue;
|
|
|
|
String reason = getReasonFromFilename(names[i]);
|
|
|
|
System.out.println(names[i]);
|
|
|
|
BufferedReader in = BagFormatter.openUTF8Reader(indir, names[i]);
|
|
|
|
String line;
|
|
|
|
count[0] = 0;
|
|
|
|
while (true) {
|
|
|
|
line = Utility.readDataLine(in, count);
|
|
|
|
if (line == null) break;
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
String[] pieces = Utility.split(line,';');
|
|
|
|
if (pieces.length < 2) {
|
|
|
|
System.out.println("Error on: " + line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
String source = Utility.fromHex(pieces[0].trim(),true);
|
|
|
|
String target = Utility.fromHex(pieces[1].trim(),true);
|
|
|
|
|
|
|
|
if (source.length() == 0 || target.length() == 0) {
|
|
|
|
throw new IllegalArgumentException("zero-length item: " + count[0] + ":\t" + line);
|
|
|
|
}
|
|
|
|
|
|
|
|
// check for identical combining sequences
|
|
|
|
String nsource = Default.nfc().normalize(source);
|
|
|
|
String ntarget = Default.nfc().normalize(target);
|
|
|
|
if (nsource.equals(ntarget)) continue;
|
|
|
|
|
|
|
|
if (true) {
|
|
|
|
int nsourceFirst = UTF16.charAt(nsource,0);
|
|
|
|
String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst));
|
|
|
|
int ntargetFirst = UTF16.charAt(ntarget,0);
|
|
|
|
String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst));
|
|
|
|
if (nsourceRest.equals(ntargetRest)) {
|
|
|
|
source = UTF16.valueOf(nsourceFirst);
|
|
|
|
target = UTF16.valueOf(ntargetFirst);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-06-24 23:51:52 +00:00
|
|
|
if (betterTargetIsLess.compare(source, target) < 0) {
|
2005-06-21 21:28:31 +00:00
|
|
|
String temp = source;
|
|
|
|
source = target;
|
|
|
|
target = temp;
|
|
|
|
}
|
|
|
|
sources.add(new String[] {source, target});
|
|
|
|
}
|
|
|
|
in.close();
|
|
|
|
}
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "confusableSource.txt");
|
|
|
|
for (Iterator it = sources.iterator(); it.hasNext();) {
|
|
|
|
String[] sourceItem = (String[]) it.next();
|
|
|
|
writeSourceTargetLine(out, sourceItem[0], null, sourceItem[1], null);
|
|
|
|
}
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
private static void generateConfusables(String indir, String outdir) throws IOException {
|
2005-06-24 23:51:52 +00:00
|
|
|
betterTargetIsLess.compare("\u0020", "\u2004");
|
2005-05-27 21:43:46 +00:00
|
|
|
File dir = new File(indir);
|
|
|
|
String[] names = dir.list();
|
|
|
|
DataSet total = new DataSet();
|
|
|
|
for (int i = 0; i < names.length; ++i) {
|
|
|
|
if (new File(indir + names[i]).isDirectory()) continue;
|
2005-06-21 21:28:31 +00:00
|
|
|
if (!names[i].startsWith("confusables")) continue;
|
2005-05-27 21:43:46 +00:00
|
|
|
System.out.println(names[i]);
|
|
|
|
DataSet ds = new DataSet();
|
|
|
|
ds.addFile(indir, names[i]);
|
2005-06-21 21:28:31 +00:00
|
|
|
ds.writeSource(outdir, "new-" + names[i]);
|
|
|
|
ds.close("*");
|
|
|
|
total.addAll(ds);
|
|
|
|
total.close("t*" + names[i]);
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
DataSet ds = new DataSet();
|
|
|
|
ds.addUnicodeMap(nfcMap, "nfc", "nfc");
|
|
|
|
ds.close("*");
|
|
|
|
ds.write(outdir, "new-decomp.txt", false, false);
|
|
|
|
total.addAll(ds);
|
|
|
|
total.close("*");
|
2005-06-24 23:51:52 +00:00
|
|
|
total.writeSummary(outdir, "confusablesSummary.txt", false);
|
|
|
|
total.writeSummary(outdir, "confusablesSummaryOutput.txt", true);
|
2005-06-21 21:28:31 +00:00
|
|
|
total.write(outdir, "confusables.txt", false, false);
|
|
|
|
//DataSet clean = total.clean();
|
|
|
|
//clean.write(outdir, "confusables.txt", true);
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
|
|
|
|
Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(),
|
|
|
|
new UTF16.StringComparator()}));
|
|
|
|
while (true) {
|
|
|
|
String line = Utility.readDataLine(in);
|
|
|
|
if (line == null) break;
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
String[] pieces = Utility.split(line,';');
|
|
|
|
if (pieces.length < 2) {
|
|
|
|
System.out.println("Error on: " + line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
String source = Utility.fromHex(pieces[0].trim());
|
|
|
|
String target = Utility.fromHex(pieces[1].trim());
|
|
|
|
String nsource = Default.nfkd().normalize(source);
|
|
|
|
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
|
|
|
|
if (!first.equals(target)) {
|
|
|
|
set.add(new String[]{source, target});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
in.close();
|
|
|
|
|
|
|
|
}
|
|
|
|
public static void gen() throws IOException {
|
|
|
|
Map m = new TreeMap();
|
|
|
|
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
|
|
|
|
while (true) {
|
|
|
|
String line = in.readLine();
|
|
|
|
if (line == null) break;
|
|
|
|
String[] pieces = Utility.split(line,';');
|
|
|
|
if (pieces.length < 3) {
|
|
|
|
System.out.println("Error on: " + line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
int codepoint = Integer.parseInt(pieces[1], 16);
|
|
|
|
int cat = Default.ucd().getCategory(codepoint);
|
|
|
|
if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
|
|
|
|
if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
|
|
|
|
String result = Utility.fromHex(pieces[0]);
|
|
|
|
if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
|
|
|
|
int count = Integer.parseInt(pieces[2]);
|
|
|
|
String source = UTF16.valueOf(codepoint);
|
|
|
|
add(m, source, result, count);
|
|
|
|
}
|
|
|
|
in.close();
|
|
|
|
|
|
|
|
in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
|
|
|
|
while (true) {
|
|
|
|
String line = in.readLine();
|
|
|
|
if (line == null) break;
|
|
|
|
line = line.trim();
|
|
|
|
int pos = line.indexOf("#");
|
|
|
|
if (pos >= 0) line = line.substring(0,pos).trim();
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
if (line.startsWith("@")) continue;
|
|
|
|
String[] pieces = Utility.split(line,';');
|
|
|
|
if (pieces.length < 2) {
|
|
|
|
System.out.println("Error on: " + line);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
String source = pieces[0].trim();
|
|
|
|
for (int i = 1; i < pieces.length; ++i) {
|
|
|
|
add(m, source, pieces[i].trim(), -1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
in.close();
|
|
|
|
|
|
|
|
boolean gotOne;
|
|
|
|
// close the set
|
|
|
|
do {
|
|
|
|
gotOne = false;
|
|
|
|
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
|
|
|
|
String source = (String) it.next();
|
|
|
|
Data2 data = (Data2) m.get(source);
|
|
|
|
Data2 data2 = (Data2) m.get(data.target);
|
|
|
|
if (data2 == null) continue;
|
|
|
|
data.target = data2.target;
|
|
|
|
gotOne = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} while (gotOne);
|
|
|
|
// put into different sorting order
|
|
|
|
Set s = new TreeSet();
|
|
|
|
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
|
|
|
|
String source = (String) it.next();
|
|
|
|
Data2 data = (Data2) m.get(source);
|
|
|
|
s.add(new Data(source, data.target, data.count));
|
|
|
|
}
|
|
|
|
// write it out
|
|
|
|
PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
|
|
|
|
String[] replacements = {"%date%", Default.getDate()};
|
|
|
|
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
|
|
|
|
Utility.UTF8_WINDOWS, out, replacements);
|
|
|
|
for (Iterator it = s.iterator(); it.hasNext();) {
|
|
|
|
Data d = (Data) it.next();
|
|
|
|
if (d == null) continue;
|
|
|
|
out.println(formatLine(d.source, d.target, d.count));
|
|
|
|
}
|
|
|
|
|
|
|
|
out.close();
|
|
|
|
System.out.println("Done");
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
private static String formatLine(String source, String target, int count) {
|
|
|
|
return Utility.hex(source) + " ; " + Utility.hex(target," ")
|
|
|
|
+ " ; " + count
|
|
|
|
+ " # "
|
|
|
|
+ "(" + source + " " + ARROW + " " + target + ") "
|
|
|
|
+ Default.ucd().getName(source)
|
|
|
|
+ " " + ARROW + " " + Default.ucd().getName(target);
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*/
|
2005-06-21 21:28:31 +00:00
|
|
|
/* private static void add(Map m, String source, String target, int count) {
|
2005-05-27 21:43:46 +00:00
|
|
|
if (source.length() == 0 || target.length() == 0) return;
|
|
|
|
if (preferSecondAsSource(source, target)) {
|
|
|
|
String temp = target;
|
|
|
|
target = source;
|
|
|
|
source = temp;
|
|
|
|
}
|
|
|
|
Data2 other = (Data2) m.get(source);
|
|
|
|
if (other != null) {
|
|
|
|
if (target.equals(other.target)) return;
|
|
|
|
System.out.println("conflict");
|
|
|
|
System.out.println(formatLine(source, target, count));
|
|
|
|
System.out.println(formatLine(source, other.target, other.count));
|
|
|
|
// skip adding this, and instead add result -> other.target
|
|
|
|
add(m, target, other.target, count);
|
|
|
|
} else {
|
|
|
|
m.put(source, new Data2(target, count));
|
|
|
|
}
|
|
|
|
};
|
2005-06-21 21:28:31 +00:00
|
|
|
*/
|
2005-06-24 23:51:52 +00:00
|
|
|
|
|
|
|
static Integer
|
|
|
|
MARK_NOT_NFC = new Integer(50),
|
|
|
|
MARK_NFC = new Integer(40),
|
|
|
|
MARK_INPUT_LENIENT = new Integer(30),
|
|
|
|
MARK_INPUT_STRICT = new Integer(20),
|
|
|
|
MARK_OUTPUT = new Integer(10);
|
|
|
|
|
|
|
|
// Shared instance of the _BetterTargetIsLess comparator.
static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();
|
|
|
|
static class _BetterTargetIsLess implements Comparator {
|
|
|
|
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
|
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
public int compare(Object o1, Object o2) {
|
|
|
|
String a = (String)o1;
|
|
|
|
String b = (String)o2;
|
|
|
|
int ca = UTF16.countCodePoint(a);
|
|
|
|
int cb = UTF16.countCodePoint(b);
|
2005-06-24 23:51:52 +00:00
|
|
|
if (ca != cb) return ca > cb ? -1 : 1;
|
|
|
|
int aok = getValue(a);
|
|
|
|
int bok = getValue(b);
|
|
|
|
if (aok != bok) return aok < bok ? -1 : 1;
|
|
|
|
return codepointComparator.compare(a, b);
|
2005-06-21 21:28:31 +00:00
|
|
|
}
|
2005-06-24 23:51:52 +00:00
|
|
|
static final int BAD = 1000;
|
2005-06-21 21:28:31 +00:00
|
|
|
|
2005-06-24 23:51:52 +00:00
|
|
|
private int getValue(String a) { // lower is better
|
|
|
|
int cp;
|
|
|
|
int lastValue = 0;
|
|
|
|
for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {
|
|
|
|
cp = UTF16.charAt(a, i);
|
|
|
|
Object objValue = info.lowerIsBetter.getValue(cp);
|
|
|
|
int value = ((Integer) objValue).intValue();
|
|
|
|
if (value > lastValue) lastValue = value;
|
|
|
|
}
|
|
|
|
return lastValue;
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
};
|
2005-05-27 21:43:46 +00:00
|
|
|
|
2005-06-21 21:28:31 +00:00
|
|
|
/* static private boolean preferSecondAsSource(String a, String b) {
|
2005-05-27 21:43:46 +00:00
|
|
|
// if first is longer, prefer second
|
|
|
|
int ca = UTF16.countCodePoint(a);
|
|
|
|
int cb = UTF16.countCodePoint(b);
|
|
|
|
if (ca != cb) {
|
|
|
|
return ca > cb;
|
|
|
|
}
|
|
|
|
// if first is lower, prefer second
|
|
|
|
return a.compareTo(b) < 0;
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
*/
|
2005-05-27 21:43:46 +00:00
|
|
|
static String getCodeCharName(String a) {
|
|
|
|
return Default.ucd().getCode(a) + "( " + a + " ) " + Default.ucd().getName(a);
|
|
|
|
}
|
2005-06-21 21:28:31 +00:00
|
|
|
/**
|
|
|
|
* Returns the part between - and .
|
|
|
|
*/
|
|
|
|
public static String getReasonFromFilename(String type) {
|
|
|
|
int period = type.lastIndexOf('.');
|
|
|
|
if (period < 0) period = type.length();
|
|
|
|
int dash = type.lastIndexOf('-', period);
|
|
|
|
return type.substring(dash+1,period);
|
|
|
|
}
|
|
|
|
|
2005-05-27 21:43:46 +00:00
|
|
|
}
|