scuffed-code/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java

1645 lines
54 KiB
Java
Raw Normal View History

2005-05-27 21:43:46 +00:00
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
* $Date: 2005/06/24 23:51:52 $
* $Revision: 1.3 $
2005-05-27 21:43:46 +00:00
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
2005-05-27 21:43:46 +00:00
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
2005-05-27 21:43:46 +00:00
import java.util.Iterator;
import java.util.List;
2005-05-27 21:43:46 +00:00
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.transaction.xa.Xid;
import com.ibm.icu.dev.demo.translit.InfoDialog;
2005-05-27 21:43:46 +00:00
import com.ibm.icu.dev.test.util.ArrayComparator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.CollectionUtilities;
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
import com.ibm.icu.dev.test.util.UnicodeLabel;
2005-05-27 21:43:46 +00:00
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
2005-05-27 21:43:46 +00:00
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.lang.UScript;
2005-05-27 21:43:46 +00:00
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
2005-05-27 21:43:46 +00:00
import com.ibm.text.utility.Utility;
import com.ibm.text.utility.XEquivalenceClass;
import com.sun.corba.se.connection.GetEndPointInfoAgainException;
2005-05-27 21:43:46 +00:00
public class GenerateConfusables {
public static boolean EXCLUDE_CONFUSABLE_COMPAT = true;
public static void main(String[] args) throws IOException {
Set arg2 = new HashSet(Arrays.asList(args));
try {
if (arg2.contains("-b")) generateIDN();
if (arg2.contains("-c")) generateConfusables();
if (arg2.contains("-d")) generateDecompFile();
if (arg2.contains("-s")) generateSource();
} catch (Exception e) {
e.printStackTrace();
} finally {
System.out.println("Done");
}
}
2005-05-27 21:43:46 +00:00
static PrintWriter log;
static final String ARROW = "\u2192";
static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn")
.addAll(ups.getSet("gc=Co"))
.addAll(ups.getSet("gc=Cs"));
static UnicodeSet skipSet = ups.getSet("gc=Cc")
.addAll(ups.getSet("gc=Cf"))
.addAll(UNASSIGNED);
static UnicodeSet whiteSpace = ups.getSet("Whitespace=TRUE");
static UnicodeSet lowercase = ups.getSet("gc=Ll");
static UnicodeSet _skipNFKD;
static Map gatheredNFKD = new TreeMap();
static UnicodeMap nfcMap = new UnicodeMap();
static String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\source\\";
static String outdir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
static Comparator codepointComparator = new UTF16.StringComparator();
static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
"\\u3400-\\u4DB5" +
"\\u4E00-\\u9FA5" +
"\\uA000-\\uA48C" +
"\\uAC00-\\uD7A3" +
"\\u1100-\\u11FF" +
"\\uFB00-\\uFEFC" +
"\\u2460-\\u24FF" +
"\\u3251-\\u33FF" +
"\\u4DC0-\\u4DFF" +
"\\u3165-\\u318E" +
"\\uA490-\\uA4C6" +
"\\U00010140-\\U00010174" +
"\\U0001D300-\\U0001D356" +
"\\U0001D000-\\U0001D1DD" +
"\\U00020000-\\U0002A6D6" +
"\\U0001D400-\\U0001D7FF" +
"[:script=Canadian_Aboriginal:]" +
"[:script=ETHIOPIC:]" +
"[:script=Tagalog:]" +
"[:script=Hanunoo:]" +
"[:script=Buhid:]" +
"[:script=Tagbanwa:]" +
"[:script=Deseret:]" +
"[:script=Shavian:]" +
"[:script=Ogham:]" +
"[:script=Old Italic:]" +
"[:script=Runic:]" +
"[:script=Gothic:]" +
"[:script=Ugaritic:]" +
"[:script=Linear B:]" +
"[:script=Cypriot:]" +
"[:script=Coptic:]" +
"[:script=Syriac:]" +
"[:script=Glagolitic:]" +
"[:script=Glagolitic:]" +
"[:script=Old Persian:]" +
"[:script=Kharoshthi:]" +
"[:script=Osmanya:]" +
"[:default ignorable code point:]" +
"]");
/**
* @throws IOException
*
*/
private static void generateIDN() throws IOException {
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
info.printIDNStuff();
}
private static class IdentifierInfo {
static private IdentifierInfo info;
static IdentifierInfo getIdentifierInfo() {
try {
if (info == null) info = new IdentifierInfo();
return info;
} catch (Exception e) {
throw (RuntimeException) new IllegalArgumentException("Unable to access data").initCause(e);
}
}
private boolean mergeRanges = true;
private UnicodeSet removalSet, remainingOutputSet, inputSet_strict, inputSet_lenient, nonstarting;
UnicodeSet propNFKCSet, notInXID, xidPlus;
private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(),
reviews, removals2, lowerIsBetter;
private IdentifierInfo() throws IOException {
propNFKCSet = ups.getSet("NFKC_QuickCheck=N")
.complement();
UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE");
loadFileData();
xidPlus = new UnicodeSet(propXIDContinueSet).addAll(
additions.getSet(null).complement()).retainAll(propNFKCSet);
getIdentifierSet();
notInXID = new UnicodeSet(IDNOutputSet)
.removeAll(xidPlus);
removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
removalSet = removals.getSet(null).complement();
remainingOutputSet = new UnicodeSet(IDNOutputSet)
.removeAll(removalSet);
UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet)
.removeAll(removalSet).removeAll(remainingOutputSet);
UnicodeSet remainingInputSet = new UnicodeSet();
UnicodeSet specialRemove = new UnicodeSet();
// remove any others that don't normalize/case fold to something in
// the output set
for (UnicodeSetIterator usi = new UnicodeSetIterator(
remainingInputSet1); usi.next();) {
String nss = Default.nfkc().normalize(usi.getString());
String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD);
String cf2 = Default.nfkc().normalize(cf);
if (remainingOutputSet.containsAll(cf2))
remainingInputSet.add(usi.codepoint);
else
specialRemove.add(usi.codepoint);
}
// filter out the items that are case foldings of items in output
inputSet_strict = new UnicodeSet();
for (UnicodeSetIterator usi = new UnicodeSetIterator(
remainingInputSet); usi.next();) {
String ss = usi.getString();
String nss = Default.nfkc().normalize(ss);
String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD);
if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) {
System.out.println("check");
}
//> > 2126 ; retained-input-only-CF # (?) OHM SIGN
//> > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN
if (!remainingOutputSet.containsAll(nss)
&& remainingOutputSet.containsAll(cf))
inputSet_strict.add(ss);
}
// hack
inputSet_strict.remove(0x03F4).remove(0x2126).remove(0x212B);
inputSet_lenient = new UnicodeSet(remainingInputSet)
.removeAll(inputSet_strict);
nonstarting = new UnicodeSet(remainingOutputSet).addAll(
remainingInputSet).retainAll(new UnicodeSet("[:M:]"));
reviews = new UnicodeMap().putAll(removals);
reviews.putAll(remainingOutputSet, "output");
reviews.putAll(inputSet_strict, "input");
reviews.putAll(inputSet_lenient, "input-lenient");
reviews.putAll(specialRemove, PROHIBITED + "output-disallowed");
lowerIsBetter = new UnicodeMap();
lowerIsBetter.putAll(propNFKCSet, MARK_NFC); // nfkc is better than the alternative
lowerIsBetter.putAll(inputSet_lenient, MARK_INPUT_LENIENT);
lowerIsBetter.putAll(inputSet_strict, MARK_INPUT_STRICT);
lowerIsBetter.putAll(remainingOutputSet, MARK_OUTPUT);
lowerIsBetter.setMissing(MARK_NOT_NFC);
lowerIsBetter.lock();
// add special values:
//lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0));
UnicodeMap nonstartingmap = new UnicodeMap().putAll(nonstarting,
"nonstarting");
UnicodeMap.Composer composer = new UnicodeMap.Composer() {
public Object compose(int codePoint, Object a, Object b) {
if (a == null)
return b;
else if (b == null)
return a;
else
return a.toString() + "-" + b.toString();
}
};
reviews.composeWith(nonstartingmap, composer);
reviews.putAll(new UnicodeSet(IDNInputSet).complement(), "");
UnicodeMap.Composer composer2 = new UnicodeMap.Composer() {
public Object compose(int codePoint, Object a, Object b) {
if (b == null)
return a;
return "remap-to-" + Utility.hex(b.toString());
}
};
reviews.composeWith(remap, composer2);
removals2 = new UnicodeMap().putAll(removals);
removals2.putAll(ups.getSet("XID_Continue=TRUE").complement(),
PROHIBITED + NOT_IN_XID);
removals2.setMissing("future?");
additions.lock();
remap.lock();
removals.lock();
reviews.lock();
removals2.lock();
}
/**
*
*/
private void loadFileData() throws IOException {
// get the word chars
BufferedReader br = BagFormatter.openUTF8Reader(indir,
"wordchars.txt");
String line = null;
try {
while (true) {
line = Utility.readDataLine(br);
if (line == null)
break;
if (line.length() == 0)
continue;
String[] pieces = Utility.split(line, ';');
int code = Integer.parseInt(pieces[0].trim(), 16);
if (pieces[1].trim().equals("remap-to")) {
remap.put(code, UTF16.valueOf(Integer.parseInt(
pieces[2].trim(), 16)));
} else {
if (XIDContinueSet.contains(code)) {
System.out.println("Already in XID continue: "
+ line);
continue;
}
additions.put(code, "addition");
}
}
} catch (Exception e) {
throw (RuntimeException) new RuntimeException(
"Failure on line " + line).initCause(e);
}
br.close();
// get all the removals.
br = BagFormatter.openUTF8Reader(indir, "removals.txt");
UnicodeSet allocated = ups.getSet("generalcategory=cn").complement();
UnicodeSet sources = new UnicodeSet();
line = null;
try {
while (true) {
line = Utility.readDataLine(br);
if (line == null)
break;
if (line.length() == 0)
continue;
sources.clear();
String[] pieces = Utility.split(line, ';');
if (pieces.length < 2) {
System.out.println("Missing line " + line);
continue;
}
String codelist = pieces[0].trim();
String reasons = pieces[1].trim();
if (pieces[0].startsWith("[")) {
sources = new UnicodeSet(codelist).retainAll(allocated);
} else {
String[] codes = Utility.split(codelist, ' ');
for (int i = 0; i < codes.length; ++i) {
if (codes[i].length() == 0)
continue;
String[] range = codes[i].split("\\.\\.");
int start = Integer.parseInt(range[0], 16);
int end = start;
if (range.length > 1)
end = Integer.parseInt(range[1], 16);
sources.add(start, end);
}
}
removals.putAll(sources, PROHIBITED + reasons);
}
} catch (Exception e) {
throw (RuntimeException) new RuntimeException(
"Failure on line " + line).initCause(e);
}
br.close();
}
void printIDNStuff() throws IOException {
PrintWriter out;
printIDModifications();
writeIDChars();
writeIDReview();
generateDecompFile();
}
/**
*
*/
private void writeIDReview() throws IOException {
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
bf.setLabelSource(null);
bf.setShowLiteral(bf.toHTMLControl);
bf.setMergeRanges(true);
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
//reviews.putAll(UNASSIGNED, "");
out.print("\uFEFF");
out.println("# Review List for IDN");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("");
UnicodeSet fullSet = reviews.getSet("").complement();
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
}).set(reviews).setMain("Reviews", "GCB",
UnicodeProperty.ENUMERATED, "1.0"));
//bf.setMergeRanges(false);
FakeBreak fakeBreak = new FakeBreak();
bf.setRangeBreakSource(fakeBreak);
out.println("");
out.println("# Characters allowed in IDNA");
out.println("");
bf.showSetNames(out, new UnicodeSet(fullSet)); // .removeAll(bigSets)
//bf.setMergeRanges(true);
// out.println("");
// out.println("# Large Ranges");
// out.println("");
// bf.showSetNames(out, new UnicodeSet(fullSet).retainAll(bigSets));
out.println("");
out.println("# Characters disallowed in IDNA");
out
.println("# The IDNA spec doesn't allow any of these characters,");
out
.println("# so don't report any of them as being missing from the above list.");
out
.println("# Some possible future additions, once IDNA updates to Unicode 4.1, are given.");
out.println("");
//bf.setRangeBreakSource(UnicodeLabel.NULL);
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
}).set(removals2).setMain("Removals", "GCB",
UnicodeProperty.ENUMERATED, "1.0"));
//bf.setValueSource(UnicodeLabel.NULL);
bf.showSetNames(out, new UnicodeSet(IDNInputSet).complement()
.removeAll(UNASSIGNED));
out.close();
}
/**
*
*/
private void writeIDChars() throws IOException {
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
bf.setLabelSource(null);
bf.setShowLiteral(bf.toHTMLControl);
bf.setMergeRanges(true);
UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
out.println("# Recommended Identifier Profiles for IDN");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("");
out.println("# Output Characters");
out.println("");
bf.setValueSource("output");
bf.showSetNames(out, remainingOutputSet);
showExtras(bf, remainingOutputSet, letters);
out.println("");
out.println("");
out.println("# Input Characters");
out.println("");
bf.setValueSource("input");
bf.showSetNames(out, inputSet_strict);
showExtras(bf, inputSet_strict, letters);
out.println("");
out.println("# Input Characters (lenient)");
out.println("");
bf.setValueSource("input-lenient");
bf.showSetNames(out, inputSet_lenient);
showExtras(bf, inputSet_lenient, letters);
out.println("");
out
.println("# Not allowed at start of identifier");
out.println("");
bf.setValueSource("nonstarting");
bf.showSetNames(out, nonstarting);
out.println("");
showRemapped(out,
"Characters remapped on input (in GUIs)", remap);
out.close();
}
/**
*
*/
private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) {
UnicodeSet extra = new UnicodeSet(source).removeAll(letters);
if (extra.size() != 0) {
UnicodeSet fixed = new UnicodeSet();
for (UnicodeSetIterator it = new UnicodeSetIterator(extra); it.next();) {
if (!letters.containsAll(Default.nfkd().normalize(it.getString()))) {
fixed.add(it.codepoint);
}
}
System.out.println(bf.showSetNames(fixed));
}
}
/**
*
*/
private void printIDModifications() throws IOException {
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
bf.setLabelSource(null);
bf.setShowLiteral(bf.toHTMLControl);
bf.setMergeRanges(true);
PrintWriter out = BagFormatter.openUTF8Writer(outdir,
"xidmodifications.txt");
out.println("# Security Profile for General Identifiers");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("");
out.println("# Characters restricted");
out.println("");
/*
* for (Iterator it = values.iterator(); it.hasNext();) { String
* reason1 = (String)it.next(); bf.setValueSource(reason1);
* out.println(""); bf.showSetNames(out, removals.getSet(reason1)); }
*/
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
}).set(removals).setMain("Removals", "GCB",
UnicodeProperty.ENUMERATED, "1.0"));
bf.showSetNames(out, removalSet);
out.println("");
out.println("# Characters added");
out.println("");
bf.setValueSource("addition");
bf.showSetNames(out, additions.getSet(null).complement());
showRemapped(out, "Characters remapped on input", remap);
out.close();
UnicodeMap someRemovals = new UnicodeMap();
UnicodeMap.Composer myComposer = new UnicodeMap.Composer() {
public Object compose(int codePoint, Object a, Object b) {
if (b == null) return null;
String x = (String)b;
if (!IDNOutputSet.contains(codePoint)) {
return "~IDNA";
}
if (!xidPlus.contains(codePoint)) {
return "~Unicode Identifier";
}
if (x.startsWith(PROHIBITED)) x = x.substring(PROHIBITED.length());
//if (!propNFKCSet.contains(codePoint)) x += "*";
if (lowercase.contains(codePoint)) {
String upper = Default.ucd().getCase(codePoint, UCD.FULL, UCD.UPPER);
if (upper.equals(UTF16.valueOf(codePoint))
&& x.equals("technical symbol (phonetic)")) x = "technical symbol (phonetic with no uppercase)";
}
return x;
}
};
someRemovals.composeWith(removals, myComposer);
//someRemovals = removals;
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
out.println("# Characters restricted in domain names");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("#");
out.println("# This file contains a draft list of characters for use in");
out.println("# UTR #36: Unicode Security Considerations");
out.println("# http://unicode.org/draft/reports/tr36/tr36.html");
out.println("# According to the recommendations in that document, these characters");
out.println("# would be restricted in domain names: people would only be able to use them");
out.println("# by using lenient security settings.");
out.println("#");
out.println("# If you have any feedback on this list, please use the submission form at:");
out.println("# http://unicode.org/reporting.html.");
out.println("#");
out.println("# Notes:");
out.println("# - Characters are listed along with a reason for their removal.");
out.println("# - Characters listed as ~IDNA are excluded at this point in domain names,");
out.println("# in many cases because the international domain name specification does not contain");
out.println("# characters beyond Unicode 3.2. At this point in time, feedback on those characters");
out.println("# is not relevant.");
out.println("# - Characters listed as ~Unicode Identifiers are restricted because they");
out.println("# do not fit the specification of identifiers given in");
out.println("# UAX #31: Identifier and Pattern Syntax");
out.println("# http://unicode.org/reports/tr31/");
out.println("# - The files in this directory are 'live', and may change at any time.");
out.println("# Please include the above Revision number in your feedback.");
bf.setRangeBreakSource(new FakeBreak2());
if (true) {
Set values = new TreeSet(someRemovals.getAvailableValues());
for (Iterator it = values.iterator(); it.hasNext();) {
String reason1 = (String) it.next();
bf.setValueSource(reason1);
out.println("");
bf.showSetNames(out, someRemovals.getSet(reason1));
}
} else {
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
}).set(someRemovals).setMain("Removals", "GCB",
UnicodeProperty.ENUMERATED, "1.0"));
bf.showSetNames(out, someRemovals.getSet(null).complement());
}
out.close();
}
}
static final String PROHIBITED = "restricted ; ";
static final String NOT_IN_XID = "not in XID+";
/**
*
*/
/**
*
*/
private static void generateDecompFile() throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "decomps.txt");
UnicodeProperty dt = ups.getProperty("Decomposition_Type");
for (Iterator it = dt.getAvailableValues().iterator(); it.hasNext();) {
String value = (String) it.next();
if (value.equalsIgnoreCase("none") || value.equalsIgnoreCase("canonical")) continue;
UnicodeSet s = dt.getSet(value);
out.println("");
out.println("# Decomposition_Type = " + value);
out.println("");
for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) {
String source = usi.getString();
String target = Default.nfkc().normalize(source);
writeSourceTargetLine(out, source, null, target, value);
}
//bf.showSetNames(out, s);
out.flush();
}
out.close();
}
static class FakeBreak extends UnicodeLabel {
UnicodeSet nobreakSet = setsToAbbreviate;
public String getValue(int codepoint, boolean isShort) {
return nobreakSet.contains(codepoint) ? ""
: (codepoint & 1) == 0 ? "O"
: "E";
}
}
static class FakeBreak2 extends UnicodeLabel {
UnicodeSet nobreakSet = new UnicodeSet(setsToAbbreviate)
.addAll(new UnicodeSet(IDNOutputSet).complement())
.addAll(new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus).complement());
public String getValue(int codepoint, boolean isShort) {
return nobreakSet.contains(codepoint) ? ""
: (codepoint & 1) == 0 ? "O"
: "E";
}
}
/**
*
*/
private static void showRemapped(PrintWriter out, String title, UnicodeMap remap) {
out.println("");
out.println("# " + title);
out.println("");
int count = 0;
for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.getSet(null).complement()); usi.next();) {
writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null);
count++;
}
out.println("");
out.println("# Total code points: " + count);
}
/**
*
*/
static UnicodeSet XIDContinueSet = new UnicodeSet("[:XID_Continue:]");
private static UnicodeSet IDNOutputSet, IDNInputSet, _preferredIDSet;
static UnicodeSet getIdentifierSet() {
if (_preferredIDSet == null) {
IDNOutputSet = new UnicodeSet();
IDNInputSet = new UnicodeSet();
IDNOutputSet.add('-'); // HACK
IDNInputSet.add('-');
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
// get IDNA
int idnaType = GenerateStringPrep.getIDNAType(cp);
if (idnaType == GenerateStringPrep.OK) IDNOutputSet.add(cp);
if (idnaType != GenerateStringPrep.ILLEGAL) IDNInputSet.add(cp);
}
_preferredIDSet = new UnicodeSet(IDNOutputSet).addAll(XIDContinueSet);
}
_preferredIDSet.add(0x2018).add(0x2019);
return _preferredIDSet;
}
private static UnicodeSet getSkipNFKD() {
if (_skipNFKD == null) {
_skipNFKD = new UnicodeSet();
UnicodeSet idSet = getIdentifierSet();
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
int decompType = Default.ucd().getDecompositionType(cp);
if (decompType == UCD.COMPAT_CIRCLE
|| decompType == UCD.COMPAT_SUPER
|| decompType == UCD.COMPAT_SUB
|| decompType == UCD.COMPAT_VERTICAL
|| decompType == UCD.COMPAT_SMALL
|| decompType == UCD.COMPAT_SQUARE
|| decompType == UCD.COMPAT_FRACTION) {
_skipNFKD.add(cp);
continue;
}
String mapped = Default.nfkd().normalize(cp);
if (mapped.equals(UTF16.valueOf(cp))) continue;
if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp);
else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp);
if (decompType == UCD.CANONICAL) nfcMap.put(cp, Default.nfd().normalize(cp));
}
}
nfcMap.setMissing("");
return _skipNFKD;
}
private static boolean isMixedScript(String source) {
int lastScript = UScript.INVALID_CODE;
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
int script = UScript.getScript(cp);
if (script == UScript.COMMON || script == UScript.INHERITED) {
if (XIDContinueSet.contains(cp)) continue; // skip if not identifier
script = UScript.COMMON;
}
if (lastScript == UScript.INVALID_CODE) lastScript = script;
else if (script != lastScript) return true;
}
return false;
}
2005-05-27 21:43:46 +00:00
/**
*
*/
private static void generateConfusables() throws IOException {
log = BagFormatter.openUTF8Writer(outdir, "log.txt");
//fixMichel(indir, outdir);
generateConfusables(indir, outdir);
log.close();
if (false) for (Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext();) {
String source = (String)it.next();
System.out.println(Default.ucd().getCodeAndName(source)
+ " => " + Default.ucd().getCodeAndName((String)gatheredNFKD.get(source)));
}
}
/* static class Data2 {
2005-05-27 21:43:46 +00:00
String source;
String target;
int count;
Data2(String target, int count) {
this.target = target;
this.count = count;
}
}
*/
/* static class Data implements Comparable {
2005-05-27 21:43:46 +00:00
String source;
String target;
String type;
Data(String source, String target, String type) {
this.source = source;
this.target = target;
this.type = type;
}
public int compareTo(Object o) {
int result;
Data that = (Data)o;
if (0 != (result = target.compareTo(that.target))) return result;
if (0 != (result = source.compareTo(that.source))) return result;
if (0 != (result = type.compareTo(that.type))) return result;
return 0;
}
}
*/
2005-05-27 21:43:46 +00:00
/**
*
*/
static void writeSourceTargetLine(PrintWriter out, String source, String tag, String target, String reason) {
out.print(
Utility.hex(source)
+ " ;\t" + Utility.hex(target)
+ (tag == null ? "" : " ;\t" + tag)
//+ " ;\t" + (preferredID.contains(source) ? "ID" : "")
+ "\t# "
+ "( " + source + " " + ARROW + " " + target + ") "
+ Default.ucd().getName(source) + " " + ARROW + " "
+ Default.ucd().getName(target)
);
if (reason != null) out.print("\t# " + reason);
out.println();
}
2005-05-27 21:43:46 +00:00
static UnicodeSet controls = new UnicodeSet("[:Cc:]");
static class MyEquivalenceClass extends XEquivalenceClass {
public MyEquivalenceClass() {
super("NONE");
}
public boolean addCheck(String a, String b, String reason) {
// quick check for illegal containment, before changing object
if (checkForBad(a, b, reason) || checkForBad(b, a, reason)) {
return false;
}
super.add(a, b, reason);
// full check for any resulting illegal containment.
// illegal if for any x, y, x is a proper superstring of y
Set equivalences = getEquivalences(a);
for (Iterator it = equivalences.iterator(); it.hasNext();) {
String x = (String)it.next();
if (!UTF16.hasMoreCodePointsThan(x,1)) continue;
for (Iterator it2 = equivalences.iterator(); it2.hasNext();) {
String y = (String)it2.next();
if (x.equals(y)) continue;
if (x.indexOf(y) >= 0) throw new RuntimeException("Illegal containment: "
+ Default.ucd().getCodeAndName(x) + " contains "
+ Default.ucd().getCodeAndName(y) + " because "
+ Default.ucd().getCodeAndName(a) + " ~ "
+ Default.ucd().getCodeAndName(b) + " because of "
+ reason);
}
}
return true;
}
/**
*
*/
private boolean checkForBad(String a, String b, String reason) {
Set equivalences = getEquivalences(b);
for (Iterator it = equivalences.iterator(); it.hasNext();) {
String b2 = (String)it.next();
if (a.equals(b2)) continue;
if (b2.indexOf(a) >= 0 || a.indexOf(b2) >= 0) {
log.println("Illegal containment: "
+ Default.ucd().getCodeAndName(a)
+ " overlaps "
+ Default.ucd().getCodeAndName(b2)
+ "\r\n\tfrom "
+ Default.ucd().getCodeAndName(b)
+ "\r\n\twith reason "
+ reason + " plus "
+ getReasons(b2, b));
return true;
}
}
return false;
}
public XEquivalenceClass add(Object a1, Object b1, String reason) {
String a = (String)a1;
String b = (String)b1;
try {
addCheck(a, b, reason);
return this;
} catch (RuntimeException e) {
throw (RuntimeException) new RuntimeException("Failure adding "
+ Default.ucd().getCodeAndName(a) + "; "
+ Default.ucd().getCodeAndName(b)
+ "; " + reason).initCause(e);
}
}
/**
* Only NFKD if the result doesn't cross from ID set to nonID set, and space is not added
*/
// private String specialNFKD(String item) {
// UnicodeSet skipSet = getSkipNFKD();
// StringBuffer result = new StringBuffer();
// int cp;
// for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
// cp = UTF16.charAt(item, i);
// if (skipSet.contains(cp)) {
// UTF16.append(result, cp);
// continue;
// }
// String cps = UTF16.valueOf(cp);
// String mapped = Default.nfkd().normalize(cps);
// if (cps.equals(mapped)) {
// UTF16.append(result, cp);
// continue;
// }
// result.append(mapped);
// gatheredNFKD.put(cps, mapped);
// }
// return result.toString();
// }
public void close(String reason) {
boolean addedItem;
StringBuffer reasons = new StringBuffer();
do {
addedItem = false;
Set cloneForSafety = getOrderedExplicitItems();
for (Iterator it = cloneForSafety.iterator(); it.hasNext();) {
String item = (String) it.next();
if (!UTF16.hasMoreCodePointsThan(item,1)) continue; // just for speed
reasons.setLength(0);
String mapped = mapString(item, reasons);
if (!isEquivalent(item, mapped)) {
if (addCheck(item, mapped, reasons.toString())) {
// System.out.println("Closing: " + Default.ucd().getCodeAndName(item) + " => " + Default.ucd().getCodeAndName(mapped));
addedItem = true;
}
}
}
} while (addedItem);
}
/**
*
*/
private String mapString(String item, StringBuffer reasons) {
if (false && item.startsWith("\u03D2")) {
System.out.println("foo");
}
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(item, i);
String cps = UTF16.valueOf(cp);
String mapped = getParadigm(cps);
if (mapped.indexOf(cps) >= 0) result.append(cps);
else {
result.append(mapped);
reasons.append("[" + getReasons(cps, mapped) + "]");
}
}
return result.toString();
}
public String getParadigm(Object item) {
return (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
}
public Set getOrderedExplicitItems() {
Set cloneForSafety = new TreeSet(codepointComparator);
cloneForSafety.addAll(getExplicitItems());
return cloneForSafety;
}
/**
*
*/
public void writeSource(PrintWriter out) {
Set items = getOrderedExplicitItems();
for (Iterator it = items.iterator(); it.hasNext();) {
String item = (String) it.next();
String paradigm = (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
if (item.equals(paradigm)) continue;
writeSourceTargetLine(out, item, null, paradigm, null);
}
}
}
2005-05-27 21:43:46 +00:00
static class DataSet {
MyEquivalenceClass dataMixedLowercase = new MyEquivalenceClass();
MyEquivalenceClass dataMixedAnycase = new MyEquivalenceClass();
MyEquivalenceClass dataSingleLowercase = new MyEquivalenceClass();
MyEquivalenceClass dataSingleAnycase = new MyEquivalenceClass();
public DataSet add(String source, String target, String type, int lineCount, String errorLine) {
2005-05-27 21:43:46 +00:00
if (skipSet.containsAll(source) || skipSet.containsAll(target)) return this;
String nsource = Default.nfd().normalize(source);
String ntarget = Default.nfd().normalize(target);
2005-05-27 21:43:46 +00:00
// if it is just a compatibility match, return
//if (nsource.equals(ntarget)) return this;
if (type.indexOf("skip") >= 0) return this;
if (target.indexOf('\u203D') >= 0) return this;
2005-05-27 21:43:46 +00:00
type = getReasonFromFilename(type);
2005-05-27 21:43:46 +00:00
// if it is base + combining sequence => base2 + same combining sequence, do just the base
int nsourceFirst = UTF16.charAt(nsource,0);
String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst));
int ntargetFirst = UTF16.charAt(ntarget,0);
String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst));
2005-05-27 21:43:46 +00:00
if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) {
source = UTF16.valueOf(nsourceFirst);
target = UTF16.valueOf(ntargetFirst);
type += "-base";
}
type += ":" + lineCount;
2005-05-27 21:43:46 +00:00
String combined = source + target;
boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
boolean isMixed = isMixedScript(combined);
dataMixedAnycase.add(source, target, type);
if (isLowercase) dataMixedLowercase.add(source, target, type);
if (!isMixed) dataSingleAnycase.add(source, target, type);
if (!isMixed && isLowercase) dataSingleLowercase.add(source, target, type);
return this;
2005-05-27 21:43:46 +00:00
}
/* *//**
2005-05-27 21:43:46 +00:00
* @param errorLine TODO
*
*//*
2005-05-27 21:43:46 +00:00
private DataSet add(Data newData, String errorLine) {
if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) {
System.out.println("Problem with " + errorLine);
System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
}
String[] key = {newData.source, newData.target};
Data old = (Data) dataMap.get(key);
if (old == null) {
dataSet.add(newData);
dataMap.put(key, newData);
}else {
old.type = old.type + "/" + newData.type;
}
return this;
}
*/ // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
2005-05-27 21:43:46 +00:00
static final int NORMAL = 0, FOLDING = 1, OLD = 2;
public DataSet addFile(String directory, String filename) throws IOException {
String line = null;
int count = 0;
try {
BufferedReader in = BagFormatter.openUTF8Reader(directory, filename);
int kind = NORMAL;
if (filename.indexOf("Folding") >= 0) kind = FOLDING;
else if (false && filename.indexOf("-old") >= 0) kind = OLD;
while (true) {
count++;
line = Utility.readDataLine(in);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line,';');
if (pieces.length < 2) {
System.out.println("Error on: " + line);
continue;
2005-05-27 21:43:46 +00:00
}
String type = filename;
if (kind==FOLDING) {
String source = Utility.fromHex(pieces[0].trim(),true);
String target = Utility.fromHex(pieces[1].trim(),true);
String nsource = Default.nfkd().normalize(source);
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
if (!first.equals(target)) {
add(source, target, type, count, line);
}
} else if (kind == OLD) {
String target = pieces[0].trim();
for (int i = 1; i < pieces.length; ++i) {
add(pieces[i].trim(), target, type, count, line);
}
} else {
String source = Utility.fromHex(pieces[0].trim(),true);
String target = Utility.fromHex(pieces[1].trim(),true);
//if (pieces.length > 2) type = pieces[2].trim();
add(source, target, type, count, line);
2005-05-27 21:43:46 +00:00
}
}
in.close();
return this;
} catch (Exception e) {
throw (RuntimeException) new RuntimeException("Failure with file: "
+ directory + filename + " on line: " + count
+ ": " + line).initCause(e);
}
2005-05-27 21:43:46 +00:00
}
public void writeSource(String directory, String filename) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.println("# Source File for IDN Confusables");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("");
dataMixedAnycase.writeSource(out);
out.close();
}
public void write(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
2005-05-27 21:43:46 +00:00
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.print('\uFEFF');
out.println("# Recommended confusable mapping for IDN");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("");
2005-05-27 21:43:46 +00:00
if (appendFile) {
String[] replacements = {"%date%", Default.getDate()};
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
Utility.UTF8_WINDOWS, out, replacements);
}
writeData(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
writeData(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
writeData(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
writeData(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
2005-05-27 21:43:46 +00:00
out.close();
}
/**
* @param skipNFKEquivs TODO
*
*/
private void writeData(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
Set items = data.getOrderedExplicitItems();
out.println();
out.println("# " + title);
out.println();
int count = 0;
UnicodeSet preferredID = getIdentifierSet();
for (Iterator it = items.iterator(); it.hasNext();) {
String source = (String) it.next();
if (UTF16.hasMoreCodePointsThan(source,1)) continue;
String target = data.getParadigm(source);
if (source.equals(target)) continue;
if (skipNFKEquivs) {
if (!Default.nfkd().normalize(source).equals(source)) continue;
}
String reason = fixReason(data.getReasons(source, target));
writeSourceTargetLine(out, source, tag, target, reason);
count++;
}
out.println();
out.println("# total for (" + tag + "): " + count);
out.println();
}
2005-05-27 21:43:46 +00:00
/**
*
*/
private String fixReason(List reasons) {
List first = (List)reasons.get(0);
String result = "";
for (int i = 0; i < first.size(); ++i) {
if (i != 0) result += " ";
Object item = first.get(i);
if (item instanceof String) {
result += item;
} else {
String temp = "";
for (Iterator it = ((Set)item).iterator(); it.hasNext();) {
if (temp.length() != 0) temp += "|";
temp += it.next();
}
result += "{" + temp + "}";
}
2005-05-27 21:43:46 +00:00
}
return result.toString();
2005-05-27 21:43:46 +00:00
}
public void addAll(DataSet ds) {
dataMixedAnycase.addAll(ds.dataMixedAnycase);
dataMixedLowercase.addAll(ds.dataMixedLowercase);
dataSingleAnycase.addAll(ds.dataSingleAnycase);
dataSingleLowercase.addAll(ds.dataSingleLowercase);
}
/* *//**
*
*//*
2005-05-27 21:43:46 +00:00
public DataSet clean() {
// remove all skips
DataSet tempSet = new DataSet();
Map m = new HashMap();
for (Iterator it = dataSet.iterator(); it.hasNext();) {
Data d = (Data) it.next();
if (d.type.indexOf("skip") >= 0) continue;
String newTarget = Default.nfkd().normalize(d.target);
String newSource = Default.nfkd().normalize(d.source);
String type = d.type;
if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
type += "-nf";
log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
continue;
}
// swap order
if (preferSecondAsSource(newSource, newTarget)) {
String temp = newTarget;
newTarget = newSource;
newSource = temp;
}
Data already = (Data) m.get(newSource);
if (already != null && !newTarget.equals(already.target)) {
log.println("X " + getCodeCharName(newSource) + " " + ARROW);
log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
if (preferSecondAsSource(already.target, newTarget)) {
// just fix new guy
type += "[" + newSource + "]" + already.type;
newSource = newTarget;
newTarget = already.target;
} else {
// need to fix new guy, AND fix old guy.
tempSet.remove(already);
type += "[" + newSource + "]" + already.type;
newSource = already.target;
already.type += "[" + already.target + "]" + type;
already.target = newTarget;
tempSet.add(already, "");
}
}
Data newData = new Data(newSource, newTarget, type);
m.put(newSource, newData);
tempSet.add(newData, "");
}
// now recursively apply
DataSet s = new DataSet();
for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
Data d = (Data) it.next();
int cp = 0;
StringBuffer result = new StringBuffer();
for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(d.target, i);
String src = UTF16.valueOf(cp);
while (true) {
Data rep = (Data) m.get(src);
if (rep == null) break;
src = rep.target;
}
result.append(src);
}
String newTarget = result.toString();
newTarget = Default.nfkd().normalize(newTarget);
s.add(d.source, newTarget, d.type + (newTarget.equals(newTarget) ? "" : "-rec"), "");
}
return s;
}
*//**
2005-05-27 21:43:46 +00:00
*
*//*
2005-05-27 21:43:46 +00:00
private void remove(Data already) {
String[] key = {already.source, already.target};
dataMap.remove(key);
dataSet.remove(already);
}*/
/**
*
*/
public void close(String reason) {
dataMixedAnycase.close(reason);
dataMixedLowercase.close(reason);
dataSingleAnycase.close(reason);
dataSingleLowercase.close(reason);
}
/**
*
*/
public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) {
int count = 0;
for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.getSet(null).complement()); it.next(); ) {
add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine);
}
}
static class MyFilter implements XEquivalenceClass.Filter {
UnicodeSet output;
public boolean matches(Object o) {
return output.containsAll((String)o);
}
}
/**
* @throws IOException
*
*/
public void writeSummary(String outdir, String string, boolean outputOnly) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, string);
out.print('\uFEFF');
out.println("# Summary: Recommended confusable mapping for IDN");
out.println("# $Revision: 1.3 $");
out.println("# $Date: 2005/06/24 23:51:52 $");
out.println("");
MyEquivalenceClass data = dataMixedAnycase;
Set items = data.getOrderedExplicitItems();
for (Iterator it = items.iterator(); it.hasNext();) {
System.out.println(Default.ucd().getCodeAndName((String)it.next()));
}
int count = 0;
UnicodeSet preferredID = getIdentifierSet();
String lastTarget = "";
UnicodeSet outputAllowed = IdentifierInfo.getIdentifierInfo().remainingOutputSet;
Set itemsSeen = new HashSet();
Set equivalents = new TreeSet(betterTargetIsLess);
for (Iterator it = items.iterator(); it.hasNext();) {
String target = (String) it.next();
if (itemsSeen.contains(target)) continue;
out.println();
out.println(getStatus(target) + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + Default.ucd().getName(target));
//if (UTF16.hasMoreCodePointsThan(source,1)) continue;
equivalents.clear();
equivalents.addAll(data.getEquivalences(target));
for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
String source = (String) it2.next();
if (source.equals(target)) continue;
//boolean compatEqual = Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target));
//if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue;
String reason = fixReason(data.getReasons(source, target));
//if (!outputAllowed.containsAll(source)) continue;
// if (compatEqual) {
// out.print("\u21D0");
// } else {
// out.print("\u2190");
// }
out.println("\u2190" + getStatus(source) + "\t" + "(\u200E " + source + " \u200E)\t" + Utility.hex(source) + "\t " + Default.ucd().getName(source)
+ "\t# " + reason);
count++;
}
}
out.println();
out.println("# total : " + count);
out.println();
out.close();
2005-05-27 21:43:46 +00:00
}
/**
*
*/
private String getStatus(String source) {
// TODO Auto-generated method stub
int val = betterTargetIsLess.getValue(source);
if (val == MARK_NOT_NFC.intValue()) return "[x]";
if (val == MARK_NFC.intValue()) return "[x]";
if (val == MARK_INPUT_LENIENT.intValue()) return "[L]";
if (val == MARK_INPUT_STRICT.intValue()) return "[I]";
if (val == MARK_OUTPUT.intValue()) return "[O]";
return "?";
}
2005-05-27 21:43:46 +00:00
}
/**
* @throws IOException
*
*/
private static void fixMichel(String indir, String outdir) throws IOException {
BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
while (true) {
String line = Utility.readDataLine(in);
if (line == null) break;
String[] pieces = Utility.split(line,'\t');
if (pieces.length < 2) {
out.println(line);
continue;
}
String source = Utility.fromHex(pieces[0].trim());
if (Default.nfkd().isNormalized(source)) {
out.println(line);
}
}
in.close();
out.close();
}
/**
*
*/
private static void generateSource() throws IOException {
File dir = new File(indir);
String[] names = dir.list();
Set sources = new TreeSet(new ArrayComparator(
new Comparator[] {codepointComparator, codepointComparator}));
int[] count = new int[1];
for (int i = 0; i < names.length; ++i) {
if (new File(indir + names[i]).isDirectory()) continue;
if (!names[i].startsWith("confusables")) continue;
String reason = getReasonFromFilename(names[i]);
System.out.println(names[i]);
BufferedReader in = BagFormatter.openUTF8Reader(indir, names[i]);
String line;
count[0] = 0;
while (true) {
line = Utility.readDataLine(in, count);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line,';');
if (pieces.length < 2) {
System.out.println("Error on: " + line);
continue;
}
String source = Utility.fromHex(pieces[0].trim(),true);
String target = Utility.fromHex(pieces[1].trim(),true);
if (source.length() == 0 || target.length() == 0) {
throw new IllegalArgumentException("zero-length item: " + count[0] + ":\t" + line);
}
// check for identical combining sequences
String nsource = Default.nfc().normalize(source);
String ntarget = Default.nfc().normalize(target);
if (nsource.equals(ntarget)) continue;
if (true) {
int nsourceFirst = UTF16.charAt(nsource,0);
String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst));
int ntargetFirst = UTF16.charAt(ntarget,0);
String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst));
if (nsourceRest.equals(ntargetRest)) {
source = UTF16.valueOf(nsourceFirst);
target = UTF16.valueOf(ntargetFirst);
}
}
if (betterTargetIsLess.compare(source, target) < 0) {
String temp = source;
source = target;
target = temp;
}
sources.add(new String[] {source, target});
}
in.close();
}
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "confusableSource.txt");
for (Iterator it = sources.iterator(); it.hasNext();) {
String[] sourceItem = (String[]) it.next();
writeSourceTargetLine(out, sourceItem[0], null, sourceItem[1], null);
}
out.close();
}
2005-05-27 21:43:46 +00:00
private static void generateConfusables(String indir, String outdir) throws IOException {
betterTargetIsLess.compare("\u0020", "\u2004");
2005-05-27 21:43:46 +00:00
File dir = new File(indir);
String[] names = dir.list();
DataSet total = new DataSet();
for (int i = 0; i < names.length; ++i) {
if (new File(indir + names[i]).isDirectory()) continue;
if (!names[i].startsWith("confusables")) continue;
2005-05-27 21:43:46 +00:00
System.out.println(names[i]);
DataSet ds = new DataSet();
ds.addFile(indir, names[i]);
ds.writeSource(outdir, "new-" + names[i]);
ds.close("*");
total.addAll(ds);
total.close("t*" + names[i]);
2005-05-27 21:43:46 +00:00
}
DataSet ds = new DataSet();
ds.addUnicodeMap(nfcMap, "nfc", "nfc");
ds.close("*");
ds.write(outdir, "new-decomp.txt", false, false);
total.addAll(ds);
total.close("*");
total.writeSummary(outdir, "confusablesSummary.txt", false);
total.writeSummary(outdir, "confusablesSummaryOutput.txt", true);
total.write(outdir, "confusables.txt", false, false);
//DataSet clean = total.clean();
//clean.write(outdir, "confusables.txt", true);
2005-05-27 21:43:46 +00:00
}
/*
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(),
new UTF16.StringComparator()}));
while (true) {
String line = Utility.readDataLine(in);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line,';');
if (pieces.length < 2) {
System.out.println("Error on: " + line);
continue;
}
String source = Utility.fromHex(pieces[0].trim());
String target = Utility.fromHex(pieces[1].trim());
String nsource = Default.nfkd().normalize(source);
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
if (!first.equals(target)) {
set.add(new String[]{source, target});
}
}
in.close();
}
public static void gen() throws IOException {
Map m = new TreeMap();
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
while (true) {
String line = in.readLine();
if (line == null) break;
String[] pieces = Utility.split(line,';');
if (pieces.length < 3) {
System.out.println("Error on: " + line);
continue;
}
int codepoint = Integer.parseInt(pieces[1], 16);
int cat = Default.ucd().getCategory(codepoint);
if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
String result = Utility.fromHex(pieces[0]);
if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
int count = Integer.parseInt(pieces[2]);
String source = UTF16.valueOf(codepoint);
add(m, source, result, count);
}
in.close();
in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
while (true) {
String line = in.readLine();
if (line == null) break;
line = line.trim();
int pos = line.indexOf("#");
if (pos >= 0) line = line.substring(0,pos).trim();
if (line.length() == 0) continue;
if (line.startsWith("@")) continue;
String[] pieces = Utility.split(line,';');
if (pieces.length < 2) {
System.out.println("Error on: " + line);
continue;
}
String source = pieces[0].trim();
for (int i = 1; i < pieces.length; ++i) {
add(m, source, pieces[i].trim(), -1);
}
}
in.close();
boolean gotOne;
// close the set
do {
gotOne = false;
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
String source = (String) it.next();
Data2 data = (Data2) m.get(source);
Data2 data2 = (Data2) m.get(data.target);
if (data2 == null) continue;
data.target = data2.target;
gotOne = true;
break;
}
} while (gotOne);
// put into different sorting order
Set s = new TreeSet();
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
String source = (String) it.next();
Data2 data = (Data2) m.get(source);
s.add(new Data(source, data.target, data.count));
}
// write it out
PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
String[] replacements = {"%date%", Default.getDate()};
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
Utility.UTF8_WINDOWS, out, replacements);
for (Iterator it = s.iterator(); it.hasNext();) {
Data d = (Data) it.next();
if (d == null) continue;
out.println(formatLine(d.source, d.target, d.count));
}
out.close();
System.out.println("Done");
}
/**
*
*/
private static String formatLine(String source, String target, int count) {
return Utility.hex(source) + " ; " + Utility.hex(target," ")
+ " ; " + count
+ " # "
+ "(" + source + " " + ARROW + " " + target + ") "
+ Default.ucd().getName(source)
+ " " + ARROW + " " + Default.ucd().getName(target);
}
/**
*
*/
/* private static void add(Map m, String source, String target, int count) {
2005-05-27 21:43:46 +00:00
if (source.length() == 0 || target.length() == 0) return;
if (preferSecondAsSource(source, target)) {
String temp = target;
target = source;
source = temp;
}
Data2 other = (Data2) m.get(source);
if (other != null) {
if (target.equals(other.target)) return;
System.out.println("conflict");
System.out.println(formatLine(source, target, count));
System.out.println(formatLine(source, other.target, other.count));
// skip adding this, and instead add result -> other.target
add(m, target, other.target, count);
} else {
m.put(source, new Data2(target, count));
}
};
*/
static Integer
MARK_NOT_NFC = new Integer(50),
MARK_NFC = new Integer(40),
MARK_INPUT_LENIENT = new Integer(30),
MARK_INPUT_STRICT = new Integer(20),
MARK_OUTPUT = new Integer(10);
static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();
static class _BetterTargetIsLess implements Comparator {
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
public int compare(Object o1, Object o2) {
String a = (String)o1;
String b = (String)o2;
int ca = UTF16.countCodePoint(a);
int cb = UTF16.countCodePoint(b);
if (ca != cb) return ca > cb ? -1 : 1;
int aok = getValue(a);
int bok = getValue(b);
if (aok != bok) return aok < bok ? -1 : 1;
return codepointComparator.compare(a, b);
}
static final int BAD = 1000;
private int getValue(String a) { // lower is better
int cp;
int lastValue = 0;
for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(a, i);
Object objValue = info.lowerIsBetter.getValue(cp);
int value = ((Integer) objValue).intValue();
if (value > lastValue) lastValue = value;
}
return lastValue;
}
};
2005-05-27 21:43:46 +00:00
/* static private boolean preferSecondAsSource(String a, String b) {
2005-05-27 21:43:46 +00:00
// if first is longer, prefer second
int ca = UTF16.countCodePoint(a);
int cb = UTF16.countCodePoint(b);
if (ca != cb) {
return ca > cb;
}
// if first is lower, prefer second
return a.compareTo(b) < 0;
}
*/
2005-05-27 21:43:46 +00:00
static String getCodeCharName(String a) {
return Default.ucd().getCode(a) + "( " + a + " ) " + Default.ucd().getName(a);
}
/**
* Returns the part between - and .
*/
public static String getReasonFromFilename(String type) {
int period = type.lastIndexOf('.');
if (period < 0) period = type.length();
int dash = type.lastIndexOf('-', period);
return type.substring(dash+1,period);
}
2005-05-27 21:43:46 +00:00
}