ICU-0 security tools
X-SVN-Rev: 18127
This commit is contained in:
parent
8112d3d593
commit
6395518f88
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
|
||||
* $Date: 2005/06/24 23:51:52 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2005/07/01 22:10:00 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -19,6 +19,7 @@ import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@ -44,6 +45,7 @@ import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.text.UCD.GenerateConfusables.WholeScript.UnicodeSetToScript;
|
||||
import com.ibm.text.utility.Utility;
|
||||
import com.ibm.text.utility.XEquivalenceClass;
|
||||
import com.sun.corba.se.connection.GetEndPointInfoAgainException;
|
||||
@ -64,6 +66,37 @@ public class GenerateConfusables {
|
||||
System.out.println("Done");
|
||||
}
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
static UnicodeSet _Non_IICore;
|
||||
|
||||
private static UnicodeSet getNonIICore() {
|
||||
//Main + IICore + (Ext-A intersect Chinese)
|
||||
//blk; n/a ; CJK_Unified_Ideographs
|
||||
//blk; n/a ; CJK_Unified_Ideographs_Extension_A
|
||||
//blk; n/a ; CJK_Unified_Ideographs_Extension_B
|
||||
|
||||
if (_Non_IICore == null) {
|
||||
// stuff to remove
|
||||
_Non_IICore = ups.getSet("block=CJK_Unified_Ideographs_Extension_A");
|
||||
_Non_IICore.addAll(ups.getSet("block=CJK_Unified_Ideographs_Extension_B"));
|
||||
_Non_IICore.removeAll(UNASSIGNED); // remove unassigned
|
||||
// stuff to restore
|
||||
UnicodeMap um = Default.ucd().getHanValue("kIICore");
|
||||
um.put(0x34E4, "2.1");
|
||||
um.put(0x3007, "2.1");
|
||||
_Non_IICore.removeAll(um.getSet("2.1"));
|
||||
// add Chinese?
|
||||
}
|
||||
return _Non_IICore;
|
||||
// for (Iterator it = um.getAvailableValues().iterator(); it.hasNext();) {
|
||||
// Object value = it.next();
|
||||
// UnicodeSet set = um.getSet(value);
|
||||
// System.out.println(value + "\t" + set);
|
||||
// }
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
static final String ARROW = "\u2192";
|
||||
static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
|
||||
@ -83,7 +116,7 @@ public class GenerateConfusables {
|
||||
static String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\source\\";
|
||||
static String outdir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
|
||||
|
||||
static Comparator codepointComparator = new UTF16.StringComparator();
|
||||
static Comparator codepointComparator = new UTF16.StringComparator(true,false,0);
|
||||
|
||||
static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
|
||||
"\\u3400-\\u4DB5" +
|
||||
@ -166,8 +199,7 @@ public class GenerateConfusables {
|
||||
additions.getSet(null).complement()).retainAll(propNFKCSet);
|
||||
|
||||
getIdentifierSet();
|
||||
notInXID = new UnicodeSet(IDNOutputSet)
|
||||
.removeAll(xidPlus);
|
||||
notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus);
|
||||
removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
|
||||
removalSet = removals.getSet(null).complement();
|
||||
|
||||
@ -225,6 +257,7 @@ public class GenerateConfusables {
|
||||
lowerIsBetter.putAll(inputSet_lenient, MARK_INPUT_LENIENT);
|
||||
lowerIsBetter.putAll(inputSet_strict, MARK_INPUT_STRICT);
|
||||
lowerIsBetter.putAll(remainingOutputSet, MARK_OUTPUT);
|
||||
lowerIsBetter.putAll(remainingOutputSet, MARK_ASCII);
|
||||
lowerIsBetter.setMissing(MARK_NOT_NFC);
|
||||
|
||||
lowerIsBetter.lock();
|
||||
@ -252,7 +285,7 @@ public class GenerateConfusables {
|
||||
return "remap-to-" + Utility.hex(b.toString());
|
||||
}
|
||||
};
|
||||
reviews.composeWith(remap, composer2);
|
||||
//reviews.composeWith(remap, composer2);
|
||||
removals2 = new UnicodeMap().putAll(removals);
|
||||
removals2.putAll(ups.getSet("XID_Continue=TRUE").complement(),
|
||||
PROHIBITED + NOT_IN_XID);
|
||||
@ -342,6 +375,7 @@ public class GenerateConfusables {
|
||||
throw (RuntimeException) new RuntimeException(
|
||||
"Failure on line " + line).initCause(e);
|
||||
}
|
||||
removals.putAll(getNonIICore(), "~IICore");
|
||||
br.close();
|
||||
}
|
||||
|
||||
@ -367,8 +401,8 @@ public class GenerateConfusables {
|
||||
//reviews.putAll(UNASSIGNED, "");
|
||||
out.print("\uFEFF");
|
||||
out.println("# Review List for IDN");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("");
|
||||
|
||||
UnicodeSet fullSet = reviews.getSet("").complement();
|
||||
@ -423,8 +457,8 @@ public class GenerateConfusables {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
|
||||
|
||||
out.println("# Recommended Identifier Profiles for IDN");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
|
||||
out.println("");
|
||||
out.println("# Output Characters");
|
||||
@ -456,10 +490,9 @@ public class GenerateConfusables {
|
||||
bf.setValueSource("nonstarting");
|
||||
bf.showSetNames(out, nonstarting);
|
||||
|
||||
out.println("");
|
||||
//out.println("");
|
||||
|
||||
showRemapped(out,
|
||||
"Characters remapped on input (in GUIs)", remap);
|
||||
//showRemapped(out, "Characters remapped on input in GUIs -- Not required by profile!", remap);
|
||||
|
||||
out.close();
|
||||
}
|
||||
@ -494,8 +527,8 @@ public class GenerateConfusables {
|
||||
"xidmodifications.txt");
|
||||
|
||||
out.println("# Security Profile for General Identifiers");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("");
|
||||
|
||||
out.println("# Characters restricted");
|
||||
@ -516,7 +549,7 @@ public class GenerateConfusables {
|
||||
bf.setValueSource("addition");
|
||||
bf.showSetNames(out, additions.getSet(null).complement());
|
||||
|
||||
showRemapped(out, "Characters remapped on input", remap);
|
||||
//showRemapped(out, "Characters remapped on input", remap);
|
||||
|
||||
out.close();
|
||||
|
||||
@ -525,12 +558,14 @@ public class GenerateConfusables {
|
||||
public Object compose(int codePoint, Object a, Object b) {
|
||||
if (b == null) return null;
|
||||
String x = (String)b;
|
||||
if (false) {
|
||||
if (!IDNOutputSet.contains(codePoint)) {
|
||||
return "~IDNA";
|
||||
}
|
||||
if (!xidPlus.contains(codePoint)) {
|
||||
return "~Unicode Identifier";
|
||||
}
|
||||
}
|
||||
if (x.startsWith(PROHIBITED)) x = x.substring(PROHIBITED.length());
|
||||
//if (!propNFKCSet.contains(codePoint)) x += "*";
|
||||
if (lowercase.contains(codePoint)) {
|
||||
@ -542,11 +577,15 @@ public class GenerateConfusables {
|
||||
}
|
||||
};
|
||||
someRemovals.composeWith(removals, myComposer);
|
||||
UnicodeSet nonIDNA = new UnicodeSet(IDNOutputSet).addAll(IDNInputSet).complement();
|
||||
someRemovals.putAll(nonIDNA, "~IDNA");
|
||||
someRemovals.putAll(new UnicodeSet(xidPlus).complement(), "~Unicode Identifier");
|
||||
someRemovals.putAll(UNASSIGNED, null); // clear extras
|
||||
//someRemovals = removals;
|
||||
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
|
||||
out.println("# Characters restricted in domain names");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("#");
|
||||
out.println("# This file contains a draft list of characters for use in");
|
||||
out.println("# UTR #36: Unicode Security Considerations");
|
||||
@ -568,6 +607,9 @@ public class GenerateConfusables {
|
||||
out.println("# do not fit the specification of identifiers given in");
|
||||
out.println("# UAX #31: Identifier and Pattern Syntax");
|
||||
out.println("# http://unicode.org/reports/tr31/");
|
||||
out.println("# - Characters listed as ~IICore are restricted because they are Ideographic,");
|
||||
out.println("# but not part of the IICore set defined by the IRG as the minimal set");
|
||||
out.println("# of required ideographs for East Asian use.");
|
||||
out.println("# - The files in this directory are 'live', and may change at any time.");
|
||||
out.println("# Please include the above Revision number in your feedback.");
|
||||
|
||||
@ -715,19 +757,29 @@ public class GenerateConfusables {
|
||||
}
|
||||
|
||||
private static boolean isMixedScript(String source) {
|
||||
return getSingleScript(source) != UScript.INVALID_CODE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns UScript.INVALID_CODE if mixed script, otherwise the script
|
||||
*/
|
||||
private static int getSingleScript(String source) {
|
||||
int lastScript = UScript.INVALID_CODE;
|
||||
int cp;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
int script = UScript.getScript(cp);
|
||||
if (script == UScript.COMMON || script == UScript.INHERITED) {
|
||||
if (XIDContinueSet.contains(cp)) continue; // skip if not identifier
|
||||
if (XIDContinueSet.contains(cp)) {
|
||||
if (lastScript == UScript.INVALID_CODE) lastScript = script;
|
||||
continue; // skip if not identifier
|
||||
}
|
||||
script = UScript.COMMON;
|
||||
}
|
||||
if (lastScript == UScript.INVALID_CODE) lastScript = script;
|
||||
else if (script != lastScript) return true;
|
||||
else if (script != lastScript) return UScript.INVALID_CODE;
|
||||
}
|
||||
return false;
|
||||
return lastScript;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1067,19 +1119,19 @@ public class GenerateConfusables {
|
||||
public void writeSource(String directory, String filename) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.println("# Source File for IDN Confusables");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("");
|
||||
dataMixedAnycase.writeSource(out);
|
||||
out.close();
|
||||
}
|
||||
|
||||
public void write(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
|
||||
public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("");
|
||||
|
||||
if (appendFile) {
|
||||
@ -1087,17 +1139,27 @@ public class GenerateConfusables {
|
||||
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
|
||||
Utility.UTF8_WINDOWS, out, replacements);
|
||||
}
|
||||
writeData(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
|
||||
writeData(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
|
||||
writeData(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
|
||||
writeData(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
|
||||
out.close();
|
||||
}
|
||||
/**
|
||||
* @param skipNFKEquivs TODO
|
||||
*
|
||||
*/
|
||||
private void writeData(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
|
||||
private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
|
||||
// first get all the sets. Then get the best paradigm from each. Then sort.
|
||||
// Set setOfSets = data.getEquivalenceSets();
|
||||
// Map orderedResults = new TreeMap(betterTargetIsLess);
|
||||
// for (Iterator it = setOfSets.iterator(); it.hasNext();) {
|
||||
// Set setOfEquivs = (Set) it.next();
|
||||
// Object item = CollectionUtilities.getBest(setOfEquivs, betterTargetIsLess, -1);
|
||||
//
|
||||
// }
|
||||
//int c = codepointComparator.compare("\uFFFF", "\uD800\uDC00");
|
||||
//System.out.println("Code Point Compare: " + c);
|
||||
Set items = data.getOrderedExplicitItems();
|
||||
out.println();
|
||||
out.println("# " + title);
|
||||
@ -1256,36 +1318,78 @@ public class GenerateConfusables {
|
||||
}
|
||||
}
|
||||
|
||||
static class MyCollectionFilter implements CollectionUtilities.Filter {
|
||||
UnicodeSet outputAllowed;
|
||||
int minLength;
|
||||
public boolean matches(Object o) {
|
||||
String item = (String)o;
|
||||
if (!outputAllowed.containsAll(item)) return false;
|
||||
int len = UTF16.countCodePoint(item);
|
||||
if (len < minLength) minLength = len;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
/**
|
||||
* @param script TODO
|
||||
* @throws IOException
|
||||
*
|
||||
*/
|
||||
public void writeSummary(String outdir, String string, boolean outputOnly) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, string);
|
||||
public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
UnicodeSet representable = new UnicodeSet();
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.3 $");
|
||||
out.println("# $Date: 2005/06/24 23:51:52 $");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("");
|
||||
MyEquivalenceClass data = dataMixedAnycase;
|
||||
Set items = data.getOrderedExplicitItems();
|
||||
for (Iterator it = items.iterator(); it.hasNext();) {
|
||||
System.out.println(Default.ucd().getCodeAndName((String)it.next()));
|
||||
}
|
||||
// for (Iterator it = items.iterator(); it.hasNext();) {
|
||||
// System.out.println(Default.ucd().getCodeAndName((String)it.next()));
|
||||
// }
|
||||
int count = 0;
|
||||
UnicodeSet preferredID = getIdentifierSet();
|
||||
String lastTarget = "";
|
||||
UnicodeSet outputAllowed = IdentifierInfo.getIdentifierInfo().remainingOutputSet;
|
||||
Set itemsSeen = new HashSet();
|
||||
Set equivalents = new TreeSet(betterTargetIsLess);
|
||||
MyCollectionFilter myFilter = new MyCollectionFilter();
|
||||
myFilter.outputAllowed= new UnicodeSet("[[\u0021-\u007E]-[:letter:]]")
|
||||
.addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet)
|
||||
.addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict);
|
||||
|
||||
for (Iterator it = items.iterator(); it.hasNext();) {
|
||||
String target = (String) it.next();
|
||||
if (itemsSeen.contains(target)) continue;
|
||||
equivalents.clear();
|
||||
equivalents.addAll(data.getEquivalences(target));
|
||||
itemsSeen.addAll(equivalents);
|
||||
if (outputOnly) { // remove non-output
|
||||
myFilter.minLength = 1000;
|
||||
CollectionUtilities.retainAll(equivalents, myFilter);
|
||||
if (equivalents.size() <= 1) continue;
|
||||
if (myFilter.minLength > 1) continue;
|
||||
if (!equivalents.contains(target)) { // select new target if needed
|
||||
target = (String) equivalents.iterator().next();
|
||||
}
|
||||
}
|
||||
scriptTest:
|
||||
if (script != null) {
|
||||
// see if at least one item contains the target script
|
||||
for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
|
||||
String item = (String) it2.next();
|
||||
if (script.containsAll(item)) {
|
||||
target = item;
|
||||
for (Iterator it3 = equivalents.iterator(); it3.hasNext();) {
|
||||
representable.addAll((String)it3.next());
|
||||
}
|
||||
break scriptTest;
|
||||
}
|
||||
}
|
||||
continue; // skip this one
|
||||
}
|
||||
out.println();
|
||||
out.println(getStatus(target) + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + Default.ucd().getName(target));
|
||||
//if (UTF16.hasMoreCodePointsThan(source,1)) continue;
|
||||
equivalents.clear();
|
||||
equivalents.addAll(data.getEquivalences(target));
|
||||
for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
|
||||
String source = (String) it2.next();
|
||||
if (source.equals(target)) continue;
|
||||
@ -1306,10 +1410,54 @@ public class GenerateConfusables {
|
||||
out.println();
|
||||
out.println("# total : " + count);
|
||||
out.println();
|
||||
|
||||
if (script != null) {
|
||||
out.println();
|
||||
out.println("# Base Letters Representable with Script");
|
||||
out.println();
|
||||
representable.removeAll(script);
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setValueSource(ups.getProperty("script"));
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.showSetNames(out, representable);
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void writeWholeScripts(String outdir, String filename) throws IOException {
|
||||
WholeScript wsLower = new WholeScript(
|
||||
new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet)
|
||||
.removeAll(new UnicodeSet("[A-Z]")), "L");
|
||||
WholeScript wsAny = new WholeScript(
|
||||
new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet)
|
||||
.addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), "A");
|
||||
MyEquivalenceClass data = dataMixedAnycase;
|
||||
Set items = data.getOrderedExplicitItems();
|
||||
Set itemsSeen = new HashSet();
|
||||
for (Iterator it = items.iterator(); it.hasNext();) {
|
||||
String target = (String) it.next();
|
||||
if (itemsSeen.contains(target)) continue;
|
||||
Set equivalents = data.getEquivalences(target);
|
||||
itemsSeen.addAll(equivalents);
|
||||
wsAny.addEquivalents(equivalents);
|
||||
wsLower.addEquivalents(equivalents);
|
||||
}
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Whole-Script Confusables");
|
||||
out.println("# $Revision: 1.4 $");
|
||||
out.println("# $Date: 2005/07/01 22:10:00 $");
|
||||
out.println("");
|
||||
out.println("# Lowercase Only");
|
||||
out.println("");
|
||||
wsLower.write(out);
|
||||
out.println("");
|
||||
out.println("# Any-Case");
|
||||
out.println("");
|
||||
wsAny.write(out);
|
||||
out.close();
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
@ -1321,9 +1469,176 @@ public class GenerateConfusables {
|
||||
if (val == MARK_INPUT_LENIENT.intValue()) return "[L]";
|
||||
if (val == MARK_INPUT_STRICT.intValue()) return "[I]";
|
||||
if (val == MARK_OUTPUT.intValue()) return "[O]";
|
||||
if (val == MARK_ASCII.intValue()) return "[A]";
|
||||
|
||||
return "?";
|
||||
}
|
||||
}
|
||||
|
||||
static class WholeScript {
|
||||
private static UnicodeSet commonAndInherited = new UnicodeSet("[[:script=common:][:script=inherited:]]");
|
||||
private UnicodeSet filterSet;
|
||||
private UnicodeSet[] script_representables = new UnicodeSet[UScript.CODE_LIMIT];
|
||||
private UnicodeSet[] script_set = new UnicodeSet[UScript.CODE_LIMIT];
|
||||
private BagFormatter bf = new BagFormatter();
|
||||
private String label;
|
||||
{
|
||||
for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
|
||||
script_representables[i] = new UnicodeSet();
|
||||
script_set[i] = new UnicodeSet("[:script=" + UScript.getName(i) + ":]"); // ugly hack
|
||||
}
|
||||
bf.setValueSource(ups.getProperty("script"));
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
}
|
||||
WholeScript(UnicodeSet filterSet, String label) {
|
||||
this.filterSet = filterSet;
|
||||
this.label = label;
|
||||
finished = false;
|
||||
}
|
||||
void addEquivalents(Set set) {
|
||||
finished = false;
|
||||
// if we have y ~ x, and both are single scripts
|
||||
// that means that x can be represented in script(y),
|
||||
// and y can be represented in script(x).
|
||||
for (Iterator it = set.iterator(); it.hasNext();) {
|
||||
String item1 = (String)it.next();
|
||||
if (!filterSet.containsAll(item1)) continue;
|
||||
int script1 = getSingleScript(item1);
|
||||
if (script1 == UScript.INVALID_CODE) continue;
|
||||
for (Iterator it2 = set.iterator(); it2.hasNext();) {
|
||||
String item2 = (String)it2.next();
|
||||
if (!filterSet.containsAll(item2)) continue;
|
||||
int script2 = getSingleScript(item2);
|
||||
if (script2 == UScript.INVALID_CODE || script2 == script1) continue;
|
||||
script_representables[script1].addAll(item2).removeAll(commonAndInherited);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static class UnicodeSetToScript {
|
||||
public int getScript() {
|
||||
return script;
|
||||
}
|
||||
public UnicodeSetToScript setScript(int script) {
|
||||
this.script = script;
|
||||
return this;
|
||||
}
|
||||
public UnicodeSet getSet() {
|
||||
return set;
|
||||
}
|
||||
public UnicodeSetToScript setSet(UnicodeSet set) {
|
||||
this.set = set;
|
||||
return this;
|
||||
}
|
||||
private UnicodeSet set;
|
||||
private int script;
|
||||
}
|
||||
|
||||
UnicodeSetToScript[][] scriptToUnicodeSetToScript;
|
||||
UnicodeSet[] fastReject;
|
||||
boolean finished = false;
|
||||
|
||||
void finish() {
|
||||
if (finished) return;
|
||||
for (int j = 0; j < UScript.CODE_LIMIT; ++j) {
|
||||
if (j == UScript.COMMON || j == UScript.INHERITED) continue;
|
||||
if (script_representables[j].size() == 0) continue;
|
||||
UnicodeSet accept = new UnicodeSet();
|
||||
List curr = new ArrayList();
|
||||
for (int k = 0; k < UScript.CODE_LIMIT; ++k) {
|
||||
if (k == UScript.COMMON || k == UScript.INHERITED) continue;
|
||||
if (script_representables[k].size() == 0) continue;
|
||||
|
||||
if (script_set[j].containsNone(script_representables[k])) continue;
|
||||
UnicodeSet items = new UnicodeSet(script_set[j]).retainAll(script_representables[k]);
|
||||
UnicodeSetToScript uss = new UnicodeSetToScript().setScript(k).setSet(items);
|
||||
curr.add(uss);
|
||||
}
|
||||
scriptToUnicodeSetToScript[j] = (UnicodeSetToScript[]) curr.toArray(new UnicodeSetToScript[curr.size()]);
|
||||
fastReject[j] = accept.complement();
|
||||
}
|
||||
finished = true;
|
||||
}
|
||||
|
||||
void write(PrintWriter out) throws IOException {
|
||||
for (int j = 0; j < UScript.CODE_LIMIT; ++j) {
|
||||
if (j == UScript.COMMON || j == UScript.INHERITED) continue;
|
||||
if (script_representables[j].size() == 0) continue;
|
||||
for (int k = 0; k < UScript.CODE_LIMIT; ++k) {
|
||||
if (k == UScript.COMMON || k == UScript.INHERITED) continue;
|
||||
if (script_representables[k].size() == 0) continue;
|
||||
|
||||
if (script_set[j].containsNone(script_representables[k])) continue;
|
||||
UnicodeSet items = new UnicodeSet(script_set[j]).retainAll(script_representables[k]);
|
||||
String sname = UScript.getShortName(j) + "; " + UScript.getShortName(k) + "; " + label;
|
||||
String name = UScript.getName(j) + "; " + UScript.getName(k);
|
||||
out.println("# " + name + ": " + items.toPattern(false));
|
||||
out.println("");
|
||||
bf.setValueSource(sname);
|
||||
bf.showSetNames(out, items);
|
||||
out.println("");
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* for this routine, we don't care what the targetScripts are,
|
||||
* just whether there is at least one whole-script confusable.
|
||||
*/
|
||||
boolean hasWholeScriptConfusable(String givenString, BitSet resultingScripts) {
|
||||
int givenScript = getSingleScript(givenString);
|
||||
if (givenScript == UScript.INVALID_CODE) {
|
||||
throw new IllegalArgumentException("Not single script string");
|
||||
}
|
||||
UnicodeSet givenSet = new UnicodeSet()
|
||||
.addAll(givenString)
|
||||
.removeAll(commonAndInherited);
|
||||
return hasWholeScriptConfusable(givenScript, givenSet, resultingScripts);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private boolean hasWholeScriptConfusable(int givenScript, UnicodeSet givenSet, BitSet resultingScripts) {
|
||||
resultingScripts.clear();
|
||||
if (fastReject[givenScript].containsSome(givenSet)) return false;
|
||||
UnicodeSetToScript[] possibles = scriptToUnicodeSetToScript[givenScript];
|
||||
for (int i = 0; i < possibles.length; ++i) {
|
||||
if (possibles[i].set.containsAll(givenSet)) {
|
||||
resultingScripts.set(possibles[i].script);
|
||||
}
|
||||
}
|
||||
return resultingScripts.isEmpty();
|
||||
}
|
||||
/*
|
||||
* for this routine, we don't care what the targetScripts are,
|
||||
* just whether there is at least one whole-script confusable.
|
||||
*/
|
||||
boolean hasMixedScriptConfusable(String givenString) {
|
||||
// UnicodeSet givenSet = new UnicodeSet()
|
||||
// .addAll(givenString)
|
||||
// .removeAll(commonAndInherited);
|
||||
// BitSet givenScripts = getScriptsIn(givenString);
|
||||
// for (int i = 0; i < givenScripts.length(); ++i) {
|
||||
// ;
|
||||
// }
|
||||
// UnicodeSet givenSet = new UnicodeSet()
|
||||
// .addAll(givenString)
|
||||
// .removeAll(commonAndInherited);
|
||||
// resultingScripts.clear();
|
||||
// if (fastReject[givenScript].containsSome(givenSet)) return false;
|
||||
// UnicodeSetToScript[] possibles = scriptToUnicodeSets[givenScript];
|
||||
// for (int i = 0; i < possibles.length; ++i) {
|
||||
// if (possibles[i].set.containsAll(givenSet)) {
|
||||
// resultingScripts.set(possibles[i].script);
|
||||
// }
|
||||
// }
|
||||
// return resultingScripts.isEmpty();
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*
|
||||
@ -1434,12 +1749,15 @@ public class GenerateConfusables {
|
||||
DataSet ds = new DataSet();
|
||||
ds.addUnicodeMap(nfcMap, "nfc", "nfc");
|
||||
ds.close("*");
|
||||
ds.write(outdir, "new-decomp.txt", false, false);
|
||||
//ds.write(outdir, "new-decomp.txt", false, false);
|
||||
total.addAll(ds);
|
||||
total.close("*");
|
||||
total.writeSummary(outdir, "confusablesSummary.txt", false);
|
||||
total.writeSummary(outdir, "confusablesSummaryOutput.txt", true);
|
||||
total.write(outdir, "confusables.txt", false, false);
|
||||
total.writeSummary(outdir, "confusablesSummary.txt", false, null);
|
||||
total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null);
|
||||
//total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true,
|
||||
// new UnicodeSet("[[:script=Cyrillic:][:script=common:][:script=inherited:]]"));
|
||||
total.writeWholeScripts(outdir, "confusablesWholeScript.txt");
|
||||
total.writeSourceOrder(outdir, "confusables.txt", false, false);
|
||||
//DataSet clean = total.clean();
|
||||
//clean.write(outdir, "confusables.txt", true);
|
||||
}
|
||||
@ -1586,9 +1904,11 @@ public class GenerateConfusables {
|
||||
MARK_NFC = new Integer(40),
|
||||
MARK_INPUT_LENIENT = new Integer(30),
|
||||
MARK_INPUT_STRICT = new Integer(20),
|
||||
MARK_OUTPUT = new Integer(10);
|
||||
MARK_OUTPUT = new Integer(10),
|
||||
MARK_ASCII = new Integer(10);
|
||||
|
||||
static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();
|
||||
|
||||
static class _BetterTargetIsLess implements Comparator {
|
||||
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
|
||||
|
||||
|
@ -132,6 +132,15 @@ public class XEquivalenceClass {
|
||||
return Collections.unmodifiableSet(aPartitionSet);
|
||||
}
|
||||
|
||||
public Set getEquivalenceSets() {
|
||||
Set result = new HashSet();
|
||||
for (Iterator it = toPartitionSet.keySet().iterator(); it.hasNext();) {
|
||||
Object item = it.next();
|
||||
Set partition = (Set) toPartitionSet.get(item);
|
||||
result.add(Collections.unmodifiableSet(partition));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
/**
|
||||
* returns true iff a is equivalent to b (or a.equals b)
|
||||
*
|
||||
|
Loading…
Reference in New Issue
Block a user