ICU-0 misc fixes
X-SVN-Rev: 17717
This commit is contained in:
parent
0176a784d1
commit
65e8ccde28
480
tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
Normal file
480
tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
Normal file
@ -0,0 +1,480 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
|
||||
* $Date: 2005/05/27 21:40:51 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.ArrayComparator;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class GenerateConfusables {
|
||||
// Shared log writer: main() opens it before generating and closes it afterwards;
// DataSet.clean() writes its diagnostics here.
static PrintWriter log;
|
||||
// U+2192, placed between source and target in generated comments and log lines.
static final String ARROW = "\u2192";
|
||||
|
||||
static class Data2 {
|
||||
String source;
|
||||
String target;
|
||||
int count;
|
||||
Data2(String target, int count) {
|
||||
this.target = target;
|
||||
this.count = count;
|
||||
}
|
||||
}
|
||||
|
||||
// Property source built with an empty version string.
// NOTE(review): presumably "" selects the default UCD version - confirm in ToolUnicodePropertySource.
static ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
// Union of the general categories Cn, Co, Cc and Cf. DataSet.add(String, ...) drops a
// mapping when its source or its target lies entirely inside this set.
// NOTE(review): addAll is applied to the set returned by getSet("gc=Cn"); verify getSet
// returns a fresh set rather than a cached one.
static UnicodeSet skipSet = ups.getSet("gc=Cn").addAll(ups.getSet("gc=Co")).addAll(ups.getSet("gc=Cc")).addAll(ups.getSet("gc=Cf"));
|
||||
|
||||
static class Data implements Comparable {
|
||||
String source;
|
||||
String target;
|
||||
String type;
|
||||
Data(String source, String target, String type) {
|
||||
this.source = source;
|
||||
this.target = target;
|
||||
this.type = type;
|
||||
}
|
||||
public int compareTo(Object o) {
|
||||
int result;
|
||||
Data that = (Data)o;
|
||||
if (0 != (result = target.compareTo(that.target))) return result;
|
||||
if (0 != (result = source.compareTo(that.source))) return result;
|
||||
if (0 != (result = type.compareTo(that.type))) return result;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Characters matching [:Cc:]; DataSet.add(Data, String) prints a warning for any
// mapping whose source or target contains one of them.
static UnicodeSet controls = new UnicodeSet("[:Cc:]");
|
||||
|
||||
static class DataSet {
|
||||
// Every Data held by this DataSet, kept in Data.compareTo order.
Set dataSet = new TreeSet();
|
||||
// Index of the same Data objects keyed by a two-element String[] {source, target}.
// NOTE(review): assumes ArrayComparator applies the two string comparators element by element - confirm.
Map dataMap = new TreeMap(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), new UTF16.StringComparator()}));
|
||||
|
||||
public DataSet add(String source, String target, String type, String errorLine) {
|
||||
if (skipSet.containsAll(source) || skipSet.containsAll(target)) return this;
|
||||
String nsource = Default.nfkd().normalize(source);
|
||||
String ntarget = Default.nfkd().normalize(target);
|
||||
|
||||
// if it is just a compatibility match, return
|
||||
if (nsource.equals(ntarget)) return this;
|
||||
|
||||
if (type.startsWith("confusables-")) type = type.substring("confusables-".length());
|
||||
if (type.endsWith(".txt")) type = type.substring(0,type.length() - ".txt".length());
|
||||
|
||||
// if it is base + combining sequence => base2 + same combining sequence, do just the base
|
||||
int nsourceFirst = UTF16.charAt(nsource,0);
|
||||
String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst));
|
||||
int ntargetFirst = UTF16.charAt(ntarget,0);
|
||||
String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst));
|
||||
if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) {
|
||||
source = UTF16.valueOf(nsourceFirst);
|
||||
target = UTF16.valueOf(ntargetFirst);
|
||||
type += "-base";
|
||||
}
|
||||
|
||||
// swap order
|
||||
if (preferSecondAsSource(source, target)) {
|
||||
String temp = target;
|
||||
target = source;
|
||||
source = temp;
|
||||
}
|
||||
if (target.indexOf('\u203D') >= 0) type += "-skip";
|
||||
Data newData = new Data(source, target, type);
|
||||
return add(newData, errorLine);
|
||||
}
|
||||
/**
|
||||
* @param errorLine TODO
|
||||
*
|
||||
*/
|
||||
private DataSet add(Data newData, String errorLine) {
|
||||
if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) {
|
||||
System.out.println("Problem with " + errorLine);
|
||||
System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
|
||||
}
|
||||
String[] key = {newData.source, newData.target};
|
||||
Data old = (Data) dataMap.get(key);
|
||||
if (old == null) {
|
||||
dataSet.add(newData);
|
||||
dataMap.put(key, newData);
|
||||
}else {
|
||||
old.type = old.type + "/" + newData.type;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
// Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
|
||||
// File layouts understood by addFile: NORMAL and FOLDING lines carry hex source/target
// fields; OLD lines carry a literal target followed by literal sources.
static final int NORMAL = 0, FOLDING = 1, OLD = 2;
|
||||
|
||||
public DataSet addFile(String directory, String filename) throws IOException {
|
||||
BufferedReader in = BagFormatter.openUTF8Reader(directory, filename);
|
||||
int kind = NORMAL;
|
||||
if (filename.indexOf("Folding") >= 0) kind = FOLDING;
|
||||
else if (false && filename.indexOf("-old") >= 0) kind = OLD;
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
String[] pieces = Utility.split(line,';');
|
||||
if (pieces.length < 2) {
|
||||
System.out.println("Error on: " + line);
|
||||
continue;
|
||||
}
|
||||
String type = filename;
|
||||
if (kind==FOLDING) {
|
||||
String source = Utility.fromHex(pieces[0].trim(),true);
|
||||
String target = Utility.fromHex(pieces[1].trim(),true);
|
||||
String nsource = Default.nfkd().normalize(source);
|
||||
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
|
||||
if (!first.equals(target)) {
|
||||
add(source, target, type, line);
|
||||
}
|
||||
} else if (kind == OLD) {
|
||||
String target = pieces[0].trim();
|
||||
for (int i = 1; i < pieces.length; ++i) {
|
||||
add(pieces[i].trim(), target, type, line);
|
||||
}
|
||||
} else {
|
||||
String source = Utility.fromHex(pieces[0].trim(),true);
|
||||
String target = Utility.fromHex(pieces[1].trim(),true);
|
||||
if (pieces.length > 2) type = pieces[2].trim();
|
||||
add(source, target, type, line);
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
return this;
|
||||
}
|
||||
public void write(String directory, String filename, boolean appendFile) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
if (appendFile) {
|
||||
String[] replacements = {"%date%", Default.getDate()};
|
||||
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
|
||||
Utility.UTF8_WINDOWS, out, replacements);
|
||||
}
|
||||
for (Iterator it = dataSet.iterator(); it.hasNext();) {
|
||||
Data item = (Data) it.next();
|
||||
out.println(
|
||||
Utility.hex(item.source)
|
||||
+ " ;\t" + Utility.hex(item.target)
|
||||
+ " ;\t" + item.type
|
||||
+ "\t# "
|
||||
+ "( " + item.source + " " + ARROW + " " + item.target + ") "
|
||||
+ Default.ucd().getName(item.source) + " " + ARROW + " "
|
||||
+ Default.ucd().getName(item.target));
|
||||
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public void add(DataSet ds) {
|
||||
for (Iterator it = ds.dataSet.iterator(); it.hasNext();) {
|
||||
add((Data)it.next(), "");
|
||||
}
|
||||
}
|
||||
public DataSet clean() {
|
||||
// remove all skips
|
||||
DataSet tempSet = new DataSet();
|
||||
Map m = new HashMap();
|
||||
for (Iterator it = dataSet.iterator(); it.hasNext();) {
|
||||
Data d = (Data) it.next();
|
||||
if (d.type.indexOf("skip") >= 0) continue;
|
||||
String newTarget = Default.nfkd().normalize(d.target);
|
||||
String newSource = Default.nfkd().normalize(d.source);
|
||||
String type = d.type;
|
||||
if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
|
||||
type += "-nf";
|
||||
log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
|
||||
log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
|
||||
continue;
|
||||
}
|
||||
// swap order
|
||||
if (preferSecondAsSource(newSource, newTarget)) {
|
||||
String temp = newTarget;
|
||||
newTarget = newSource;
|
||||
newSource = temp;
|
||||
}
|
||||
|
||||
Data already = (Data) m.get(newSource);
|
||||
if (already != null && !newTarget.equals(already.target)) {
|
||||
log.println("X " + getCodeCharName(newSource) + " " + ARROW);
|
||||
log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
|
||||
log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
|
||||
if (preferSecondAsSource(already.target, newTarget)) {
|
||||
// just fix new guy
|
||||
type += "[" + newSource + "]" + already.type;
|
||||
newSource = newTarget;
|
||||
newTarget = already.target;
|
||||
} else {
|
||||
// need to fix new guy, AND fix old guy.
|
||||
tempSet.remove(already);
|
||||
type += "[" + newSource + "]" + already.type;
|
||||
newSource = already.target;
|
||||
already.type += "[" + already.target + "]" + type;
|
||||
already.target = newTarget;
|
||||
tempSet.add(already, "");
|
||||
}
|
||||
}
|
||||
Data newData = new Data(newSource, newTarget, type);
|
||||
m.put(newSource, newData);
|
||||
tempSet.add(newData, "");
|
||||
}
|
||||
// now recursively apply
|
||||
DataSet s = new DataSet();
|
||||
for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
|
||||
Data d = (Data) it.next();
|
||||
int cp = 0;
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(d.target, i);
|
||||
String src = UTF16.valueOf(cp);
|
||||
while (true) {
|
||||
Data rep = (Data) m.get(src);
|
||||
if (rep == null) break;
|
||||
src = rep.target;
|
||||
}
|
||||
result.append(src);
|
||||
}
|
||||
String newTarget = result.toString();
|
||||
newTarget = Default.nfkd().normalize(newTarget);
|
||||
s.add(d.source, newTarget, d.type + (newTarget.equals(newTarget) ? "" : "-rec"), "");
|
||||
}
|
||||
return s;
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private void remove(Data already) {
|
||||
String[] key = {already.source, already.target};
|
||||
dataMap.remove(key);
|
||||
dataSet.remove(already);
|
||||
}
|
||||
}
|
||||
public static void main(String[] args) throws IOException {
|
||||
String indir = Utility.BASE_DIR + "confusables/";
|
||||
String outdir = Utility.GEN_DIR + "confusables/";
|
||||
log = BagFormatter.openUTF8Writer(outdir, "log.txt");
|
||||
//fixMichel(indir, outdir);
|
||||
generateConfusables(indir, outdir);
|
||||
log.close();
|
||||
System.out.println("Done");
|
||||
}
|
||||
/**
|
||||
* @throws IOException
|
||||
*
|
||||
*/
|
||||
private static void fixMichel(String indir, String outdir) throws IOException {
|
||||
BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
String[] pieces = Utility.split(line,'\t');
|
||||
if (pieces.length < 2) {
|
||||
out.println(line);
|
||||
continue;
|
||||
}
|
||||
String source = Utility.fromHex(pieces[0].trim());
|
||||
if (Default.nfkd().isNormalized(source)) {
|
||||
out.println(line);
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
out.close();
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void generateConfusables(String indir, String outdir) throws IOException {
|
||||
File dir = new File(indir);
|
||||
String[] names = dir.list();
|
||||
DataSet total = new DataSet();
|
||||
for (int i = 0; i < names.length; ++i) {
|
||||
if (new File(indir + names[i]).isDirectory()) continue;
|
||||
System.out.println(names[i]);
|
||||
DataSet ds = new DataSet();
|
||||
ds.addFile(indir, names[i]);
|
||||
ds.write(outdir, "new-" + names[i], false);
|
||||
total.add(ds);
|
||||
}
|
||||
total.write(outdir, "confusables-raw.txt", false);
|
||||
DataSet clean = total.clean();
|
||||
clean.write(outdir, "confusables.txt", true);
|
||||
}
|
||||
/*
|
||||
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
|
||||
Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(),
|
||||
new UTF16.StringComparator()}));
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
String[] pieces = Utility.split(line,';');
|
||||
if (pieces.length < 2) {
|
||||
System.out.println("Error on: " + line);
|
||||
continue;
|
||||
}
|
||||
String source = Utility.fromHex(pieces[0].trim());
|
||||
String target = Utility.fromHex(pieces[1].trim());
|
||||
String nsource = Default.nfkd().normalize(source);
|
||||
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
|
||||
if (!first.equals(target)) {
|
||||
set.add(new String[]{source, target});
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
|
||||
}
|
||||
public static void gen() throws IOException {
|
||||
Map m = new TreeMap();
|
||||
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
String[] pieces = Utility.split(line,';');
|
||||
if (pieces.length < 3) {
|
||||
System.out.println("Error on: " + line);
|
||||
continue;
|
||||
}
|
||||
int codepoint = Integer.parseInt(pieces[1], 16);
|
||||
int cat = Default.ucd().getCategory(codepoint);
|
||||
if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
|
||||
if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
|
||||
String result = Utility.fromHex(pieces[0]);
|
||||
if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
|
||||
int count = Integer.parseInt(pieces[2]);
|
||||
String source = UTF16.valueOf(codepoint);
|
||||
add(m, source, result, count);
|
||||
}
|
||||
in.close();
|
||||
|
||||
in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
line = line.trim();
|
||||
int pos = line.indexOf("#");
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
if (line.length() == 0) continue;
|
||||
if (line.startsWith("@")) continue;
|
||||
String[] pieces = Utility.split(line,';');
|
||||
if (pieces.length < 2) {
|
||||
System.out.println("Error on: " + line);
|
||||
continue;
|
||||
}
|
||||
String source = pieces[0].trim();
|
||||
for (int i = 1; i < pieces.length; ++i) {
|
||||
add(m, source, pieces[i].trim(), -1);
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
|
||||
boolean gotOne;
|
||||
// close the set
|
||||
do {
|
||||
gotOne = false;
|
||||
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
|
||||
String source = (String) it.next();
|
||||
Data2 data = (Data2) m.get(source);
|
||||
Data2 data2 = (Data2) m.get(data.target);
|
||||
if (data2 == null) continue;
|
||||
data.target = data2.target;
|
||||
gotOne = true;
|
||||
break;
|
||||
}
|
||||
} while (gotOne);
|
||||
// put into different sorting order
|
||||
Set s = new TreeSet();
|
||||
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
|
||||
String source = (String) it.next();
|
||||
Data2 data = (Data2) m.get(source);
|
||||
s.add(new Data(source, data.target, data.count));
|
||||
}
|
||||
// write it out
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
|
||||
String[] replacements = {"%date%", Default.getDate()};
|
||||
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
|
||||
Utility.UTF8_WINDOWS, out, replacements);
|
||||
for (Iterator it = s.iterator(); it.hasNext();) {
|
||||
Data d = (Data) it.next();
|
||||
if (d == null) continue;
|
||||
out.println(formatLine(d.source, d.target, d.count));
|
||||
}
|
||||
|
||||
out.close();
|
||||
System.out.println("Done");
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static String formatLine(String source, String target, int count) {
|
||||
return Utility.hex(source) + " ; " + Utility.hex(target," ")
|
||||
+ " ; " + count
|
||||
+ " # "
|
||||
+ "(" + source + " " + ARROW + " " + target + ") "
|
||||
+ Default.ucd().getName(source)
|
||||
+ " " + ARROW + " " + Default.ucd().getName(target);
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void add(Map m, String source, String target, int count) {
|
||||
if (source.length() == 0 || target.length() == 0) return;
|
||||
if (preferSecondAsSource(source, target)) {
|
||||
String temp = target;
|
||||
target = source;
|
||||
source = temp;
|
||||
}
|
||||
Data2 other = (Data2) m.get(source);
|
||||
if (other != null) {
|
||||
if (target.equals(other.target)) return;
|
||||
System.out.println("conflict");
|
||||
System.out.println(formatLine(source, target, count));
|
||||
System.out.println(formatLine(source, other.target, other.count));
|
||||
// skip adding this, and instead add result -> other.target
|
||||
add(m, target, other.target, count);
|
||||
} else {
|
||||
m.put(source, new Data2(target, count));
|
||||
}
|
||||
};
|
||||
|
||||
static private boolean preferSecondAsSource(String a, String b) {
|
||||
// if first is longer, prefer second
|
||||
int ca = UTF16.countCodePoint(a);
|
||||
int cb = UTF16.countCodePoint(b);
|
||||
if (ca != cb) {
|
||||
return ca > cb;
|
||||
}
|
||||
// if first is lower, prefer second
|
||||
return a.compareTo(b) < 0;
|
||||
}
|
||||
|
||||
static String getCodeCharName(String a) {
|
||||
return Default.ucd().getCode(a) + "( " + a + " ) " + Default.ucd().getName(a);
|
||||
}
|
||||
|
||||
}
|
125
tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java
Normal file
125
tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java
Normal file
@ -0,0 +1,125 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
|
||||
* $Date: 2005/05/27 21:40:51 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
public final class GenerateNamedSequences implements UCD_Types {
|
||||
|
||||
// When true, showVarGlyphs echoes its arguments to standard output.
static final boolean DEBUG = false;
|
||||
|
||||
static public String showVarGlyphs(String code0, String code1, String shape, String description) {
|
||||
if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
|
||||
|
||||
String abbShape = "";
|
||||
if (shape.length() != 0) {
|
||||
abbShape = '-' + shape.substring(0,4);
|
||||
if (description.indexOf("feminine") >= 0) abbShape += "fem";
|
||||
}
|
||||
|
||||
return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
|
||||
+ "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
|
||||
}
|
||||
|
||||
/*
|
||||
# Field 0: the variation sequence
|
||||
# Field 1: the description of the desired appearance
|
||||
# Field 2: where the appearance is only different in particular shaping environments
|
||||
# this field lists them. The possible values are: isolated, initial, medial, final.
|
||||
# If more than one is present, there are spaces between them.
|
||||
*/
|
||||
static public void generate() throws IOException {
|
||||
|
||||
|
||||
// read the data and compose the table
|
||||
|
||||
String table = "<table><tr><th width='10%'>Rep Glyph</th><th>Hex Sequence</th><th>Name</th><th>Copyable</th></tr>";
|
||||
|
||||
String[] splits = new String[4];
|
||||
String[] codes = new String[20];
|
||||
String[] shapes = new String[4];
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1);
|
||||
Transliterator unicodexml = Transliterator.getInstance("hex/xml");
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line, ';', splits);
|
||||
String name = splits[0];
|
||||
int codeCount = Utility.split(splits[1], ' ', codes);
|
||||
StringBuffer codeBuffer = new StringBuffer();
|
||||
for (int i = 0; i < codeCount; ++i) {
|
||||
UTF16.append(codeBuffer, Integer.parseInt(codes[i],16));
|
||||
}
|
||||
String codeWithHyphens = splits[1].replaceAll("\\s", "-");
|
||||
String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+");
|
||||
String codeString = unicodexml.transliterate(codeBuffer.toString());
|
||||
|
||||
// <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
|
||||
|
||||
//table += "<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n";
|
||||
String imageName = "images/U" + codeWithHyphens + ".gif";
|
||||
if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) {
|
||||
String codeNoSpaces2 = splits[1].replaceAll("\\s", "");
|
||||
imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif";
|
||||
}
|
||||
table += "<tr>"
|
||||
+ "<td class='copy'><img alt='(" + codeAlt + ")' src='" + imageName + "'><br><tt>"
|
||||
+ splits[1] + "</tt></td>"
|
||||
+ "<td>" + splits[1] + "</td>"
|
||||
+ "</td><td>" + name + "</td>"
|
||||
+ "<td class='copy'>" + codeString + "</td>"
|
||||
+ "</tr>\n";
|
||||
System.out.println(splits[1] + "\t" + codeString);
|
||||
}
|
||||
in.close();
|
||||
table += "</table>";
|
||||
|
||||
// now write out the results
|
||||
|
||||
String directory = "DerivedData/";
|
||||
String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
|
||||
/*
|
||||
String[] batName = {""};
|
||||
String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
|
||||
|
||||
String version = Default.ucd().getVersion();
|
||||
int lastDot = version.lastIndexOf('.');
|
||||
String updateDirectory = version.substring(0,lastDot) + "-Update";
|
||||
int updateV = version.charAt(version.length()-1) - '0';
|
||||
if (updateV != 0) updateDirectory += (char)('1' + updateV);
|
||||
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
|
||||
*/
|
||||
|
||||
String[] replacementList = {
|
||||
"@revision@", Default.ucd().getVersion(),
|
||||
//"@updateDirectory@", updateDirectory,
|
||||
"@date@", Default.getDate(),
|
||||
"@table@", table};
|
||||
|
||||
Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList);
|
||||
|
||||
out.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
|
||||
}
|
||||
}
|
515
tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java
Normal file
515
tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java
Normal file
@ -0,0 +1,515 @@
|
||||
/*
|
||||
* Created on May 3, 2005
|
||||
* Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
|
||||
* For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
*/
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.CollectionUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.text.UTF16.StringComparator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator;
|
||||
import com.ibm.text.UCD.TestData.RegexMatcher;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
|
||||
class GenerateStringPrep implements UCD_Types {
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
//checkChars(false);
|
||||
new GenerateStringPrep().genStringPrep();
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
// Per-script sets filled by genStringPrep, indexed by the script code from ucd.getScript.
UnicodeSet[] coreChars = new UnicodeSet[100];
// Code points that are not NFD-normalized.
UnicodeSet decomposable = new UnicodeSet();
UnicodeMap suspect = new UnicodeMap();

ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
// Extra characters treated as word characters; populated by the initializer below.
UnicodeSet wordChars = new UnicodeSet();
{
    if (false) {
        wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
        wordChars.retainAll(ups.getSet("gc=Sk"));
    }
    wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
            " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
            " \\u055A \\u02B9 \\u02BA]"));
    //wordChars.removeAll(xid_continue);
}

UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me"))
        .addAll(ups.getSet("gc=Mn"))
        .removeAll(ups.getSet("Default_Ignorable_Code_Point=true"));

UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);

//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();

static Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
// uca0 is static, so it is configured once at class initialization. This used to be an
// instance initializer, which left the strength unset until an instance was constructed.
static {
    uca0.setStrength(Collator.IDENTICAL);
}
// Collation order first, then the UTF16 string comparator.
static GenerateHanTransliterator.MultiComparator uca
        = new GenerateHanTransliterator.MultiComparator(new Comparator[] {
            uca0, new UTF16.StringComparator()});

UnicodeSet bidiR = new UnicodeSet(
        "[[:Bidi_Class=AL:][:Bidi_Class=R:]]");

UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
// Code points equal to their own full uppercase mapping.
UnicodeSet hasNoUpper = new UnicodeSet();
// hasNoUpper without wordChars; computed in genStringPrep.
UnicodeSet hasNoUpperMinus = new UnicodeSet();
BagFormatter bf = new BagFormatter();
UnicodeSet inIDN = new UnicodeSet();
// Code points equal to their own full case folding.
UnicodeSet isCaseFolded = new UnicodeSet();
|
||||
void genStringPrep() throws IOException {
|
||||
//showScriptToBlock();
|
||||
bf.setShowLiteral(BagFormatter.toHTMLControl);
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
//bf.setValueSource(UnicodeLabel.NULL);
|
||||
if (false) {
|
||||
|
||||
System.out.println("word chars: " + bf.showSetNames(wordChars));
|
||||
System.out.println("pat: " + bf.showSetNames(patternProp));
|
||||
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
|
||||
}
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
||||
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
|
||||
// get IDNA
|
||||
int idnaType = getIDNAType(cp);
|
||||
idnaTypeSet[idnaType].add(cp);
|
||||
|
||||
String str = UTF16.valueOf(cp);
|
||||
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
|
||||
if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp);
|
||||
|
||||
// scripts
|
||||
int script = ucd.getScript(cp);
|
||||
if (coreChars[script] == null)
|
||||
coreChars[script] = new UnicodeSet();
|
||||
coreChars[script].add(cp);
|
||||
}
|
||||
// fix characters with no uppercase
|
||||
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
|
||||
System.out.println(bf.showSetNames(hasNoUpper));
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
|
||||
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
|
||||
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
|
||||
textOut.println('\uFEFF');
|
||||
textOut.println("For documentation, see idn-chars.html");
|
||||
|
||||
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
|
||||
new String[] {"%date%", Default.getDate()});
|
||||
/*
|
||||
out
|
||||
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
out.println("<title>IDN Characters</title><style>");
|
||||
out.println("<!--");
|
||||
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
|
||||
out.println(".Atomic { background-color: #CCCCFF }");
|
||||
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
|
||||
out.println(".Non-XID { background-color: #FFCCCC }");
|
||||
out.println(".Decomposable { background-color: #FFFFCC }");
|
||||
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
|
||||
|
||||
out.println("th { text-align: left }");
|
||||
out.println("-->");
|
||||
out.println("</style></head><body><table>");
|
||||
*/
|
||||
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
|
||||
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
|
||||
|
||||
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
|
||||
if (scriptCode == COMMON_SCRIPT
|
||||
|| scriptCode == INHERITED_SCRIPT)
|
||||
continue;
|
||||
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
|
||||
}
|
||||
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
|
||||
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
|
||||
|
||||
showCodes(htmlOut, textOut, non_spacing);
|
||||
htmlOut.println("</table></body></html>");
|
||||
htmlOut.close();
|
||||
htmlOut2.println("</table></body></html>");
|
||||
htmlOut2.close();
|
||||
bf.setMergeRanges(false);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
|
||||
textOut.println();
|
||||
bf.setValueSource("word-chars");
|
||||
bf.showSetNames(textOut, wordChars);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** FOR REVIEW ***");
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
|
||||
textOut.println();
|
||||
String value = (String)it.next();
|
||||
bf.setValueSource(value);
|
||||
bf.showSetNames(textOut, suspect.getSet(value));
|
||||
}
|
||||
textOut.close();
|
||||
textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt");
|
||||
bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
textOut.println();
|
||||
textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***");
|
||||
UnicodeSet U32 = ups32.getSet("gc=cn").complement();
|
||||
UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32);
|
||||
bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]);
|
||||
textOut.close();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private void showScriptToBlock() {
|
||||
UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
|
||||
UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
|
||||
UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
|
||||
public Object compose(Object a, Object b) {
|
||||
return a + "\t" + b;
|
||||
}
|
||||
};
|
||||
UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
|
||||
for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
|
||||
System.out.println(it.next());
|
||||
}
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
|
||||
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
|
||||
|
||||
static String[][] script_to_gif = {
|
||||
|
||||
{"Common","common.gif"}, //Miscellaneous_Symbols
|
||||
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
|
||||
{"Arabic","arabic.gif"}, //Arabic
|
||||
{"Armenian","armenian.gif"}, //Armenian
|
||||
{"Bengali","bengali.gif"}, //Bengali
|
||||
{"Bopomofo","bopomofo.gif"}, //Bopomofo
|
||||
{"Braille","braillesymbols.gif"}, //Braille_Patterns
|
||||
{"Buginese","buginese.gif"}, //Buginese
|
||||
{"Buhid","buhid.gif"}, //Buhid
|
||||
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
|
||||
{"Cherokee","cherokee.gif"}, //Cherokee
|
||||
{"Coptic","coptic.gif"}, //Coptic
|
||||
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
|
||||
{"Cyrillic","cyrillic.gif"}, //Cyrillic
|
||||
{"Deseret","deseret.gif"}, //Deseret
|
||||
{"Devanagari","devanagari.gif"}, //Devanagari
|
||||
{"Ethiopic","ethiopic.gif"}, //Ethiopic
|
||||
{"Georgian","georgian.gif"}, //Georgian
|
||||
{"Glagolitic","glagolitic.gif"}, //Glagolitic
|
||||
{"Gothic","gothic.gif"}, //Gothic
|
||||
{"Greek","greek.gif"}, //Greek_and_Coptic
|
||||
{"Gujarati","gujarati.gif"}, //Gujarati
|
||||
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
|
||||
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
|
||||
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
|
||||
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
|
||||
{"Hanunoo","hanunoo.gif"}, //Hanunoo
|
||||
{"Hebrew","hebrew.gif"}, //Hebrew
|
||||
{"Hiragana","hiragana.gif"}, //Hiragana
|
||||
{"Kannada","kannada.gif"}, //Kannada
|
||||
{"Katakana","katakana.gif"}, //Katakana
|
||||
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
|
||||
{"Khmer","khmer.gif"}, //Khmer
|
||||
{"Lao","lao.gif"}, //Lao
|
||||
{"Latin","latin.gif"}, //Basic_Latin
|
||||
{"Limbu","limbu.gif"}, //Limbu
|
||||
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
|
||||
{"Malayalam","malayalam.gif"}, //Malayalam
|
||||
{"Mongolian","mongolian.gif"}, //Mongolian
|
||||
{"Myanmar","myanmar.gif"}, //Myanmar
|
||||
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
|
||||
{"Ogham","ogham.gif"}, //Ogham
|
||||
{"Old_Italic","olditalic.gif"}, //Old_Italic
|
||||
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
|
||||
{"Oriya","oriya.gif"}, //Oriya
|
||||
{"Osmanya","osmanya.gif"}, //Osmanya
|
||||
{"Runic","runic.gif"}, //Runic
|
||||
{"Shavian","shavian.gif"}, //Shavian
|
||||
{"Sinhala","sinhala.gif"}, //Sinhala
|
||||
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
|
||||
{"Syriac","syriac.gif"}, //Syriac
|
||||
{"Tagalog","tagalog.gif"}, //Tagalog
|
||||
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
|
||||
{"Tai_Le","taile.gif"}, //Tai_Le
|
||||
{"Tamil","tamil.gif"}, //Tamil
|
||||
{"Telugu","telugu.gif"}, //Telugu
|
||||
{"Thaana","thaana.gif"}, //Thaana
|
||||
{"Thai","thai.gif"}, //Thai
|
||||
{"Tibetan","tibetan.gif"}, //Tibetan
|
||||
{"Tifinagh","tifinagh.gif"}, //Tifinagh
|
||||
{"Ugaritic","ugaritic.gif"}, //Ugaritic
|
||||
{"Yi","yi.gif"}, //Yi_Syllables
|
||||
|
||||
};
|
||||
|
||||
/**
 * Classification codes produced by getIDNAType; each one is also the index of
 * the matching bucket in {@link #idnaTypeSet}. IDNA_TYPE_LIMIT is the number
 * of codes.
 */
static final int OK = 0;
static final int DELETED = 1;
static final int ILLEGAL = 2;
static final int REMAPPED = 3;
static final int IDNA_TYPE_LIMIT = 4;

/** One set of code points per classification code, all starting out empty. */
UnicodeSet[] idnaTypeSet = new UnicodeSet[IDNA_TYPE_LIMIT];
{
    int slot = 0;
    while (slot < idnaTypeSet.length) {
        idnaTypeSet[slot] = new UnicodeSet();
        ++slot;
    }
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private int getIDNAType(int cp) {
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.DEFAULT); // USE_STD3_RULES
|
||||
if (intermediate.length() == 0)
|
||||
return DELETED;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
return ILLEGAL;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
return ILLEGAL;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
return REMAPPED;
|
||||
return OK;
|
||||
}
|
||||
StringBuffer inbuffer = new StringBuffer();
|
||||
StringBuffer intermediate, outbuffer;
|
||||
|
||||
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param scriptCode
|
||||
* @param htmlOut2 TODO
|
||||
* @param ucd
|
||||
* @param coreChars
|
||||
* @param decompChars
|
||||
*/
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
|
||||
if (coreChars[scriptCode] == null) return;
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
|
||||
script = Utility.getUnskeleton(script.toLowerCase(),true);
|
||||
System.out.println(script);
|
||||
|
||||
htmlOut.println();
|
||||
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
|
||||
+ "'> Script: " + script + "</th></tr>";
|
||||
htmlOut.println(scriptLine);
|
||||
htmlOut2.println(scriptLine);
|
||||
textOut.println();
|
||||
textOut.println("#*** Script: " + script + " ***");
|
||||
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
|
||||
|
||||
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
|
||||
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
|
||||
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
|
||||
|
||||
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
|
||||
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
|
||||
|
||||
UnicodeSet decomp = extract(decomposable, core);
|
||||
UnicodeSet pattern = extract(patternProp, core);
|
||||
UnicodeSet non_id = extract(not_xid_continue, core);
|
||||
|
||||
UnicodeSet bicameralNoupper = new UnicodeSet();
|
||||
if (!hasNoUpper.containsAll(core)) {
|
||||
bicameralNoupper = extract(hasNoUpperMinus, core);
|
||||
}
|
||||
|
||||
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
|
||||
String cat = Default.ucd().getCategoryID(it.codepoint);
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name.indexOf("MUSICAL SYMBOL") >= 0
|
||||
|| name.indexOf("DINGBA") >= 0
|
||||
|| name.indexOf("RADICAL ") >= 0
|
||||
) cat = "XX";
|
||||
suspect.put(it.codepoint, cat);
|
||||
}
|
||||
|
||||
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca);
|
||||
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca);
|
||||
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca);
|
||||
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca);
|
||||
|
||||
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca);
|
||||
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca);
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca);
|
||||
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca);
|
||||
}
|
||||
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException {
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT);
|
||||
script = Utility.getUnskeleton(script.toLowerCase(),true);
|
||||
String scriptLine = "<tr><th class='script'><img src='images/"
|
||||
+ ((String)scriptToGif.get(script)).toLowerCase()
|
||||
+ "'> Script: " + script + "</th></tr>";
|
||||
htmlOut.println(scriptLine);
|
||||
UnicodeMap m = getPositions();
|
||||
|
||||
for (Iterator it = m.getAvailableValues(new TreeSet(uca)).iterator(); it.hasNext(); ) {
|
||||
String type = (String) it.next();
|
||||
UnicodeSet current = m.getSet(type).retainAll(non_spacing);
|
||||
if (current.size() == 0) continue;
|
||||
printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type, current, INHERITED_SCRIPT, positionComparator);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*
|
||||
*/
|
||||
private UnicodeMap getPositions() throws IOException {
|
||||
UnicodeMap result = new UnicodeMap();
|
||||
BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt");
|
||||
String type="Undetermined";
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
if (line.startsWith("@")) {
|
||||
type = line.substring(1);
|
||||
continue;
|
||||
}
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
String code = Utility.fromHex(pieces[0]);
|
||||
result.put(UTF16.charAt(code,0), type);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static Comparator positionComparator = new Comparator() {
|
||||
public int compare(Object o1, Object o2) {
|
||||
String s1 = (String)o1;
|
||||
String s2 = (String)o2;
|
||||
return Default.ucd().getName(s1).compareTo(Default.ucd().getName(s2));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
|
||||
UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
|
||||
core.removeAll(decomp);
|
||||
return decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param script TODO
|
||||
* @param unicodeset
|
||||
* @param scriptCode
|
||||
* @param comparator TODO
|
||||
* @param uca
|
||||
*/
|
||||
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
|
||||
String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) {
|
||||
if (unicodeset == null)
|
||||
return;
|
||||
int size = unicodeset.size();
|
||||
String dir = unicodeset.containsSome(bidiR)
|
||||
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
|
||||
htmlOut.println("<tr><th class='" + title + "'><a href='#" +
|
||||
title + "'>" + title + "</a> ("
|
||||
+ TestData.nf.format(size) + ")</th></tr>");
|
||||
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
|
||||
// <a href="#Atomic">categorization</a>
|
||||
textOut.println();
|
||||
textOut.println("# " + title);
|
||||
bf.setValueSource(script + " ; " + title);
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator();
|
||||
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
|
||||
usi.reset(unicodeset);
|
||||
while (usi.nextRange()) {
|
||||
if (usi.codepoint == usi.codepointEnd) {
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint)));
|
||||
} else {
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint))
|
||||
+ ".. "
|
||||
+ formatCode(UTF16
|
||||
.valueOf(usi.codepointEnd)));
|
||||
}
|
||||
}
|
||||
bf.showSetNames(textOut, unicodeset);
|
||||
} else {
|
||||
Set reordered = new TreeSet(comparator);
|
||||
usi.reset(unicodeset);
|
||||
while (usi.next()) {
|
||||
String x = usi.getString();
|
||||
boolean foo = reordered.add(x);
|
||||
if (!foo)
|
||||
throw new IllegalArgumentException("Collision with "
|
||||
+ Default.ucd().getCodeAndName(x));
|
||||
}
|
||||
for (Iterator it = reordered.iterator(); it.hasNext();) {
|
||||
Object key = it.next();
|
||||
htmlOut.print(formatCode((String)key));
|
||||
}
|
||||
bf.showSetNames(textOut, reordered);
|
||||
}
|
||||
htmlOut.println("</td></tr>");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string
|
||||
* @return
|
||||
*/
|
||||
private String formatCode(String string) {
|
||||
int cat = ucd.getCategory(UTF16.charAt(string,0));
|
||||
String pad = "\u00A0", pad1 = pad;
|
||||
if (cat == Me || cat == Mn) {
|
||||
pad = "\u00A0\u00A0";
|
||||
pad1 = "\u00A0\u00A0\u25cc";
|
||||
}
|
||||
return "<span title='" + ucd.getCodeAndName(string) + "'>"
|
||||
+ pad1
|
||||
+ BagFormatter.toHTMLControl.transliterate(string)
|
||||
+ pad
|
||||
+ "</span> ";
|
||||
}
|
||||
}
|
153
tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html
Normal file
153
tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html
Normal file
@ -0,0 +1,153 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<meta name="keywords" content="unicode, named sequences">
|
||||
<meta name="description" content="Describes and displays named sequences">
|
||||
<title>Named Sequences</title>
|
||||
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
|
||||
<style>
|
||||
<!--
|
||||
.copy { text-align: center; font-size: 150% }
|
||||
th, td { vertical-align: middle }
|
||||
tt { font-size: 8pt }
|
||||
table { padding: 2pt }
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body bgcolor="#ffffff">
|
||||
|
||||
<table class="header">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org">
|
||||
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a> <a class="bar" href="http://www.unicode.org/ucd">Unicode
|
||||
Character Database</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="gray"> </td>
|
||||
</tr>
|
||||
</table>
|
||||
<div style="margin:1em">
|
||||
<table border="1" cellpadding="0" cellspacing="1" style="border-collapse: collapse" bordercolor="#111111" width="100%" id="AutoNumber1">
|
||||
<tr>
|
||||
<td width="100%">
|
||||
<p style="text-align: right">L2-XXX</p>
|
||||
<p><i>To: UTC<br>
|
||||
From: Mark Davis<br>
|
||||
Date: 2005-04-28</i></p>
|
||||
<p><i>One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html,
|
||||
following the pattern of StandardizedVariants.html. This document was generated along those
|
||||
lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable
|
||||
style modifications, of course) as a chart someplace accessible under
|
||||
<a href="http://unicode.org/charts/">http://unicode.org/charts/</a>.</i></p>
|
||||
<p><i>Alternatively, we could also combine this with the StandardizedVariants.html to provide
|
||||
a unified chart of sequences, again someplace under <a href="http://unicode.org/charts/">
|
||||
http://unicode.org/charts/</a>.</i></p>
|
||||
<p><i><b>Note:</b> we don't have some of the glyphs quite right yet, but it should be
|
||||
sufficient for discussing the format. One of the innovations is having a separate column of
|
||||
text for copy &amp; paste; that needs discussion also.</i></td>
|
||||
</tr>
|
||||
</table>
|
||||
<h1><i><font color="#990000"> PROPOSED WORKING DRAFT<br>
|
||||
</font></i>Named Sequences</h1>
|
||||
<table class="wide">
|
||||
<tr>
|
||||
<td valign="top" width="144">Revision</td>
|
||||
<td valign="top">@revision@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Authors</td>
|
||||
<td valign="top">Members of the Editorial Committee</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Date</td>
|
||||
<td valign="top">@date@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">This Version</td>
|
||||
<td valign="top">
|
||||
<a href="http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html">
|
||||
http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Previous Version</td>
|
||||
<td valign="top">n/a</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Latest Version</td>
|
||||
<td valign="top">n/a</td>
|
||||
</tr>
|
||||
</table>
|
||||
<h3><br>
|
||||
<i>Summary</i></h3>
|
||||
<blockquote>
|
||||
<p>This file provides a visual display of the named sequences derived from NamedSequences.txt. <i>The
|
||||
proposal is to add this, </i></p>
|
||||
</blockquote>
|
||||
<h3><i>Status</i></h3>
|
||||
<blockquote>
|
||||
<p><i>The file and the files described herein are part of the
|
||||
<a href="http://www.unicode.org/ucd">Unicode Character Database</a> (UCD) and are governed by
|
||||
the <a href="#Terms of Use">UCD Terms of Use</a> stated at the end.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<h2>Introduction</h2>
|
||||
<p>The tables here exhaustively list the valid, registered named sequences. The columns include a
|
||||
representative glyph, the sequence of code points in hex, and the name of the sequence. In
|
||||
addition, there is a last column entitled <i>Copyable</i>, which contains the literal text forming
|
||||
the sequence. That text can be copied and pasted elsewhere. The display of the text in this
|
||||
column is up to the capabilities of the browser and the set of available fonts. For more
|
||||
information, see <a href="http://www.unicode.org/help/display_problems.html">Display Problems?</a>.</p>
|
||||
<blockquote>
|
||||
<p><a name="fonts"><b>Note: </b></a>The representative glyphs used to show the named sequences
|
||||
are often derived from different physical fonts than the representative glyphs in the standard.
|
||||
They may therefore exhibit minor differences in size, proportion, style, or weight.</p>
|
||||
</blockquote>
|
||||
<p>@table@</p>
|
||||
<hr width="50%">
|
||||
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
|
||||
<h3><i>Disclaimer</i></h3>
|
||||
<blockquote>
|
||||
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to
|
||||
fitness for any particular purpose. No warranties of any kind are expressed or implied. The
|
||||
recipient agrees to determine applicability of information provided. If this file has been
|
||||
purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be
|
||||
exchange of defective media within 90 days of receipt.</i></p>
|
||||
<p><i>This disclaimer is applicable for all other data files accompanying the Unicode Character
|
||||
Database, some of which have been compiled by the Unicode Consortium, and some of which have
|
||||
been supplied by other sources.</i></p>
|
||||
</blockquote>
|
||||
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
|
||||
<blockquote>
|
||||
<p><i>Recipient is granted the right to make copies in any form for internal distribution and to
|
||||
freely use the information supplied in the creation of products supporting the Unicode<sup>TM</sup>
|
||||
Standard. The files in the Unicode Character Database can be redistributed to third parties or
|
||||
other organizations (whether for profit or not) as long as this notice and the disclaimer notice
|
||||
are retained. Information can be extracted from these files and used in documentation or
|
||||
programs, as long as there is an accompanying notice indicating the source.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<div align="center">
|
||||
<center>
|
||||
<table cellspacing="0" cellpadding="0" border="0">
|
||||
<tr>
|
||||
<td><a href="http://www.unicode.org/unicode/copyright.html">
|
||||
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
|
||||
</tr>
|
||||
</table>
|
||||
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
|
||||
</script>
|
||||
</center>
|
||||
</div>
|
||||
<blockquote>
|
||||
</blockquote>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2005/05/02 15:39:53 $
|
||||
* $Revision: 1.22 $
|
||||
* $Date: 2005/05/27 21:38:51 $
|
||||
* $Revision: 1.23 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -46,8 +46,6 @@ public class TestData implements UCD_Types {
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
//checkChars(false);
|
||||
new GenStringPrep().genStringPrep();
|
||||
if (true) return;
|
||||
|
||||
System.out.println("main: " + Default.getDate());
|
||||
upf = ICUPropertyFactory.make();
|
||||
@ -152,404 +150,6 @@ public class TestData implements UCD_Types {
|
||||
}
|
||||
Matcher m;
|
||||
|
||||
static class GenStringPrep {
|
||||
|
||||
UnicodeSet[] coreChars = new UnicodeSet[100];
|
||||
UnicodeSet decomposable = new UnicodeSet();
|
||||
UnicodeMap suspect = new UnicodeMap();
|
||||
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
|
||||
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
|
||||
UnicodeSet wordChars = new UnicodeSet();
|
||||
{
|
||||
if (false) {
|
||||
wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
|
||||
wordChars.retainAll(ups.getSet("gc=Sk"));
|
||||
}
|
||||
wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
|
||||
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
|
||||
" \\u055A \\u02B9 \\u02BA]"));
|
||||
//wordChars.removeAll(xid_continue);
|
||||
}
|
||||
|
||||
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
|
||||
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
|
||||
|
||||
UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
|
||||
|
||||
//UnicodeSet[] decompChars = new UnicodeSet[100];
|
||||
UCD ucd = Default.ucd();
|
||||
|
||||
Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
|
||||
{
|
||||
uca0.setStrength(Collator.IDENTICAL);
|
||||
}
|
||||
GenerateHanTransliterator.MultiComparator uca
|
||||
= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
|
||||
uca0, new UTF16.StringComparator()});
|
||||
|
||||
UnicodeSet bidiR = new UnicodeSet(
|
||||
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
|
||||
|
||||
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
|
||||
UnicodeSet hasNoUpper = new UnicodeSet();
|
||||
UnicodeSet hasNoUpperMinus = new UnicodeSet();
|
||||
BagFormatter bf = new BagFormatter();
|
||||
UnicodeSet inIDN = new UnicodeSet();
|
||||
|
||||
void genStringPrep() throws IOException {
|
||||
//showScriptToBlock();
|
||||
bf.setShowLiteral(BagFormatter.toHTMLControl);
|
||||
//bf.setValueSource(UnicodeLabel.NULL);
|
||||
if (false) {
|
||||
|
||||
System.out.println("word chars: " + bf.showSetNames(wordChars));
|
||||
System.out.println("pat: " + bf.showSetNames(patternProp));
|
||||
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
|
||||
}
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
||||
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
|
||||
int idnaType = getIDNAType(cp);
|
||||
idnaTypeSet[idnaType].add(cp);
|
||||
String str = UTF16.valueOf(cp);
|
||||
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
|
||||
int script = ucd.getScript(cp);
|
||||
if (coreChars[script] == null)
|
||||
coreChars[script] = new UnicodeSet();
|
||||
coreChars[script].add(cp);
|
||||
}
|
||||
// fix characters with no uppercase
|
||||
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
|
||||
System.out.println(bf.showSetNames(hasNoUpper));
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
|
||||
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
|
||||
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
|
||||
textOut.println('\uFEFF');
|
||||
textOut.println("For documentation, see idn-chars.html");
|
||||
|
||||
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
|
||||
new String[] {"%date%", Default.getDate()});
|
||||
/*
|
||||
out
|
||||
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
out.println("<title>IDN Characters</title><style>");
|
||||
out.println("<!--");
|
||||
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
|
||||
out.println(".Atomic { background-color: #CCCCFF }");
|
||||
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
|
||||
out.println(".Non-XID { background-color: #FFCCCC }");
|
||||
out.println(".Decomposable { background-color: #FFFFCC }");
|
||||
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
|
||||
|
||||
out.println("th { text-align: left }");
|
||||
out.println("-->");
|
||||
out.println("</style></head><body><table>");
|
||||
*/
|
||||
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
|
||||
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
|
||||
|
||||
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
|
||||
if (scriptCode == COMMON_SCRIPT
|
||||
|| scriptCode == INHERITED_SCRIPT)
|
||||
continue;
|
||||
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
|
||||
}
|
||||
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
|
||||
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
|
||||
htmlOut.println("</table></body></html>");
|
||||
htmlOut.close();
|
||||
htmlOut2.println("</table></body></html>");
|
||||
htmlOut2.close();
|
||||
bf.setMergeRanges(false);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
|
||||
textOut.println();
|
||||
bf.setValueSource("word-chars");
|
||||
bf.showSetNames(textOut, wordChars);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** FOR REVIEW ***");
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
|
||||
textOut.println();
|
||||
String value = (String)it.next();
|
||||
bf.setValueSource(value);
|
||||
bf.showSetNames(textOut, suspect.getSet(value));
|
||||
}
|
||||
textOut.close();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private void showScriptToBlock() {
|
||||
UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
|
||||
UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
|
||||
UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
|
||||
public Object compose(Object a, Object b) {
|
||||
return a + "\t" + b;
|
||||
}
|
||||
};
|
||||
UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
|
||||
for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
|
||||
System.out.println(it.next());
|
||||
}
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
|
||||
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
|
||||
|
||||
static String[][] script_to_gif = {
|
||||
|
||||
{"Common","common.gif"}, //Miscellaneous_Symbols
|
||||
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
|
||||
{"Arabic","arabic.gif"}, //Arabic
|
||||
{"Armenian","armenian.gif"}, //Armenian
|
||||
{"Bengali","bengali.gif"}, //Bengali
|
||||
{"Bopomofo","bopomofo.gif"}, //Bopomofo
|
||||
{"Braille","braillesymbols.gif"}, //Braille_Patterns
|
||||
{"Buginese","buginese.gif"}, //Buginese
|
||||
{"Buhid","buhid.gif"}, //Buhid
|
||||
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
|
||||
{"Cherokee","cherokee.gif"}, //Cherokee
|
||||
{"Coptic","coptic.gif"}, //Coptic
|
||||
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
|
||||
{"Cyrillic","cyrillic.gif"}, //Cyrillic
|
||||
{"Deseret","deseret.gif"}, //Deseret
|
||||
{"Devanagari","devanagari.gif"}, //Devanagari
|
||||
{"Ethiopic","ethiopic.gif"}, //Ethiopic
|
||||
{"Georgian","georgian.gif"}, //Georgian
|
||||
{"Glagolitic","glagolitic.gif"}, //Glagolitic
|
||||
{"Gothic","gothic.gif"}, //Gothic
|
||||
{"Greek","greek.gif"}, //Greek_and_Coptic
|
||||
{"Gujarati","gujarati.gif"}, //Gujarati
|
||||
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
|
||||
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
|
||||
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
|
||||
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
|
||||
{"Hanunoo","hanunoo.gif"}, //Hanunoo
|
||||
{"Hebrew","hebrew.gif"}, //Hebrew
|
||||
{"Hiragana","hiragana.gif"}, //Hiragana
|
||||
{"Kannada","kannada.gif"}, //Kannada
|
||||
{"Katakana","katakana.gif"}, //Katakana
|
||||
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
|
||||
{"Khmer","khmer.gif"}, //Khmer
|
||||
{"Lao","lao.gif"}, //Lao
|
||||
{"Latin","latin.gif"}, //Basic_Latin
|
||||
{"Limbu","limbu.gif"}, //Limbu
|
||||
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
|
||||
{"Malayalam","malayalam.gif"}, //Malayalam
|
||||
{"Mongolian","mongolian.gif"}, //Mongolian
|
||||
{"Myanmar","myanmar.gif"}, //Myanmar
|
||||
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
|
||||
{"Ogham","ogham.gif"}, //Ogham
|
||||
{"Old_Italic","olditalic.gif"}, //Old_Italic
|
||||
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
|
||||
{"Oriya","oriya.gif"}, //Oriya
|
||||
{"Osmanya","osmanya.gif"}, //Osmanya
|
||||
{"Runic","runic.gif"}, //Runic
|
||||
{"Shavian","shavian.gif"}, //Shavian
|
||||
{"Sinhala","sinhala.gif"}, //Sinhala
|
||||
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
|
||||
{"Syriac","syriac.gif"}, //Syriac
|
||||
{"Tagalog","tagalog.gif"}, //Tagalog
|
||||
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
|
||||
{"Tai_Le","taile.gif"}, //Tai_Le
|
||||
{"Tamil","tamil.gif"}, //Tamil
|
||||
{"Telugu","telugu.gif"}, //Telugu
|
||||
{"Thaana","thaana.gif"}, //Thaana
|
||||
{"Thai","thai.gif"}, //Thai
|
||||
{"Tibetan","tibetan.gif"}, //Tibetan
|
||||
{"Tifinagh","tifinagh.gif"}, //Tifinagh
|
||||
{"Ugaritic","ugaritic.gif"}, //Ugaritic
|
||||
{"Yi","yi.gif"}, //Yi_Syllables
|
||||
|
||||
};
|
||||
|
||||
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
|
||||
{
|
||||
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
|
||||
}
|
||||
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private int getIDNAType(int cp) {
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.DEFAULT); // USE_STD3_RULES
|
||||
if (intermediate.length() == 0)
|
||||
return DELETED;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
return ILLEGAL;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
return ILLEGAL;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
return REMAPPED;
|
||||
return OK;
|
||||
}
|
||||
StringBuffer inbuffer = new StringBuffer();
|
||||
StringBuffer intermediate, outbuffer;
|
||||
|
||||
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param scriptCode
|
||||
* @param htmlOut2 TODO
|
||||
* @param ucd
|
||||
* @param coreChars
|
||||
* @param decompChars
|
||||
*/
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
|
||||
if (coreChars[scriptCode] == null) return;
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
|
||||
script = Utility.getUnskeleton(script.toLowerCase(),true);
|
||||
System.out.println(script);
|
||||
|
||||
htmlOut.println();
|
||||
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
|
||||
+ "'> Script: " + script + "</th></tr>";
|
||||
htmlOut.println(scriptLine);
|
||||
htmlOut2.println(scriptLine);
|
||||
textOut.println();
|
||||
textOut.println("#*** Script: " + script + " ***");
|
||||
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
|
||||
|
||||
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
|
||||
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
|
||||
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
|
||||
|
||||
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
|
||||
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
|
||||
|
||||
UnicodeSet decomp = extract(decomposable, core);
|
||||
UnicodeSet pattern = extract(patternProp, core);
|
||||
UnicodeSet non_id = extract(not_xid_continue, core);
|
||||
|
||||
UnicodeSet bicameralNoupper = new UnicodeSet();
|
||||
if (!hasNoUpper.containsAll(core)) {
|
||||
bicameralNoupper = extract(hasNoUpperMinus, core);
|
||||
}
|
||||
|
||||
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
|
||||
String cat = Default.ucd().getCategoryID(it.codepoint);
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name.indexOf("MUSICAL SYMBOL") >= 0
|
||||
|| name.indexOf("DINGBA") >= 0
|
||||
|| name.indexOf("RADICAL ") >= 0
|
||||
) cat = "XX";
|
||||
suspect.put(it.codepoint, cat);
|
||||
}
|
||||
|
||||
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
|
||||
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
|
||||
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
|
||||
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);
|
||||
|
||||
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
|
||||
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
|
||||
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
|
||||
UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
|
||||
core.removeAll(decomp);
|
||||
return decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param script TODO
|
||||
* @param unicodeset
|
||||
* @param scriptCode
|
||||
* @param uca
|
||||
*/
|
||||
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
|
||||
String script, String title, UnicodeSet unicodeset, int scriptCode) {
|
||||
if (unicodeset == null)
|
||||
return;
|
||||
int size = unicodeset.size();
|
||||
String dir = unicodeset.containsSome(bidiR)
|
||||
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
|
||||
htmlOut.println("<tr><th class='" + title + "'><a href='#" +
|
||||
title + "'>" + title + "</a> ("
|
||||
+ nf.format(size) + ")</th></tr>");
|
||||
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
|
||||
// <a href="#Atomic">categorization</a>
|
||||
textOut.println();
|
||||
textOut.println("# " + title);
|
||||
bf.setValueSource(script + " ; " + title);
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator();
|
||||
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
|
||||
usi.reset(unicodeset);
|
||||
while (usi.nextRange()) {
|
||||
if (usi.codepoint == usi.codepointEnd) {
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint)));
|
||||
} else {
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint))
|
||||
+ ".. "
|
||||
+ formatCode(UTF16
|
||||
.valueOf(usi.codepointEnd)));
|
||||
}
|
||||
}
|
||||
bf.showSetNames(textOut, unicodeset);
|
||||
} else {
|
||||
Set reordered = new TreeSet(uca);
|
||||
usi.reset(unicodeset);
|
||||
while (usi.next()) {
|
||||
String x = usi.getString();
|
||||
boolean foo = reordered.add(x);
|
||||
if (!foo)
|
||||
throw new IllegalArgumentException("Collision with "
|
||||
+ Default.ucd().getCodeAndName(x));
|
||||
}
|
||||
for (Iterator it = reordered.iterator(); it.hasNext();) {
|
||||
Object key = it.next();
|
||||
htmlOut.print(formatCode((String)key));
|
||||
}
|
||||
bf.showSetNames(textOut, reordered);
|
||||
}
|
||||
htmlOut.println("</td></tr>");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string
|
||||
* @return
|
||||
*/
|
||||
private String formatCode(String string) {
|
||||
int cat = ucd.getCategory(UTF16.charAt(string,0));
|
||||
return "<span title='" + ucd.getCodeAndName(string) + "'>"
|
||||
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
|
||||
+ BagFormatter.toHTMLControl.transliterate(string)
|
||||
+ " </span>";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param inbuffer
|
||||
* @param outbuffer
|
||||
|
@ -240,7 +240,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
{"Control", "CN"},
|
||||
{"Extend", "EX"},
|
||||
{"Other", "XX"},
|
||||
}).swapFirst2ValueAliases());
|
||||
}, true).swapFirst2ValueAliases());
|
||||
|
||||
add(new UnicodeProperty.UnicodeMapProperty() {
|
||||
{
|
||||
@ -283,7 +283,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
{"Numeric", "NU"},
|
||||
{"ExtendNumLet", "EX"},
|
||||
{"Other", "XX"},
|
||||
}).swapFirst2ValueAliases());
|
||||
}, true).swapFirst2ValueAliases());
|
||||
|
||||
add(new UnicodeProperty.UnicodeMapProperty() {
|
||||
{
|
||||
@ -335,7 +335,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
{"STerm", "ST"},
|
||||
{"Close", "CL"},
|
||||
{"Other", "XX"},
|
||||
}).swapFirst2ValueAliases());
|
||||
}, false).swapFirst2ValueAliases());
|
||||
}
|
||||
|
||||
static String[] YES_NO_MAYBE = {"N", "M", "Y"};
|
||||
|
34
tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt
Normal file
34
tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt
Normal file
@ -0,0 +1,34 @@
|
||||
# Confusables.txt
|
||||
# Generated: %date%, MED
|
||||
# This is a draft list of visually confusable characters, for use in conjunction with the
|
||||
# recommendations in http://www.unicode.org/reports/tr36/
|
||||
#
|
||||
# To fold using this list, first perform NFKD (if not already performed),
|
||||
# then map each source character to the target character(s), then perform NFKD again.
|
||||
#
|
||||
# The format is the standard Unicode semicolon-delimited hex.
|
||||
# <source> ; <target> ; <internal_info> # <comment>
|
||||
#
|
||||
# The characters may be visually distinguishable in many fonts, or at larger sizes.
|
||||
# Some anomalies are also introduced by 'closure'. That is, there may be a sequence of
|
||||
# characters where each is visually confusable with the next, but the start and end are
|
||||
# visually distinguishable. But when the set is closed, these will all map together.
|
||||
#
|
||||
# This is unlike normalization data. There may be no connection between characters other
|
||||
# than visual confusability. This data should not be used except in assessing visual confusability.
|
||||
#
|
||||
# This list is not limited to Unicode Identifier characters (XID_Continue) although the primary
|
||||
# application will be to such characters. It is also not limited to lowercase characters,
|
||||
# although the recommendations are to lowercase for security.
|
||||
#
|
||||
# Note that some characters have unusual characteristics, and are not yet accounted for.
|
||||
# For example, U+302E (〮) HANGUL SINGLE DOT TONE MARK and U+302F (〯) HANGUL DOUBLE DOT TONE MARK
|
||||
# appear to the left of the previous character. So what looks like "a:b" can actually be "ab\u302F".
|
||||
#
|
||||
# WARNING: The data is not final; it is very draft at this point, put together from different
|
||||
# sources that need to be reviewed for accuracy and completeness of the mappings.
|
||||
# There are still clear errors in the data; do not use this in any implementations.
|
||||
# Ignore the internal_info field; it will be removed.
|
||||
#
|
||||
# Thanks especially to Eric van der Poel for collecting information about fonts using shared glyphs.
|
||||
# =================================
|
@ -86,6 +86,10 @@ Within each subcategory characters are sorted according to the default
|
||||
</tr>
|
||||
</table>
|
||||
</blockquote>
|
||||
<p>Characters that are normally invisible are represented in the chart by their Unicode number, such as "U+FE00".</p>
|
||||
<p>At the end of this document, there is an additional section that lists all <a href='#Visible_Combining_Marks_0'>visible non-spacing marks</a>.
|
||||
These are sorted first by combining character class (modified), then by script, then by code point.</p>
|
||||
<p>For comparison of Indic characters, see <a href='indic-trans.html'>indic-trans.html</a>.</p>
|
||||
<h3>Additional <a name="Word_Characters">Word Characters</a></h3>
|
||||
<p>This is a draft list of characters based on <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2005/03/30 17:19:32 $
|
||||
* $Revision: 1.48 $
|
||||
* $Date: 2005/05/27 21:39:03 $
|
||||
* $Revision: 1.49 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -336,6 +336,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
}
|
||||
|
||||
/**
 * Convenience overload: delegates to fromHex(String, boolean) with
 * acceptChars set to false.
 * NOTE(review): with acceptChars false the two-argument form appears to
 * reject non-hex characters with a ChainException - confirm against its
 * full definition.
 *
 * @param p the hex text to convert
 * @return the result of fromHex(p, false)
 */
public static String fromHex(String p) {
|
||||
return fromHex(p, false);
|
||||
}
|
||||
|
||||
public static String fromHex(String p, boolean acceptChars) {
|
||||
StringBuffer output = new StringBuffer();
|
||||
int value = 0;
|
||||
int count = 0;
|
||||
@ -357,13 +361,31 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
default:
|
||||
int type = Character.getType(ch);
|
||||
if (type != Character.SPACE_SEPARATOR) {
|
||||
if (acceptChars) {
|
||||
if (count >= 4 && count <= 6) {
|
||||
UTF32.append32(output, value);
|
||||
count = 0;
|
||||
value = 0;
|
||||
} else if (count != 0) {
|
||||
output.append(p.substring(i-count, i)); // TODO fix supplementary characters
|
||||
}
|
||||
UTF32.append32(output, ch);
|
||||
continue main;
|
||||
|
||||
}
|
||||
throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
|
||||
new Object[] {String.valueOf(ch), new Integer(i), p});
|
||||
}
|
||||
// fall through!!
|
||||
case ' ': case ',': case ';': // do SPACE here, just for speed
|
||||
if (count != 0) {
|
||||
UTF32.append32(output, value);
|
||||
if (count < 4 || count > 6) {
|
||||
if (acceptChars) output.append(p.substring(i-count, i));
|
||||
else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
|
||||
new Object[] {String.valueOf(ch), new Integer(i), p});
|
||||
} else {
|
||||
UTF32.append32(output, value);
|
||||
}
|
||||
}
|
||||
count = 0;
|
||||
value = 0;
|
||||
@ -378,7 +400,13 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
count++;
|
||||
}
|
||||
if (count != 0) {
|
||||
UTF32.append32(output, value);
|
||||
if (count < 4 || count > 6) {
|
||||
if (acceptChars) output.append(p.substring(p.length()-count, p.length()));
|
||||
else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
|
||||
new Object[] {"EOS", new Integer(p.length()), p});
|
||||
} else {
|
||||
UTF32.append32(output, value);
|
||||
}
|
||||
}
|
||||
return output.toString();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user