ICU-0 misc fixes

X-SVN-Rev: 17717
This commit is contained in:
Mark Davis 2005-05-27 21:43:46 +00:00
parent 0176a784d1
commit 65e8ccde28
9 changed files with 1348 additions and 409 deletions

View File

@ -0,0 +1,480 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
* $Date: 2005/05/27 21:40:51 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.ArrayComparator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
public class GenerateConfusables {
// Shared log writer; main() opens it before generation starts and closes it afterwards.
static PrintWriter log;
// RIGHTWARDS ARROW (U+2192), printed between a source and its target in generated lines.
static final String ARROW = "\u2192";
/**
 * Simple mutable record pairing a target string with a count.
 * The {@code source} field is declared but is not set by the constructor.
 */
static class Data2 {
    String source;
    String target;
    int count;

    /**
     * @param target the target string of the mapping
     * @param count  the count associated with the mapping
     */
    Data2(String target, int count) {
        this.count = count;
        this.target = target;
    }
}
// Property source created with an empty version string.
// NOTE(review): presumably "" selects the default/latest UCD version -- confirm in ToolUnicodePropertySource.
static ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
// Union of the general categories Cn, Co, Cc and Cf; DataSet.add ignores any pair whose
// source or target consists only of these characters.
static UnicodeSet skipSet = ups.getSet("gc=Cn").addAll(ups.getSet("gc=Co")).addAll(ups.getSet("gc=Cc")).addAll(ups.getSet("gc=Cf"));
/**
 * One confusable mapping: a source string, the target it maps to, and a free-form type label.
 * Instances sort by target, then source, then type.
 */
static class Data implements Comparable {
    String source;
    String target;
    String type;

    Data(String source, String target, String type) {
        this.source = source;
        this.target = target;
        this.type = type;
    }

    /**
     * Compares by target first, then by source, then by type.
     *
     * @param o the other {@code Data} instance
     * @return a negative, zero or positive value following the usual compareTo contract
     */
    public int compareTo(Object o) {
        Data other = (Data) o;
        int diff = target.compareTo(other.target);
        if (diff == 0) {
            diff = source.compareTo(other.source);
        }
        if (diff == 0) {
            diff = type.compareTo(other.type);
        }
        return diff;
    }
}
// Control characters (general category Cc); DataSet.add reports any entry that contains one.
static UnicodeSet controls = new UnicodeSet("[:Cc:]");
static class DataSet {
// All entries, kept in Data.compareTo order (target, then source, then type).
Set dataSet = new TreeSet();
// Index from the {source, target} pair to its entry, used to merge duplicate pairs.
Map dataMap = new TreeMap(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), new UTF16.StringComparator()}));
/**
 * Adds one source/target pair after filtering and canonicalizing it.
 * Pairs are dropped when either side consists only of skipped characters or when both sides
 * have the same NFKD form. A pair that differs only in its first code point is reduced to
 * those first code points and tagged "-base". The pair is then ordered according to
 * preferSecondAsSource, and tagged "-skip" when the target contains U+203D.
 *
 * @param source    one side of the confusable pair
 * @param target    the other side of the confusable pair
 * @param type      label for the pair; a leading "confusables-" and trailing ".txt" are stripped
 * @param errorLine the original input line, used only for diagnostics
 * @return this data set
 */
public DataSet add(String source, String target, String type, String errorLine) {
    final String typePrefix = "confusables-";
    final String typeSuffix = ".txt";

    if (skipSet.containsAll(source) || skipSet.containsAll(target)) {
        return this;
    }

    String normSource = Default.nfkd().normalize(source);
    String normTarget = Default.nfkd().normalize(target);
    if (normSource.equals(normTarget)) {
        // only a compatibility match, nothing to record
        return this;
    }

    if (type.startsWith(typePrefix)) {
        type = type.substring(typePrefix.length());
    }
    if (type.endsWith(typeSuffix)) {
        type = type.substring(0, type.length() - typeSuffix.length());
    }

    // base + combining sequence => base2 + same combining sequence: keep just the bases
    int sourceLead = UTF16.charAt(normSource, 0);
    String sourceTail = normSource.substring(UTF16.getCharCount(sourceLead));
    int targetLead = UTF16.charAt(normTarget, 0);
    String targetTail = normTarget.substring(UTF16.getCharCount(targetLead));
    boolean sharedTail = sourceTail.length() != 0 && sourceTail.equals(targetTail);
    if (sharedTail) {
        source = UTF16.valueOf(sourceLead);
        target = UTF16.valueOf(targetLead);
        type += "-base";
    }

    // put the pair into the preferred order
    boolean swap = preferSecondAsSource(source, target);
    String orderedSource = swap ? target : source;
    String orderedTarget = swap ? source : target;

    if (orderedTarget.indexOf('\u203D') >= 0) {
        type += "-skip";
    }
    return add(new Data(orderedSource, orderedTarget, type), errorLine);
}
/**
 * Stores an entry as-is, merging it into an existing entry for the same source/target pair.
 * When the pair is already present, the new type is appended to the existing type after a "/".
 * Entries containing control characters are reported on standard output but still stored.
 *
 * @param newData   the entry to store
 * @param errorLine the input line the entry came from, printed when a problem is reported
 * @return this data set
 */
private DataSet add(Data newData, String errorLine) {
    boolean hasControl = controls.containsSome(newData.source)
            || controls.containsSome(newData.target);
    if (hasControl) {
        System.out.println("Problem with " + errorLine);
        System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
    }

    String[] pairKey = {newData.source, newData.target};
    Data existing = (Data) dataMap.get(pairKey);
    if (existing != null) {
        existing.type = existing.type + "/" + newData.type;
        return this;
    }
    dataSet.add(newData);
    dataMap.put(pairKey, newData);
    return this;
}
// Input file layouts understood by addFile. Example arguments for addFile:
// Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
static final int NORMAL = 0, FOLDING = 1, OLD = 2;
/**
 * Reads a semicolon-separated data file and adds every mapping found in it.
 * Files whose name contains "Folding" are read as folding data: a pair is added only when the
 * target differs from the first code point of the source's NFKD form. All other files are read
 * as hex source ; hex target ; optional type. Lines with fewer than two fields are reported on
 * standard output and skipped.
 *
 * @param directory directory containing the file
 * @param filename  name of the file; also used as the default type of each entry
 * @return this data set
 * @throws IOException if the file cannot be opened or read
 */
public DataSet addFile(String directory, String filename) throws IOException {
    BufferedReader in = BagFormatter.openUTF8Reader(directory, filename);
    try {
        int kind = NORMAL;
        if (filename.indexOf("Folding") >= 0) kind = FOLDING;
        // the "-old" layout is deliberately disabled; such files are read as NORMAL
        else if (false && filename.indexOf("-old") >= 0) kind = OLD;
        while (true) {
            String line = Utility.readDataLine(in);
            if (line == null) break;
            if (line.length() == 0) continue;
            String[] pieces = Utility.split(line,';');
            if (pieces.length < 2) {
                System.out.println("Error on: " + line);
                continue;
            }
            String type = filename;
            if (kind == FOLDING) {
                String source = Utility.fromHex(pieces[0].trim(),true);
                String target = Utility.fromHex(pieces[1].trim(),true);
                String nsource = Default.nfkd().normalize(source);
                String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
                if (!first.equals(target)) {
                    add(source, target, type, line);
                }
            } else if (kind == OLD) {
                String target = pieces[0].trim();
                for (int i = 1; i < pieces.length; ++i) {
                    add(pieces[i].trim(), target, type, line);
                }
            } else {
                String source = Utility.fromHex(pieces[0].trim(),true);
                String target = Utility.fromHex(pieces[1].trim(),true);
                if (pieces.length > 2) type = pieces[2].trim();
                add(source, target, type, line);
            }
        }
    } finally {
        // release the reader even when a malformed line makes parsing throw
        in.close();
    }
    return this;
}
/**
 * Writes every entry of this data set to a UTF-8 file, one line per entry:
 * hex source, hex target, type, then a comment with the literal characters and their names.
 *
 * @param directory  output directory
 * @param filename   output file name
 * @param appendFile when true, the confusables header template is written first with
 *                   "%date%" replaced by the current date
 * @throws IOException if the output file or the header template cannot be opened
 */
public void write(String directory, String filename, boolean appendFile) throws IOException {
    PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
    try {
        if (appendFile) {
            String[] replacements = {"%date%", Default.getDate()};
            Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
                    Utility.UTF8_WINDOWS, out, replacements);
        }
        for (Iterator it = dataSet.iterator(); it.hasNext();) {
            Data item = (Data) it.next();
            out.println(
                    Utility.hex(item.source)
                    + " ;\t" + Utility.hex(item.target)
                    + " ;\t" + item.type
                    + "\t# "
                    + "( " + item.source + " " + ARROW + " " + item.target + ") "
                    + Default.ucd().getName(item.source) + " " + ARROW + " "
                    + Default.ucd().getName(item.target));
        }
    } finally {
        // always flush and release the file, even if formatting an entry fails
        out.close();
    }
}
/**
 * Merges every entry of another data set into this one. Entries are stored without further
 * filtering; an entry whose source/target pair already exists has its type appended.
 *
 * @param ds the data set whose entries are merged in
 */
public void add(DataSet ds) {
    Iterator entries = ds.dataSet.iterator();
    while (entries.hasNext()) {
        Data entry = (Data) entries.next();
        add(entry, "");
    }
}
/**
 * Produces a cleaned copy of this data set.
 * <ol>
 * <li>Entries whose type contains "skip" are dropped.</li>
 * <li>Entries whose source or target is not in NFKD form are logged and dropped.</li>
 * <li>Each remaining pair is put into the preferred source/target order. When a source
 *     already maps to a different target the conflict is logged and the two entries are
 *     re-chained so that each source has a single target.</li>
 * <li>Every code point of each target is replaced repeatedly through the source map, and the
 *     result is normalized to NFKD. Entries whose target changed in this step get "-rec"
 *     appended to their type.</li>
 * </ol>
 * Writes diagnostics to the shared {@code log}, which must be open.
 *
 * @return a new data set holding the cleaned entries
 */
public DataSet clean() {
    // remove all skips
    DataSet tempSet = new DataSet();
    // accepted source string -> its entry
    Map m = new HashMap();
    for (Iterator it = dataSet.iterator(); it.hasNext();) {
        Data d = (Data) it.next();
        if (d.type.indexOf("skip") >= 0) continue;
        String newTarget = Default.nfkd().normalize(d.target);
        String newSource = Default.nfkd().normalize(d.source);
        String type = d.type;
        if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
            // not in NFKD form: log it and leave it out
            type += "-nf";
            log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
            log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
            continue;
        }
        // swap order
        if (preferSecondAsSource(newSource, newTarget)) {
            String temp = newTarget;
            newTarget = newSource;
            newSource = temp;
        }
        Data already = (Data) m.get(newSource);
        if (already != null && !newTarget.equals(already.target)) {
            // the same source already maps elsewhere: log the conflict and re-chain
            log.println("X " + getCodeCharName(newSource) + " " + ARROW);
            log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
            log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
            if (preferSecondAsSource(already.target, newTarget)) {
                // just fix new guy
                type += "[" + newSource + "]" + already.type;
                newSource = newTarget;
                newTarget = already.target;
            } else {
                // need to fix new guy, AND fix old guy.
                // remove before mutating so the sorted set and key map stay consistent
                tempSet.remove(already);
                type += "[" + newSource + "]" + already.type;
                newSource = already.target;
                already.type += "[" + already.target + "]" + type;
                already.target = newTarget;
                tempSet.add(already, "");
            }
        }
        Data newData = new Data(newSource, newTarget, type);
        m.put(newSource, newData);
        tempSet.add(newData, "");
    }
    // now recursively apply
    DataSet s = new DataSet();
    for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
        Data d = (Data) it.next();
        int cp = 0;
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(d.target, i);
            String src = UTF16.valueOf(cp);
            // follow the chain of replacements until no further mapping exists
            while (true) {
                Data rep = (Data) m.get(src);
                if (rep == null) break;
                src = rep.target;
            }
            result.append(src);
        }
        String newTarget = Default.nfkd().normalize(result.toString());
        // tag entries whose target was rewritten by the recursive replacement
        // (the original compared newTarget with itself, so the tag was never applied)
        String type = newTarget.equals(d.target) ? d.type : d.type + "-rec";
        s.add(d.source, newTarget, type, "");
    }
    return s;
}
/**
 * Removes an entry from both the pair index and the sorted entry set.
 *
 * @param already the entry to remove; its current source and target form the index key
 */
private void remove(Data already) {
    dataMap.remove(new String[] {already.source, already.target});
    dataSet.remove(already);
}
}
/**
 * Entry point: generates the confusables files from the input directory into the output
 * directory, writing diagnostics to log.txt in the output directory.
 *
 * @param args ignored
 * @throws IOException if an input or output file cannot be opened
 */
public static void main(String[] args) throws IOException {
    String indir = Utility.BASE_DIR + "confusables/";
    String outdir = Utility.GEN_DIR + "confusables/";
    log = BagFormatter.openUTF8Writer(outdir, "log.txt");
    try {
        //fixMichel(indir, outdir);
        generateConfusables(indir, outdir);
    } finally {
        // close (and thereby flush) the log even when generation fails part-way
        log.close();
    }
    System.out.println("Done");
}
/**
 * Filters michel/tr36comments-annex.txt into new-tr36comments-annex.txt.
 * Lines with fewer than two tab-separated fields are copied unchanged; other lines are copied
 * only when the string given in hex by the first field is in NFKD form.
 *
 * @param indir  input directory (the file is read from its "michel/" subdirectory)
 * @param outdir output directory
 * @throws IOException if either file cannot be opened or the input cannot be read
 */
private static void fixMichel(String indir, String outdir) throws IOException {
    BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
    try {
        PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
        try {
            while (true) {
                String line = Utility.readDataLine(in);
                if (line == null) break;
                String[] pieces = Utility.split(line,'\t');
                if (pieces.length < 2) {
                    out.println(line);
                    continue;
                }
                String source = Utility.fromHex(pieces[0].trim());
                if (Default.nfkd().isNormalized(source)) {
                    out.println(line);
                }
            }
        } finally {
            out.close();
        }
    } finally {
        // both streams are released even when a line fails to parse
        in.close();
    }
}
/**
 * Processes every plain file in the input directory: each one is parsed into its own data set,
 * written back out as "new-" plus the file name, and merged into a combined set. The combined
 * set is written as confusables-raw.txt, and its cleaned form as confusables.txt with the
 * header template.
 *
 * @param indir  input directory, ending with a path separator
 * @param outdir output directory
 * @throws IOException if the input directory cannot be listed or a file cannot be read or written
 */
private static void generateConfusables(String indir, String outdir) throws IOException {
    File dir = new File(indir);
    String[] names = dir.list();
    if (names == null) {
        // File.list() returns null when the path is not a directory or cannot be read
        throw new IOException("Cannot list input directory: " + indir);
    }
    // File.list() gives no ordering guarantee; sort so that merged type labels
    // ("a/b") come out the same on every platform and run
    java.util.Arrays.sort(names);
    DataSet total = new DataSet();
    for (int i = 0; i < names.length; ++i) {
        if (new File(indir + names[i]).isDirectory()) continue;
        System.out.println(names[i]);
        DataSet ds = new DataSet();
        ds.addFile(indir, names[i]);
        ds.write(outdir, "new-" + names[i], false);
        total.add(ds);
    }
    total.write(outdir, "confusables-raw.txt", false);
    DataSet clean = total.clean();
    clean.write(outdir, "confusables.txt", true);
}
/*
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(),
new UTF16.StringComparator()}));
while (true) {
String line = Utility.readDataLine(in);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line,';');
if (pieces.length < 2) {
System.out.println("Error on: " + line);
continue;
}
String source = Utility.fromHex(pieces[0].trim());
String target = Utility.fromHex(pieces[1].trim());
String nsource = Default.nfkd().normalize(source);
String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
if (!first.equals(target)) {
set.add(new String[]{source, target});
}
}
in.close();
}
public static void gen() throws IOException {
Map m = new TreeMap();
BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
while (true) {
String line = in.readLine();
if (line == null) break;
String[] pieces = Utility.split(line,';');
if (pieces.length < 3) {
System.out.println("Error on: " + line);
continue;
}
int codepoint = Integer.parseInt(pieces[1], 16);
int cat = Default.ucd().getCategory(codepoint);
if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
String result = Utility.fromHex(pieces[0]);
if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
int count = Integer.parseInt(pieces[2]);
String source = UTF16.valueOf(codepoint);
add(m, source, result, count);
}
in.close();
in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
while (true) {
String line = in.readLine();
if (line == null) break;
line = line.trim();
int pos = line.indexOf("#");
if (pos >= 0) line = line.substring(0,pos).trim();
if (line.length() == 0) continue;
if (line.startsWith("@")) continue;
String[] pieces = Utility.split(line,';');
if (pieces.length < 2) {
System.out.println("Error on: " + line);
continue;
}
String source = pieces[0].trim();
for (int i = 1; i < pieces.length; ++i) {
add(m, source, pieces[i].trim(), -1);
}
}
in.close();
boolean gotOne;
// close the set
do {
gotOne = false;
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
String source = (String) it.next();
Data2 data = (Data2) m.get(source);
Data2 data2 = (Data2) m.get(data.target);
if (data2 == null) continue;
data.target = data2.target;
gotOne = true;
break;
}
} while (gotOne);
// put into different sorting order
Set s = new TreeSet();
for (Iterator it = m.keySet().iterator(); it.hasNext();) {
String source = (String) it.next();
Data2 data = (Data2) m.get(source);
s.add(new Data(source, data.target, data.count));
}
// write it out
PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
String[] replacements = {"%date%", Default.getDate()};
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
Utility.UTF8_WINDOWS, out, replacements);
for (Iterator it = s.iterator(); it.hasNext();) {
Data d = (Data) it.next();
if (d == null) continue;
out.println(formatLine(d.source, d.target, d.count));
}
out.close();
System.out.println("Done");
}
/**
*
*/
/**
 * Formats one mapping as a data line: hex source, hex target, count, then a comment holding
 * the literal strings and their character names.
 *
 * @param source the source string
 * @param target the target string
 * @param count  the count written in the third field
 * @return the formatted line, without a line terminator
 */
private static String formatLine(String source, String target, int count) {
    StringBuffer line = new StringBuffer();
    line.append(Utility.hex(source));
    line.append(" ; ").append(Utility.hex(target," "));
    line.append(" ; ").append(count);
    line.append(" # ");
    line.append("(").append(source).append(" ").append(ARROW).append(" ").append(target).append(") ");
    line.append(Default.ucd().getName(source));
    line.append(" ").append(ARROW).append(" ").append(Default.ucd().getName(target));
    return line.toString();
}
/**
 * Records a source-to-target mapping in the map, after putting the pair into the preferred
 * order. If the source already maps to a different target, the conflict is printed and the
 * new target is mapped to the existing target instead.
 *
 * @param m      map from source string to Data2
 * @param source one side of the pair; nothing is added when empty
 * @param target the other side of the pair; nothing is added when empty
 * @param count  count stored with a newly created mapping
 */
private static void add(Map m, String source, String target, int count) {
    if (source.length() == 0 || target.length() == 0) {
        return;
    }
    if (preferSecondAsSource(source, target)) {
        String swapped = source;
        source = target;
        target = swapped;
    }

    Data2 existing = (Data2) m.get(source);
    if (existing == null) {
        m.put(source, new Data2(target, count));
        return;
    }
    if (target.equals(existing.target)) {
        return;
    }
    System.out.println("conflict");
    System.out.println(formatLine(source, target, count));
    System.out.println(formatLine(source, existing.target, existing.count));
    // skip adding this, and instead add result -> existing target
    add(m, target, existing.target, count);
}
/**
 * Decides whether the second string of a pair should be used as the source.
 * The second is preferred when the first has more code points, or, when both have the same
 * number of code points, when the first sorts before the second.
 *
 * @param a first string
 * @param b second string
 * @return true if {@code b} should be the source
 */
static private boolean preferSecondAsSource(String a, String b) {
    int lengthA = UTF16.countCodePoint(a);
    int lengthB = UTF16.countCodePoint(b);
    return (lengthA == lengthB) ? a.compareTo(b) < 0 : lengthA > lengthB;
}
/**
 * Describes a string for log output as its code, the literal text in parentheses, and its name.
 *
 * @param a the string to describe
 * @return the description
 */
static String getCodeCharName(String a) {
    StringBuffer description = new StringBuffer();
    description.append(Default.ucd().getCode(a));
    description.append("( ").append(a).append(" ) ");
    description.append(Default.ucd().getName(a));
    return description.toString();
}
}

View File

@ -0,0 +1,125 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
* $Date: 2005/05/27 21:40:51 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import java.io.*;
public final class GenerateNamedSequences implements UCD_Types {
// When true, showVarGlyphs prints its arguments to standard output.
static final boolean DEBUG = false;
/**
 * Builds the HTML img element for a variation-sequence glyph served by the unicode.org
 * varglyph script.
 *
 * @param code0       hex code of the first code point
 * @param code1       hex code of the second code point
 * @param shape       shaping environment, or an empty string for none; at most its first
 *                    four characters are used in the image URL
 * @param description description of the appearance; when it contains "feminine" and a shape
 *                    is given, "fem" is added to the image URL
 * @return the img element as a string
 */
static public String showVarGlyphs(String code0, String code1, String shape, String description) {
    if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
    String abbShape = "";
    if (shape.length() != 0) {
        // abbreviate to four characters, tolerating shapes shorter than that
        abbShape = '-' + shape.substring(0, Math.min(4, shape.length()));
        if (description.indexOf("feminine") >= 0) abbShape += "fem";
    }
    return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
            + "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
}
/*
# Field 0: the variation sequence
# Field 1: the description of the desired appearance
# Field 2: where the appearance is only different in particular shaping environments
# this field lists them. The possible values are: isolated, initial, medial, final.
# If more than one is present, there are spaces between them.
*/
/**
 * Reads the NamedSequences data file for the current UCD version and writes an HTML page
 * listing each sequence with its image, hex code points, name and a copyable form.
 * Each data line is "name ; space-separated hex code points"; lines with fewer than two
 * fields are reported on standard output and skipped. The page is produced from
 * NamedSequences-Template.html with @revision@, @date@ and @table@ replaced.
 *
 * @throws IOException if the data file, the template or the output file cannot be opened or read
 */
static public void generate() throws IOException {
    // read the data and compose the table
    StringBuffer table = new StringBuffer(
            "<table><tr><th width='10%'>Rep Glyph</th><th>Hex Sequence</th><th>Name</th><th>Copyable</th></tr>");
    String[] splits = new String[4];
    String[] codes = new String[20];

    BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1);
    try {
        Transliterator unicodexml = Transliterator.getInstance("hex/xml");
        while (true) {
            String line = Utility.readDataLine(in);
            if (line == null) break;
            line = line.trim();
            if (line.length() == 0) continue;

            int count = Utility.split(line, ';', splits);
            if (count < 2) {
                // without this check splits[1] would be null or left over from the previous line
                System.out.println("Error on: " + line);
                continue;
            }
            String name = splits[0];
            int codeCount = Utility.split(splits[1], ' ', codes);
            StringBuffer codeBuffer = new StringBuffer();
            for (int i = 0; i < codeCount; ++i) {
                UTF16.append(codeBuffer, Integer.parseInt(codes[i],16));
            }
            String codeWithHyphens = splits[1].replaceAll("\\s", "-");
            String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+");
            String codeString = unicodexml.transliterate(codeBuffer.toString());

            // <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
            //table += "<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n";
            String imageName = "images/U" + codeWithHyphens + ".gif";
            // sequences starting in the range 1780..17FF use the images published with TR28
            if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) {
                String codeNoSpaces2 = splits[1].replaceAll("\\s", "");
                imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif";
            }
            // one row per sequence; the stray "</td>" the original emitted before the
            // name cell has been removed so the row is well-formed
            table.append("<tr>")
                    .append("<td class='copy'><img alt='(").append(codeAlt).append(")' src='")
                    .append(imageName).append("'><br><tt>")
                    .append(splits[1]).append("</tt></td>")
                    .append("<td>").append(splits[1]).append("</td>")
                    .append("<td>").append(name).append("</td>")
                    .append("<td class='copy'>").append(codeString).append("</td>")
                    .append("</tr>\n");
            System.out.println(splits[1] + "\t" + codeString);
        }
    } finally {
        in.close();
    }
    table.append("</table>");

    // now write out the results
    String directory = "DerivedData/";
    String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true);
    PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
    try {
        /*
        String[] batName = {""};
        String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
        String version = Default.ucd().getVersion();
        int lastDot = version.lastIndexOf('.');
        String updateDirectory = version.substring(0,lastDot) + "-Update";
        int updateV = version.charAt(version.length()-1) - '0';
        if (updateV != 0) updateDirectory += (char)('1' + updateV);
        if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
        */
        String[] replacementList = {
            "@revision@", Default.ucd().getVersion(),
            //"@updateDirectory@", updateDirectory,
            "@date@", Default.getDate(),
            "@table@", table.toString()};
        Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList);
    } finally {
        out.close();
    }
    //Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
}
}

View File

@ -0,0 +1,515 @@
/*
* Created on May 3, 2005
* Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
* For terms of use, see http://www.unicode.org/terms_of_use.html
*/
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.CollectionUtilities;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.text.UTF16.StringComparator;
import com.ibm.icu.util.ULocale;
import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator;
import com.ibm.text.UCD.TestData.RegexMatcher;
import com.ibm.text.utility.Utility;
class GenerateStringPrep implements UCD_Types {
/**
 * Entry point: runs the StringPrep report generation and prints "Done" when finished.
 *
 * @param args ignored
 * @throws IOException if a report file cannot be written
 */
public static void main (String[] args) throws IOException {
    //checkChars(false);
    GenerateStringPrep generator = new GenerateStringPrep();
    generator.genStringPrep();
    System.out.println("Done");
}
// Per-script sets of code points, indexed by the script code returned by ucd.getScript(cp);
// slots stay null until genStringPrep sees a code point of that script.
UnicodeSet[] coreChars = new UnicodeSet[100];
// Code points that are not NFD-normalized (filled in by genStringPrep).
UnicodeSet decomposable = new UnicodeSet();
// Code points flagged for review, mapped to a category label; written out under "FOR REVIEW".
UnicodeMap suspect = new UnicodeMap();
// Property sources for two versions: "" and Unicode 3.2.0.
// NOTE(review): presumably "" selects the default/latest UCD version -- confirm.
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
// Additional characters treated as word characters; filled by the instance initializer below.
UnicodeSet wordChars = new UnicodeSet();
{
// Disabled: would have seeded wordChars with characters named MODIFIER LETTER that are gc=Sk.
if (false) {
wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
wordChars.retainAll(ups.getSet("gc=Sk"));
}
// Fixed list of additional word characters.
wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
" \\u055A \\u02B9 \\u02BA]"));
//wordChars.removeAll(xid_continue);
}
// Pattern_Syntax characters, excluding the word characters above.
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
// Code points whose NFKC_Quickcheck value is not NO.
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
// Enclosing (Me) and nonspacing (Mn) marks, minus default-ignorable code points.
UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me"))
.addAll(ups.getSet("gc=Mn"))
.removeAll(ups.getSet("Default_Ignorable_Code_Point=true"));
// Everything that is neither XID_Continue nor one of the word characters.
UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
// English collator used as the primary ordering for report output.
static Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
// Configure the shared collator once, at class initialization. This was an instance
// initializer, so the strength was only applied when a GenerateStringPrep object was
// constructed, and static users of uca could see the default strength.
static {
    uca0.setStrength(Collator.IDENTICAL);
}
// Orders strings by the collator first, then by UTF16.StringComparator as a tie-breaker.
static GenerateHanTransliterator.MultiComparator uca
        = new GenerateHanTransliterator.MultiComparator(new Comparator[] {
            uca0, new UTF16.StringComparator()});
// Characters with Bidi_Class AL or R, and characters with Bidi_Class L.
UnicodeSet bidiR = new UnicodeSet(
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
// Code points equal to their own full uppercase mapping (filled in by genStringPrep);
// hasNoUpperMinus is the same set with wordChars removed.
UnicodeSet hasNoUpper = new UnicodeSet();
UnicodeSet hasNoUpperMinus = new UnicodeSet();
// Formatter used to print sets with names in the generated reports.
BagFormatter bf = new BagFormatter();
// NOTE(review): inIDN is never written or read in the visible part of this file.
UnicodeSet inIDN = new UnicodeSet();
// Code points equal to their own full case folding (filled in by genStringPrep).
UnicodeSet isCaseFolded = new UnicodeSet();
/**
 * Classifies every code point that is not unassigned, private-use or surrogate (by IDNA type,
 * case behavior, NFD status and script), then writes the reports idn-chars.html,
 * script-chars.html, idn-chars.txt and idn_vs_cfnfkcid.txt into GEN_DIR.
 *
 * @throws IOException declared for the file operations; presumably thrown when a report or
 *         the HTML header template cannot be opened (the helpers are not visible here)
 */
void genStringPrep() throws IOException {
//showScriptToBlock();
bf.setShowLiteral(BagFormatter.toHTMLControl);
bf.setUnicodePropertyFactory(ups);
//bf.setValueSource(UnicodeLabel.NULL);
// Disabled debugging dump of the derived sets.
if (false) {
System.out.println("word chars: " + bf.showSetNames(wordChars));
System.out.println("pat: " + bf.showSetNames(patternProp));
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
}
// Pass 1: walk the whole code point range and sort each code point into the working sets.
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
// skip unassigned, private-use and surrogate code points
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
// get IDNA
int idnaType = getIDNAType(cp);
idnaTypeSet[idnaType].add(cp);
String str = UTF16.valueOf(cp);
// unchanged by full uppercasing / full case folding
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp);
// scripts
int script = ucd.getScript(cp);
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
coreChars[script].add(cp);
}
// fix characters with no uppercase
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
System.out.println(bf.showSetNames(hasNoUpper));
Utility.fixDot();
// Pass 2: open the three report files and write their headers.
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
// U+FEFF written as the first character of the text report, followed by a line break
textOut.println('\uFEFF');
textOut.println("For documentation, see idn-chars.html");
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
new String[] {"%date%", Default.getDate()});
/*
out
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
out.println("<!--");
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
out.println(".Atomic { background-color: #CCCCFF }");
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
out.println(".Non-XID { background-color: #FFCCCC }");
out.println(".Decomposable { background-color: #FFFFCC }");
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
out.println("th { text-align: left }");
out.println("-->");
out.println("</style></head><body><table>");
*/
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
// One section per script; Common and Inherited are held back and emitted last.
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
continue;
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
}
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
showCodes(htmlOut, textOut, non_spacing);
htmlOut.println("</table></body></html>");
htmlOut.close();
htmlOut2.println("</table></body></html>");
htmlOut2.close();
// Remaining sections go to the text report only.
bf.setMergeRanges(false);
textOut.println();
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
textOut.println();
bf.setValueSource("word-chars");
bf.showSetNames(textOut, wordChars);
textOut.println();
textOut.println("# *** FOR REVIEW ***");
bf.setLabelSource(UnicodeLabel.NULL);
// one group per distinct label in suspect, in sorted label order
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
textOut.println();
String value = (String)it.next();
bf.setValueSource(value);
bf.showSetNames(textOut, suspect.getSet(value));
}
textOut.close();
// Final report: compare the IDNA "OK" set with the derived CF_NFKC_ID set.
textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt");
bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
textOut.println();
textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***");
// code points assigned in Unicode 3.2 (complement of gc=Cn in the 3.2.0 property source)
UnicodeSet U32 = ups32.getSet("gc=cn").complement();
UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32);
bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]);
textOut.close();
}
/**
 * Debugging aid: prints every distinct "script TAB block" combination, in sorted order, and
 * then always throws IllegalArgumentException to stop the run.
 *
 * @throws IllegalArgumentException always, after printing
 */
private void showScriptToBlock() {
    UnicodeMap scriptMap = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
    UnicodeMap blockMap = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
    UnicodeMap.Composer joiner = new UnicodeMap.Composer() {
        public Object compose(Object a, Object b) {
            return a + "\t" + b;
        }
    };
    UnicodeMap combined = ((UnicodeMap) scriptMap.clone()).composeWith(blockMap, joiner);
    Iterator values = combined.getAvailableValues(new TreeSet()).iterator();
    while (values.hasNext()) {
        System.out.println(values.next());
    }
    throw new IllegalArgumentException();
}
// Lookup from script name to the image file name used in the script heading row of the
// HTML reports (read by showCodes).
// NOTE(review): the table below lists "Han" twice; presumably the later row
// (kangxiradicals.gif) replaces the earlier one when the map is built -- confirm in asMap.
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
// Rows of {script name, image file name}; the trailing comment on each row is a block name.
static String[][] script_to_gif = {
{"Common","common.gif"}, //Miscellaneous_Symbols
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
{"Arabic","arabic.gif"}, //Arabic
{"Armenian","armenian.gif"}, //Armenian
{"Bengali","bengali.gif"}, //Bengali
{"Bopomofo","bopomofo.gif"}, //Bopomofo
{"Braille","braillesymbols.gif"}, //Braille_Patterns
{"Buginese","buginese.gif"}, //Buginese
{"Buhid","buhid.gif"}, //Buhid
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
{"Cherokee","cherokee.gif"}, //Cherokee
{"Coptic","coptic.gif"}, //Coptic
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
{"Cyrillic","cyrillic.gif"}, //Cyrillic
{"Deseret","deseret.gif"}, //Deseret
{"Devanagari","devanagari.gif"}, //Devanagari
{"Ethiopic","ethiopic.gif"}, //Ethiopic
{"Georgian","georgian.gif"}, //Georgian
{"Glagolitic","glagolitic.gif"}, //Glagolitic
{"Gothic","gothic.gif"}, //Gothic
{"Greek","greek.gif"}, //Greek_and_Coptic
{"Gujarati","gujarati.gif"}, //Gujarati
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
{"Hanunoo","hanunoo.gif"}, //Hanunoo
{"Hebrew","hebrew.gif"}, //Hebrew
{"Hiragana","hiragana.gif"}, //Hiragana
{"Kannada","kannada.gif"}, //Kannada
{"Katakana","katakana.gif"}, //Katakana
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
{"Khmer","khmer.gif"}, //Khmer
{"Lao","lao.gif"}, //Lao
{"Latin","latin.gif"}, //Basic_Latin
{"Limbu","limbu.gif"}, //Limbu
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
{"Malayalam","malayalam.gif"}, //Malayalam
{"Mongolian","mongolian.gif"}, //Mongolian
{"Myanmar","myanmar.gif"}, //Myanmar
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
{"Ogham","ogham.gif"}, //Ogham
{"Old_Italic","olditalic.gif"}, //Old_Italic
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
{"Oriya","oriya.gif"}, //Oriya
{"Osmanya","osmanya.gif"}, //Osmanya
{"Runic","runic.gif"}, //Runic
{"Shavian","shavian.gif"}, //Shavian
{"Sinhala","sinhala.gif"}, //Sinhala
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
{"Syriac","syriac.gif"}, //Syriac
{"Tagalog","tagalog.gif"}, //Tagalog
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
{"Tai_Le","taile.gif"}, //Tai_Le
{"Tamil","tamil.gif"}, //Tamil
{"Telugu","telugu.gif"}, //Telugu
{"Thaana","thaana.gif"}, //Thaana
{"Thai","thai.gif"}, //Thai
{"Tibetan","tibetan.gif"}, //Tibetan
{"Tifinagh","tifinagh.gif"}, //Tifinagh
{"Ugaritic","ugaritic.gif"}, //Ugaritic
{"Yi","yi.gif"}, //Yi_Syllables
};
// One set of code points per IDNA classification, indexed by the constants declared below.
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
{
// Instance initializer: give every classification an empty set to start with.
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
}
// Classifications returned by getIDNAType; IDNA_TYPE_LIMIT is the number of classifications.
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
/**
 * Classifies a single code point by running it through IDNA ToASCII and back through
 * ToUnicode.
 *
 * @param cp the code point to classify
 * @return DELETED if ToASCII yields an empty result, ILLEGAL if either conversion throws,
 *         REMAPPED if the round-tripped text differs from the input, otherwise OK
 */
private int getIDNAType(int cp) {
    // reset the shared input buffer so it holds just this code point
    inbuffer.setLength(0);
    UTF16.append(inbuffer, cp);

    try {
        intermediate = IDNA.convertToASCII(inbuffer,
                IDNA.DEFAULT); // USE_STD3_RULES
        if (intermediate.length() == 0) {
            return DELETED;
        }
        outbuffer = IDNA.convertToUnicode(intermediate,
                IDNA.USE_STD3_RULES);
    } catch (StringPrepParseException e) {
        return ILLEGAL;
    } catch (Exception e) {
        // unexpected failure: report the code point, then treat it as illegal
        System.out.println("Failure at: " + Utility.hex(cp));
        return ILLEGAL;
    }

    return TestData.equals(inbuffer, outbuffer) ? OK : REMAPPED;
}
// Scratch buffers reused by getIDNAType: the input text, the ToASCII result and the
// ToUnicode result.
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
// Characters with the Lowercase property.
// NOTE(review): no use of this set is visible in this part of the file.
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
/**
 * Writes the character listing for one script: a script header row followed by one
 * section per category (Atomic, Non-XID, IDN-Deleted, ...), each emitted via printlnSet.
 * Returns without output if no characters were collected for the script
 * (coreChars[scriptCode] is null).
 * <p>
 * The categories are carved out of a working copy of the script's characters by
 * successive extract(...) calls. Each call removes what it matched from the working
 * set, so the order of the calls decides which category a character lands in when it
 * would qualify for several.
 * <p>
 * Side effects: prints the script name to System.out and records characters for review
 * in the suspect map.
 * NOTE(review): the image lookup dereferences scriptToGif.get(script) directly, so a
 * script name with no entry in scriptToGif would fail with a NullPointerException.
 *
 * @param htmlOut HTML output; receives the script header row and all category rows
 * @param textOut plain-text output; receives the "#*** Script: ..." line and the category listings
 * @param scriptCode script index into coreChars, also converted to the script name
 * @param htmlOut2 second HTML output; receives only the script header row
 */
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
if (coreChars[scriptCode] == null) return;
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
script = Utility.getUnskeleton(script.toLowerCase(),true);
System.out.println(script);
htmlOut.println();
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
+ "'> Script: " + script + "</th></tr>";
htmlOut.println(scriptLine);
htmlOut2.println(scriptLine);
textOut.println();
textOut.println("#*** Script: " + script + " ***");
// Work on a copy so the per-script master set in coreChars is left untouched.
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
// Order matters from here on: every extract() shrinks its second argument.
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
// Split the remapped characters further: those in isNFKC are moved out of 'remapped',
// and of those, the ones in 'decomposable' are moved out again.
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
UnicodeSet decomp = extract(decomposable, core);
UnicodeSet pattern = extract(patternProp, core);
UnicodeSet non_id = extract(not_xid_continue, core);
UnicodeSet bicameralNoupper = new UnicodeSet();
// Only split off hasNoUpperMinus characters when at least one remaining character is
// outside hasNoUpper (presumably: the script has case distinctions - confirm).
if (!hasNoUpper.containsAll(core)) {
bicameralNoupper = extract(hasNoUpperMinus, core);
}
// Record the no-uppercase and non-XID characters in 'suspect', keyed by general
// category ID; characters whose name matches one of the substrings below get "XX" instead.
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
String cat = Default.ucd().getCategoryID(it.codepoint);
String name = Default.ucd().getName(it.codepoint);
if (name.indexOf("MUSICAL SYMBOL") >= 0
|| name.indexOf("DINGBA") >= 0
|| name.indexOf("RADICAL ") >= 0
) cat = "XX";
suspect.put(it.codepoint, cat);
}
// Emit the non-empty categories. Output order is independent of the extraction order above.
// 'core' now holds whatever no extract() claimed; all sections are sorted with uca.
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca);
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca);
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca);
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca);
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca);
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca);
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca);
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca);
}
/**
 * Writes the "visible combining marks" section: a header row for the Inherited script,
 * then one printlnSet section per position type read by getPositions(), restricted to
 * the characters in non_spacing. Types that end up with no characters are omitted.
 *
 * @param htmlOut HTML output for the header row and the sections
 * @param textOut plain-text output, passed through to printlnSet
 * @param uset not used by this method
 * @throws IOException if the positions file cannot be read
 */
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException {
    String rawScriptId = Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT);
    String script = Utility.getUnskeleton(rawScriptId.toLowerCase(), true);
    String gifName = ((String) scriptToGif.get(script)).toLowerCase();
    htmlOut.println("<tr><th class='script'><img src='images/" + gifName
            + "'> Script: " + script + "</th></tr>");

    UnicodeMap positions = getPositions();
    // Walk the position types in uca order.
    Iterator types = positions.getAvailableValues(new TreeSet(uca)).iterator();
    while (types.hasNext()) {
        String type = (String) types.next();
        UnicodeSet marks = positions.getSet(type).retainAll(non_spacing);
        if (marks.size() != 0) {
            printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type,
                    marks, INHERITED_SCRIPT, positionComparator);
        }
    }
}
/**
 * Reads positions.txt and builds a map from code point to position type.
 * <p>
 * File format as parsed here: a line starting with "@" sets the type for all following
 * entries (the text after the "@"); every other non-empty line is split on ';' and its
 * first field is parsed as hex into the character to map. Entries that appear before
 * any "@" line get the type "Undetermined". Only the first code point of each parsed
 * field is mapped.
 * <p>
 * The reader is always closed before returning, including when parsing fails.
 *
 * @return the code point to position-type map
 * @throws IOException if the file cannot be opened, read or closed
 */
private UnicodeMap getPositions() throws IOException {
    UnicodeMap result = new UnicodeMap();
    BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt");
    try {
        String type = "Undetermined";
        while (true) {
            String line = Utility.readDataLine(in);
            if (line == null) break;            // end of file
            if (line.length() == 0) continue;   // nothing to parse on this line
            if (line.startsWith("@")) {
                // Section header: switches the type applied to the entries that follow.
                type = line.substring(1);
                continue;
            }
            String[] pieces = Utility.split(line, ';');
            String code = Utility.fromHex(pieces[0]);
            result.put(UTF16.charAt(code, 0), type);
        }
    } finally {
        // Release the file handle on every exit path; the original never closed it.
        in.close();
    }
    return result;
}
/**
 * Orders strings by the name that Default.ucd().getName(String) reports for them.
 * Both arguments must be Strings.
 */
static Comparator positionComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
        String left = (String) o1;
        String right = (String) o2;
        String leftName = Default.ucd().getName(left);
        String rightName = Default.ucd().getName(right);
        return leftName.compareTo(rightName);
    }
};
/**
 * Moves the characters that core shares with other out of core.
 *
 * @param other the set to match against; not modified
 * @param core the working set; the matched characters are removed from it
 * @return a new set holding the characters that were in both core and other
 */
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
    UnicodeSet shared = new UnicodeSet(core);
    shared.retainAll(other);
    core.removeAll(shared);
    return shared;
}
/**
 * Writes one category section to both outputs: an HTML heading row with the title and
 * character count, an HTML cell with the formatted characters, and a "# title" block in
 * the text output listed through bf.showSetNames.
 * <p>
 * For the Han and Hangul scripts the HTML cell shows code point ranges in set order;
 * for all other scripts every character is shown individually, sorted by comparator.
 * The cell is marked dir='rtl' when the set contains some bidiR characters and no
 * bidiL characters.
 *
 * @param htmlOut HTML output
 * @param textOut plain-text output
 * @param script script name, used in the value label of the text listing
 * @param title category name; used as heading text, CSS class and link anchor
 * @param unicodeset the characters to show; null means nothing is written
 * @param scriptCode script index, selects range display for Han and Hangul
 * @param comparator sort order for the non-range display
 * @throws IllegalArgumentException if comparator treats two characters of the set as equal
 */
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
        String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) {
    if (unicodeset == null) {
        return;
    }
    int size = unicodeset.size();
    String dir = "";
    if (unicodeset.containsSome(bidiR) && unicodeset.containsNone(bidiL)) {
        dir = " dir='rtl'";
    }
    String heading = "<tr><th class='" + title + "'><a href='#" + title + "'>" + title
            + "</a> (" + TestData.nf.format(size) + ")</th></tr>";
    htmlOut.println(heading);
    htmlOut.print("<tr><td class='" + title + "'" + dir + ">");

    textOut.println();
    textOut.println("# " + title);
    bf.setValueSource(script + " ; " + title);

    boolean showAsRanges = scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT;
    UnicodeSetIterator usi = new UnicodeSetIterator();
    usi.reset(unicodeset);
    if (showAsRanges) {
        // Large scripts: one entry per contiguous range, "first.. last" when it spans more than one code point.
        while (usi.nextRange()) {
            String cell = formatCode(UTF16.valueOf(usi.codepoint));
            if (usi.codepoint != usi.codepointEnd) {
                cell = cell + ".. " + formatCode(UTF16.valueOf(usi.codepointEnd));
            }
            htmlOut.print(cell);
        }
        bf.showSetNames(textOut, unicodeset);
    } else {
        // Everything else: sort the individual characters; a comparator tie would silently drop one, so fail loudly.
        Set reordered = new TreeSet(comparator);
        while (usi.next()) {
            String element = usi.getString();
            if (!reordered.add(element)) {
                throw new IllegalArgumentException("Collision with "
                        + Default.ucd().getCodeAndName(element));
            }
        }
        Iterator sorted = reordered.iterator();
        while (sorted.hasNext()) {
            htmlOut.print(formatCode((String) sorted.next()));
        }
        bf.showSetNames(textOut, reordered);
    }
    htmlOut.println("</td></tr>");
}
/**
 * Wraps a character in an HTML span whose tooltip is its code and name.
 * The text is padded with no-break spaces on both sides; when the first code point has
 * general category Me or Mn the padding is doubled and a dotted circle (U+25CC) is
 * placed directly before the text.
 *
 * @param string the character (or string) to format; its first code point decides the padding
 * @return the HTML fragment, ending with a trailing space after the closing tag
 */
private String formatCode(String string) {
    int cat = ucd.getCategory(UTF16.charAt(string, 0));
    boolean needsBase = cat == Me || cat == Mn;
    String leading = needsBase ? "\u00A0\u00A0\u25cc" : "\u00A0";
    String trailing = needsBase ? "\u00A0\u00A0" : "\u00A0";

    StringBuffer html = new StringBuffer();
    html.append("<span title='").append(ucd.getCodeAndName(string)).append("'>");
    html.append(leading);
    html.append(BagFormatter.toHTMLControl.transliterate(string));
    html.append(trailing);
    html.append("</span> ");
    return html.toString();
}
}

View File

@ -0,0 +1,153 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta name="keywords" content="unicode, named sequences">
<meta name="description" content="Describes and displays named sequences">
<title>Named Sequences</title>
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
<style>
<!--
.copy { text-align: center; font-size: 150% }
th, td { vertical-align: middle }
tt { font-size: 8pt }
table { padding: 2pt }
-->
</style>
</head>
<body bgcolor="#ffffff">
<table class="header">
<tr>
<td class="icon"><a href="http://www.unicode.org">
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode
Character Database</a></td>
</tr>
<tr>
<td class="gray">&nbsp;</td>
</tr>
</table>
<div style="margin:1em">
<table border="1" cellpadding="0" cellspacing="1" style="border-collapse: collapse" bordercolor="#111111" width="100%" id="AutoNumber1">
<tr>
<td width="100%">
<p style="text-align: right">L2-XXX</p>
<p><i>To: UTC<br>
From: Mark Davis<br>
Date: 2005-04-28</i></p>
<p><i>One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html,
following the pattern of StandardizedVariants.html. This document was generated along those
lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable
style modifications, of course) as a chart someplace accessible under
<a href="http://unicode.org/charts/">http://unicode.org/charts/</a>.</i></p>
<p><i>Alternatively, we could also combine this with the StandardizedVariants.html to provide
a unified chart of sequences, again someplace under <a href="http://unicode.org/charts/">
http://unicode.org/charts/</a>.</i></p>
<p><i><b>Note:</b> we don&#39;t have some of the glyphs quite right yet, but it should be
sufficient for discussing the format. One of the innovations is having a separate column of
text for copy&amp;paste; that needs discussion also.</i></td>
</tr>
</table>
<h1><i><font color="#990000">&nbsp;PROPOSED WORKING DRAFT<br>
</font></i>Named Sequences</h1>
<table class="wide">
<tr>
<td valign="top" width="144">Revision</td>
<td valign="top">@revision@</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
<td valign="top">Members of the Editorial Committee</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
<td valign="top">@date@</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top">
<a href="http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html">
http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
<td valign="top">n/a</td>
</tr>
<tr>
<td valign="top" width="144">Latest Version</td>
<td valign="top">n/a</td>
</tr>
</table>
<h3><br>
<i>Summary</i></h3>
<blockquote>
<p>This file provides a visual display of the named sequences derived from NamedSequences.txt. <i>The
proposal is to add this, </i></p>
</blockquote>
<h3><i>Status</i></h3>
<blockquote>
<p><i>The file and the files described herein are part of the
<a href="http://www.unicode.org/ucd">Unicode Character Database</a> (UCD) and are governed by
the <a href="#Terms of Use">UCD Terms of Use</a> stated at the end.</i></p>
</blockquote>
<hr width="50%">
<h2>Introduction</h2>
<p>The tables here exhaustively list the valid, registered named sequences. The columns include a
representative glyph, the sequence of code points in hex, and the name of the sequence. In
addition, there is a last column entitled <i>Copyable</i>, which contains the literal text forming
the sequence. That text can be copied and pasted elsewhere. The display of the text in this
column is up to the capabilities of the browser and the set of available fonts. For more
information, see <a href="http://www.unicode.org/help/display_problems.html">Display Problems?</a>.</p>
<blockquote>
<p><a name="fonts"><b>Note: </b></a>The representative glyphs used to show the named sequences
are often derived from different physical fonts than the representative glyphs in the standard.
They may therefore exhibit minor differences in size, proportion, style, or weight.</p>
</blockquote>
<p>@table@</p>
<hr width="50%">
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
<h3><i>Disclaimer</i></h3>
<blockquote>
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to
fitness for any particular purpose. No warranties of any kind are expressed or implied. The
recipient agrees to determine applicability of information provided. If this file has been
purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be
exchange of defective media within 90 days of receipt.</i></p>
<p><i>This disclaimer is applicable for all other data files accompanying the Unicode Character
Database, some of which have been compiled by the Unicode Consortium, and some of which have
been supplied by other sources.</i></p>
</blockquote>
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
<blockquote>
<p><i>Recipient is granted the right to make copies in any form for internal distribution and to
freely use the information supplied in the creation of products supporting the Unicode<sup>TM</sup>
Standard. The files in the Unicode Character Database can be redistributed to third parties or
other organizations (whether for profit or not) as long as this notice and the disclaimer notice
are retained. Information can be extracted from these files and used in documentation or
programs, as long as there is an accompanying notice indicating the source.</i></p>
</blockquote>
<hr width="50%">
<div align="center">
<center>
<table cellspacing="0" cellpadding="0" border="0">
<tr>
<td><a href="http://www.unicode.org/unicode/copyright.html">
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
</tr>
</table>
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
</script>
</center>
</div>
<blockquote>
</blockquote>
</div>
</body>
</html>

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2005/05/02 15:39:53 $
* $Revision: 1.22 $
* $Date: 2005/05/27 21:38:51 $
* $Revision: 1.23 $
*
*******************************************************************************
*/
@ -46,8 +46,6 @@ public class TestData implements UCD_Types {
public static void main (String[] args) throws IOException {
//checkChars(false);
new GenStringPrep().genStringPrep();
if (true) return;
System.out.println("main: " + Default.getDate());
upf = ICUPropertyFactory.make();
@ -152,404 +150,6 @@ public class TestData implements UCD_Types {
}
Matcher m;
static class GenStringPrep {
UnicodeSet[] coreChars = new UnicodeSet[100];
UnicodeSet decomposable = new UnicodeSet();
UnicodeMap suspect = new UnicodeMap();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
UnicodeSet wordChars = new UnicodeSet();
{
if (false) {
wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
wordChars.retainAll(ups.getSet("gc=Sk"));
}
wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
" \\u055A \\u02B9 \\u02BA]"));
//wordChars.removeAll(xid_continue);
}
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
{
uca0.setStrength(Collator.IDENTICAL);
}
GenerateHanTransliterator.MultiComparator uca
= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
uca0, new UTF16.StringComparator()});
UnicodeSet bidiR = new UnicodeSet(
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
UnicodeSet hasNoUpper = new UnicodeSet();
UnicodeSet hasNoUpperMinus = new UnicodeSet();
BagFormatter bf = new BagFormatter();
UnicodeSet inIDN = new UnicodeSet();
void genStringPrep() throws IOException {
//showScriptToBlock();
bf.setShowLiteral(BagFormatter.toHTMLControl);
//bf.setValueSource(UnicodeLabel.NULL);
if (false) {
System.out.println("word chars: " + bf.showSetNames(wordChars));
System.out.println("pat: " + bf.showSetNames(patternProp));
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
}
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
int idnaType = getIDNAType(cp);
idnaTypeSet[idnaType].add(cp);
String str = UTF16.valueOf(cp);
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
int script = ucd.getScript(cp);
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
coreChars[script].add(cp);
}
// fix characters with no uppercase
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
System.out.println(bf.showSetNames(hasNoUpper));
Utility.fixDot();
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
textOut.println('\uFEFF');
textOut.println("For documentation, see idn-chars.html");
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
new String[] {"%date%", Default.getDate()});
/*
out
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
out.println("<!--");
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
out.println(".Atomic { background-color: #CCCCFF }");
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
out.println(".Non-XID { background-color: #FFCCCC }");
out.println(".Decomposable { background-color: #FFFFCC }");
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
out.println("th { text-align: left }");
out.println("-->");
out.println("</style></head><body><table>");
*/
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
continue;
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
}
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
htmlOut.println("</table></body></html>");
htmlOut.close();
htmlOut2.println("</table></body></html>");
htmlOut2.close();
bf.setMergeRanges(false);
textOut.println();
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
textOut.println();
bf.setValueSource("word-chars");
bf.showSetNames(textOut, wordChars);
textOut.println();
textOut.println("# *** FOR REVIEW ***");
bf.setLabelSource(UnicodeLabel.NULL);
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
textOut.println();
String value = (String)it.next();
bf.setValueSource(value);
bf.showSetNames(textOut, suspect.getSet(value));
}
textOut.close();
}
/**
*
*/
private void showScriptToBlock() {
UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
public Object compose(Object a, Object b) {
return a + "\t" + b;
}
};
UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
System.out.println(it.next());
}
throw new IllegalArgumentException();
}
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
static String[][] script_to_gif = {
{"Common","common.gif"}, //Miscellaneous_Symbols
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
{"Arabic","arabic.gif"}, //Arabic
{"Armenian","armenian.gif"}, //Armenian
{"Bengali","bengali.gif"}, //Bengali
{"Bopomofo","bopomofo.gif"}, //Bopomofo
{"Braille","braillesymbols.gif"}, //Braille_Patterns
{"Buginese","buginese.gif"}, //Buginese
{"Buhid","buhid.gif"}, //Buhid
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
{"Cherokee","cherokee.gif"}, //Cherokee
{"Coptic","coptic.gif"}, //Coptic
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
{"Cyrillic","cyrillic.gif"}, //Cyrillic
{"Deseret","deseret.gif"}, //Deseret
{"Devanagari","devanagari.gif"}, //Devanagari
{"Ethiopic","ethiopic.gif"}, //Ethiopic
{"Georgian","georgian.gif"}, //Georgian
{"Glagolitic","glagolitic.gif"}, //Glagolitic
{"Gothic","gothic.gif"}, //Gothic
{"Greek","greek.gif"}, //Greek_and_Coptic
{"Gujarati","gujarati.gif"}, //Gujarati
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
{"Hanunoo","hanunoo.gif"}, //Hanunoo
{"Hebrew","hebrew.gif"}, //Hebrew
{"Hiragana","hiragana.gif"}, //Hiragana
{"Kannada","kannada.gif"}, //Kannada
{"Katakana","katakana.gif"}, //Katakana
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
{"Khmer","khmer.gif"}, //Khmer
{"Lao","lao.gif"}, //Lao
{"Latin","latin.gif"}, //Basic_Latin
{"Limbu","limbu.gif"}, //Limbu
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
{"Malayalam","malayalam.gif"}, //Malayalam
{"Mongolian","mongolian.gif"}, //Mongolian
{"Myanmar","myanmar.gif"}, //Myanmar
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
{"Ogham","ogham.gif"}, //Ogham
{"Old_Italic","olditalic.gif"}, //Old_Italic
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
{"Oriya","oriya.gif"}, //Oriya
{"Osmanya","osmanya.gif"}, //Osmanya
{"Runic","runic.gif"}, //Runic
{"Shavian","shavian.gif"}, //Shavian
{"Sinhala","sinhala.gif"}, //Sinhala
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
{"Syriac","syriac.gif"}, //Syriac
{"Tagalog","tagalog.gif"}, //Tagalog
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
{"Tai_Le","taile.gif"}, //Tai_Le
{"Tamil","tamil.gif"}, //Tamil
{"Telugu","telugu.gif"}, //Telugu
{"Thaana","thaana.gif"}, //Thaana
{"Thai","thai.gif"}, //Thai
{"Tibetan","tibetan.gif"}, //Tibetan
{"Tifinagh","tifinagh.gif"}, //Tifinagh
{"Ugaritic","ugaritic.gif"}, //Ugaritic
{"Yi","yi.gif"}, //Yi_Syllables
};
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
{
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
}
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
/**
*
*/
private int getIDNAType(int cp) {
inbuffer.setLength(0);
UTF16.append(inbuffer, cp);
try {
intermediate = IDNA.convertToASCII(inbuffer,
IDNA.DEFAULT); // USE_STD3_RULES
if (intermediate.length() == 0)
return DELETED;
outbuffer = IDNA.convertToUnicode(intermediate,
IDNA.USE_STD3_RULES);
} catch (StringPrepParseException e) {
return ILLEGAL;
} catch (Exception e) {
System.out.println("Failure at: " + Utility.hex(cp));
return ILLEGAL;
}
if (!TestData.equals(inbuffer, outbuffer))
return REMAPPED;
return OK;
}
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
/**
* @param htmlOut
* @param textOut TODO
* @param scriptCode
* @param htmlOut2 TODO
* @param ucd
* @param coreChars
* @param decompChars
*/
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
if (coreChars[scriptCode] == null) return;
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
script = Utility.getUnskeleton(script.toLowerCase(),true);
System.out.println(script);
htmlOut.println();
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
+ "'> Script: " + script + "</th></tr>";
htmlOut.println(scriptLine);
htmlOut2.println(scriptLine);
textOut.println();
textOut.println("#*** Script: " + script + " ***");
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
UnicodeSet decomp = extract(decomposable, core);
UnicodeSet pattern = extract(patternProp, core);
UnicodeSet non_id = extract(not_xid_continue, core);
UnicodeSet bicameralNoupper = new UnicodeSet();
if (!hasNoUpper.containsAll(core)) {
bicameralNoupper = extract(hasNoUpperMinus, core);
}
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
String cat = Default.ucd().getCategoryID(it.codepoint);
String name = Default.ucd().getName(it.codepoint);
if (name.indexOf("MUSICAL SYMBOL") >= 0
|| name.indexOf("DINGBA") >= 0
|| name.indexOf("RADICAL ") >= 0
) cat = "XX";
suspect.put(it.codepoint, cat);
}
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
}
/**
*
*/
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
core.removeAll(decomp);
return decomp;
}
/**
* @param htmlOut
* @param textOut TODO
* @param script TODO
* @param unicodeset
* @param scriptCode
* @param uca
*/
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
String script, String title, UnicodeSet unicodeset, int scriptCode) {
if (unicodeset == null)
return;
int size = unicodeset.size();
String dir = unicodeset.containsSome(bidiR)
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
htmlOut.println("<tr><th class='" + title + "'><a href='#" +
title + "'>" + title + "</a> ("
+ nf.format(size) + ")</th></tr>");
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
// <a href="#Atomic">categorization</a>
textOut.println();
textOut.println("# " + title);
bf.setValueSource(script + " ; " + title);
UnicodeSetIterator usi = new UnicodeSetIterator();
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
usi.reset(unicodeset);
while (usi.nextRange()) {
if (usi.codepoint == usi.codepointEnd) {
htmlOut.print(formatCode(UTF16
.valueOf(usi.codepoint)));
} else {
htmlOut.print(formatCode(UTF16
.valueOf(usi.codepoint))
+ ".. "
+ formatCode(UTF16
.valueOf(usi.codepointEnd)));
}
}
bf.showSetNames(textOut, unicodeset);
} else {
Set reordered = new TreeSet(uca);
usi.reset(unicodeset);
while (usi.next()) {
String x = usi.getString();
boolean foo = reordered.add(x);
if (!foo)
throw new IllegalArgumentException("Collision with "
+ Default.ucd().getCodeAndName(x));
}
for (Iterator it = reordered.iterator(); it.hasNext();) {
Object key = it.next();
htmlOut.print(formatCode((String)key));
}
bf.showSetNames(textOut, reordered);
}
htmlOut.println("</td></tr>");
}
/**
* @param string
* @return
*/
private String formatCode(String string) {
int cat = ucd.getCategory(UTF16.charAt(string,0));
return "<span title='" + ucd.getCodeAndName(string) + "'>"
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
+ BagFormatter.toHTMLControl.transliterate(string)
+ " </span>";
}
}
/**
* @param inbuffer
* @param outbuffer

View File

@ -240,7 +240,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
{"Control", "CN"},
{"Extend", "EX"},
{"Other", "XX"},
}).swapFirst2ValueAliases());
}, true).swapFirst2ValueAliases());
add(new UnicodeProperty.UnicodeMapProperty() {
{
@ -283,7 +283,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
{"Numeric", "NU"},
{"ExtendNumLet", "EX"},
{"Other", "XX"},
}).swapFirst2ValueAliases());
}, true).swapFirst2ValueAliases());
add(new UnicodeProperty.UnicodeMapProperty() {
{
@ -335,7 +335,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
{"STerm", "ST"},
{"Close", "CL"},
{"Other", "XX"},
}).swapFirst2ValueAliases());
}, false).swapFirst2ValueAliases());
}
static String[] YES_NO_MAYBE = {"N", "M", "Y"};

View File

@ -0,0 +1,34 @@
# Confusables.txt
# Generated: %date%, MED
# This is a draft list of visually confusable characters, for use in conjunction with the
# recommendations in http://www.unicode.org/reports/tr36/
#
# To fold using this list, first perform NFKD (if not already performed),
# then map each source character to the target character(s), then perform NFKD again.
#
# The format is the standard Unicode semicolon-delimited hex.
# <source> ; <target> ; <internal_info> # <comment>
#
# The characters may be visually distinguishable in many fonts, or at larger sizes.
# Some anomalies are also introduced by 'closure'. That is, there may be a sequence of
# characters where each is visually confusable with the next, but the start and end are
# visually distinguishable. But when the set is closed, these will all map together.
#
# This is unlike normalization data. There may be no connection between characters other
# than visual confusability. This data should not be used except in assessing visual confusability.
#
# This list is not limited to Unicode Identifier characters (XID_Continue) although the primary
# application will be to such characters. It is also not limited to lowercase characters,
# although the recommendations are to lowercase for security.
#
# Note that some characters have unusual characteristics, and are not yet accounted for.
# For example, U+302E (?) HANGUL SINGLE DOT TONE MARK and U+302F (?) HANGUL DOUBLE DOT TONE MARK
# appear to the left of the previous character. So what looks like "a:b" can actually be "ab\u302F"
#
# WARNING: The data is not final; it is very draft at this point, put together from different
# sources that need to be reviewed for accuracy and completeness of the mappings.
# There are still clear errors in the data; do not use this in any implementations.
# Ignore the internal_info field; it will be removed.
#
# Thanks especially to Eric van der Poel for collecting information about fonts using shared glyphs.
# =================================

View File

@ -86,6 +86,10 @@ Within each subcategory characters are sorted according to the default
</tr>
</table>
</blockquote>
<p>Characters that are normally invisible are represented in the chart by their Unicode number, such as "U+FE00".</p>
<p>At the end of this document, there is an additional section that lists all <a href='#Visible_Combining_Marks_0'>visible non-spacing marks</a>.
These are sorted first by combining character class (modified), then by script, then by code point.</p>
<p>For comparison of Indic characters, see <a href='indic-trans.html'>indic-trans.html</a>.</p>
<h3>Additional <a name="Word_Characters">Word Characters</a></h3>
<p>This is a draft list of characters based on <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.48 $
* $Date: 2005/05/27 21:39:03 $
* $Revision: 1.49 $
*
*******************************************************************************
*/
@ -336,6 +336,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
}
public static String fromHex(String p) {
return fromHex(p, false);
}
public static String fromHex(String p, boolean acceptChars) {
StringBuffer output = new StringBuffer();
int value = 0;
int count = 0;
@ -357,13 +361,31 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
default:
int type = Character.getType(ch);
if (type != Character.SPACE_SEPARATOR) {
if (acceptChars) {
if (count >= 4 && count <= 6) {
UTF32.append32(output, value);
count = 0;
value = 0;
} else if (count != 0) {
output.append(p.substring(i-count, i)); // TODO fix supplementary characters
}
UTF32.append32(output, ch);
continue main;
}
throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
new Object[] {String.valueOf(ch), new Integer(i), p});
}
// fall through!!
case ' ': case ',': case ';': // do SPACE here, just for speed
if (count != 0) {
UTF32.append32(output, value);
if (count < 4 || count > 6) {
if (acceptChars) output.append(p.substring(i-count, i));
else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
new Object[] {String.valueOf(ch), new Integer(i), p});
} else {
UTF32.append32(output, value);
}
}
count = 0;
value = 0;
@ -378,7 +400,13 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
count++;
}
if (count != 0) {
UTF32.append32(output, value);
if (count < 4 || count > 6) {
if (acceptChars) output.append(p.substring(p.length()-count, p.length()));
else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
new Object[] {"EOS", new Integer(p.length()), p});
} else {
UTF32.append32(output, value);
}
}
return output.toString();
}