ICU-0 misc fixes

X-SVN-Rev: 17717
2005-05-27 21:43:46 +00:00 · 2005-05-27 21:43:46 +00:00 · 65e8ccde28
commit 65e8ccde28
parent 0176a784d1
9 changed files with 1348 additions and 409 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
@ -0,0 +1,480 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
+* $Date: 2005/05/27 21:40:51 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import com.ibm.icu.dev.test.util.ArrayComparator;
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.dev.test.util.UnicodePropertySource;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.text.utility.Utility;
+
+public class GenerateConfusables {
+	// Shared log writer: main() opens and closes it, DataSet.clean() writes its reports to it.
+	static PrintWriter log;
+	// The character U+2192, printed between source and target in generated lines.
+	static final String ARROW = "\u2192";
+
+	/**
+	 * Mutable holder for a replacement target and an associated count.
+	 * The source field is declared but the constructor leaves it unset.
+	 */
+	static class Data2 {
+		String source;
+		String target;
+		int count;
+
+		/**
+		 * @param target replacement text
+		 * @param count value stored alongside the target
+		 */
+		Data2(String target, int count) {
+			this.count = count;
+			this.target = target;
+		}
+	}
+	
+	// Property source created with an empty version string.
+	static ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+	// Code points that DataSet.add ignores: general categories Cn, Co, Cc and Cf.
+	// The gc=Cn set is copied before the addAll calls so that the union is built on a private
+	// set and never modifies the object handed out by the property source.
+	static UnicodeSet skipSet = new UnicodeSet(ups.getSet("gc=Cn"))
+		.addAll(ups.getSet("gc=Co"))
+		.addAll(ups.getSet("gc=Cc"))
+		.addAll(ups.getSet("gc=Cf"));
+	
+	/**
+	 * One mapping from a source string to a target string, tagged with a type label.
+	 * The natural order sorts by target, then by source, then by type.
+	 */
+	static class Data implements Comparable {
+		String source;
+		String target;
+		String type;
+
+		Data(String source, String target, String type) {
+			this.source = source;
+			this.target = target;
+			this.type = type;
+		}
+
+		/**
+		 * Orders by target first, then source, then type.
+		 * @param o the entry to compare with; must be a Data
+		 * @return negative, zero or positive, following the Comparable convention
+		 */
+		public int compareTo(Object o) {
+			Data other = (Data) o;
+			int diff = target.compareTo(other.target);
+			if (diff == 0) {
+				diff = source.compareTo(other.source);
+			}
+			if (diff == 0) {
+				diff = type.compareTo(other.type);
+			}
+			return diff;
+		}
+	}
+	
+	// The set [:Cc:]; DataSet.add(Data, String) prints a problem report for entries that contain any of it.
+	static UnicodeSet controls = new UnicodeSet("[:Cc:]");
+	
+	static class DataSet {
+		// All entries, kept in Data's natural order.
+		Set dataSet = new TreeSet();
+		// The same entries keyed by the array {source, target}; the ArrayComparator is built
+		// from two UTF16.StringComparator instances, one per array element.
+		Map dataMap = new TreeMap(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), new UTF16.StringComparator()}));
+
+		/**
+		 * Adds one source/target pair after filtering and simplifying it.
+		 * The pair is ignored when either string lies entirely inside skipSet or when both
+		 * strings have the same NFKD form.
+		 * @param source one side of the pair
+		 * @param target the other side of the pair
+		 * @param type label for the pair; a leading "confusables-" and a trailing ".txt" are removed
+		 * @param errorLine text passed on for the problem report printed by add(Data, String)
+		 * @return this DataSet
+		 */
+		public DataSet add(String source, String target, String type, String errorLine) {
+			if (skipSet.containsAll(source)) return this;
+			if (skipSet.containsAll(target)) return this;
+
+			final String sourceNfkd = Default.nfkd().normalize(source);
+			final String targetNfkd = Default.nfkd().normalize(target);
+			if (sourceNfkd.equals(targetNfkd)) {
+				// nothing but a compatibility match
+				return this;
+			}
+
+			final String prefix = "confusables-";
+			final String suffix = ".txt";
+			if (type.startsWith(prefix)) {
+				type = type.substring(prefix.length());
+			}
+			if (type.endsWith(suffix)) {
+				type = type.substring(0, type.length() - suffix.length());
+			}
+
+			// base + combining sequence => base2 + same combining sequence: keep only the two bases
+			final int sourceLead = UTF16.charAt(sourceNfkd, 0);
+			final String sourceTail = sourceNfkd.substring(UTF16.getCharCount(sourceLead));
+			final int targetLead = UTF16.charAt(targetNfkd, 0);
+			final String targetTail = targetNfkd.substring(UTF16.getCharCount(targetLead));
+			if (sourceTail.length() != 0 && sourceTail.equals(targetTail)) {
+				source = UTF16.valueOf(sourceLead);
+				target = UTF16.valueOf(targetLead);
+				type += "-base";
+			}
+
+			// orient the pair the way preferSecondAsSource asks for
+			if (preferSecondAsSource(source, target)) {
+				String swapped = source;
+				source = target;
+				target = swapped;
+			}
+			if (target.indexOf('\u203D') >= 0) {
+				type += "-skip";
+			}
+			return add(new Data(source, target, type), errorLine);
+		}
+		/**
+		 * Stores a ready-made entry. When an entry with the same source and target is already
+		 * present, the new type is appended to the existing one after a "/" instead.
+		 * An entry containing a control character is reported on System.out and still stored.
+		 * @param newData entry to store
+		 * @param errorLine text printed after "Problem with " in the report
+		 * @return this DataSet
+		 */
+		private DataSet add(Data newData, String errorLine) {
+			boolean hasControl = controls.containsSome(newData.source)
+					|| controls.containsSome(newData.target);
+			if (hasControl) {
+				System.out.println("Problem with " + errorLine);
+				System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
+			}
+			String[] pair = {newData.source, newData.target};
+			Data existing = (Data) dataMap.get(pair);
+			if (existing != null) {
+				existing.type = existing.type + "/" + newData.type;
+				return this;
+			}
+			dataSet.add(newData);
+			dataMap.put(pair, newData);
+			return this;
+		}
+		// Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
+		// File kinds distinguished by addFile; the kind selects how the fields of a line are read.
+		static final int NORMAL = 0, FOLDING = 1, OLD = 2;
+		
+		/**
+		 * Reads one data file and adds its mappings to this set.
+		 * Lines are obtained from Utility.readDataLine and split on ';'; a line with fewer than
+		 * two fields is reported on System.out and skipped.
+		 * <ul>
+		 * <li>File names containing "Folding": fields 0 and 1 are hex source and target; the pair
+		 *     is added unless the target equals the first code point of the source's NFKD form.
+		 * <li>Other files: fields 0 and 1 are hex source and target; field 2, when present,
+		 *     replaces the file name as the type.
+		 * </ul>
+		 * The OLD branch is unreachable at present because its selector is disabled with "false &&".
+		 * The reader is closed even when reading or parsing throws.
+		 * @param directory directory passed to BagFormatter.openUTF8Reader
+		 * @param filename file to read; also the default type of its entries
+		 * @return this DataSet
+		 * @throws IOException if the file cannot be opened or read
+		 */
+		public DataSet addFile(String directory, String filename) throws IOException {
+			BufferedReader in = BagFormatter.openUTF8Reader(directory, filename);
+			try {
+				int kind = NORMAL;
+				if (filename.indexOf("Folding") >= 0) kind = FOLDING;
+				else if (false && filename.indexOf("-old") >= 0) kind = OLD;
+				while (true) {
+					String line = Utility.readDataLine(in);
+					if (line == null) break;
+					if (line.length() == 0) continue;
+					String[] pieces = Utility.split(line,';');
+					if (pieces.length < 2) {
+						System.out.println("Error on: " + line);
+						continue;
+					}
+					String type = filename;
+					if (kind==FOLDING) {
+						String source = Utility.fromHex(pieces[0].trim(),true);
+						String target = Utility.fromHex(pieces[1].trim(),true);
+						String nsource = Default.nfkd().normalize(source);
+						String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
+						if (!first.equals(target)) {
+							add(source, target, type, line);
+						}
+					} else if (kind == OLD) {
+						String target = pieces[0].trim();
+						for (int i = 1; i < pieces.length; ++i) {
+							add(pieces[i].trim(), target, type, line);
+						}
+					} else {
+						String source = Utility.fromHex(pieces[0].trim(),true);
+						String target = Utility.fromHex(pieces[1].trim(),true);
+						if (pieces.length > 2) type = pieces[2].trim();
+						add(source, target, type, line);
+					}
+				}
+			} finally {
+				// release the file handle on the failure path as well
+				in.close();
+			}
+			return this;
+		}
+		/**
+		 * Writes every entry to a UTF-8 file, one line per entry: hex source, hex target and type
+		 * separated by " ;\t", followed by a "#" comment with the literal strings and their names.
+		 * The writer is closed even when writing throws.
+		 * @param directory directory passed to BagFormatter.openUTF8Writer
+		 * @param filename name of the file to write
+		 * @param appendFile when true, confusablesHeader.txt is copied in first, with %date%
+		 *        replaced by Default.getDate()
+		 * @throws IOException if the file cannot be opened or the header cannot be copied
+		 */
+		public void write(String directory, String filename, boolean appendFile) throws IOException {
+			PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
+			try {
+				if (appendFile) {
+					String[] replacements = {"%date%", Default.getDate()};
+					Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", 
+							Utility.UTF8_WINDOWS, out, replacements);
+				}
+				for (Iterator it = dataSet.iterator(); it.hasNext();) {
+					Data item = (Data) it.next();
+					out.println(
+							Utility.hex(item.source)
+							+ " ;\t" + Utility.hex(item.target)
+							+ " ;\t" + item.type
+							+ "\t# "
+							+ "( " + item.source + " " + ARROW + " " + item.target + ") " 
+							+ Default.ucd().getName(item.source) + " " + ARROW + " "
+							+ Default.ucd().getName(item.target));
+				}
+			} finally {
+				// close on the failure path too, so buffered output is flushed and the handle is released
+				out.close();
+			}
+		}
+		/**
+		 * 
+		 */
+		public void add(DataSet ds) {
+			for (Iterator it = ds.dataSet.iterator(); it.hasNext();) {
+				add((Data)it.next(), "");
+			}
+		}
+		/**
+		 * Builds a cleaned copy of this data set.
+		 * <p>Pass 1 drops entries whose type contains "skip" and entries whose source or target is
+		 * not already in NFKD form (those are written to the log), orients each pair with
+		 * preferSecondAsSource, and re-wires entries so that each source maps to one target;
+		 * conflicts are written to the log.
+		 * <p>Pass 2 rewrites each target code point by code point, following the source-to-target
+		 * map until no mapping applies, then re-normalizes the result to NFKD. An entry whose
+		 * target was changed by pass 2 gets "-rec" appended to its type.
+		 * @return a new DataSet holding the cleaned entries
+		 */
+		public DataSet clean() {
+			// remove all skips
+			DataSet tempSet = new DataSet();
+			Map m = new HashMap();
+			for (Iterator it = dataSet.iterator(); it.hasNext();) {
+				Data d = (Data) it.next();
+				if (d.type.indexOf("skip") >= 0) continue;
+				String newTarget = Default.nfkd().normalize(d.target);
+				String newSource = Default.nfkd().normalize(d.source);
+				String type = d.type;
+				if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
+					// not in NFKD form: log it and leave it out
+					type += "-nf";
+					log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
+					log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
+					continue;
+				}
+				// swap order
+				if (preferSecondAsSource(newSource, newTarget)) {
+					String temp = newTarget;
+					newTarget = newSource;
+					newSource = temp;
+				}
+
+				Data already = (Data) m.get(newSource);
+				if (already != null && !newTarget.equals(already.target)) {
+					// the same source already maps to a different target: log and re-wire
+					log.println("X " + getCodeCharName(newSource) + " " + ARROW);
+					log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
+					log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
+					if (preferSecondAsSource(already.target, newTarget)) {
+						// just fix new guy
+						type += "[" + newSource + "]" + already.type;
+						newSource = newTarget;
+						newTarget = already.target;
+					} else {
+						// need to fix new guy, AND fix old guy.
+						// remove before mutating: the set and map locate the entry by its current fields
+						tempSet.remove(already);
+						type += "[" + newSource + "]" + already.type;
+						newSource = already.target;
+						already.type += "[" + already.target + "]" + type;
+						already.target = newTarget;
+						tempSet.add(already, "");
+					}
+				}
+				Data newData = new Data(newSource, newTarget, type);
+				m.put(newSource, newData);
+				tempSet.add(newData, "");
+			}
+			// now recursively apply
+			DataSet s = new DataSet();
+			for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
+				Data d = (Data) it.next();
+				int cp = 0;
+				StringBuffer result = new StringBuffer();
+				for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
+					cp = UTF16.charAt(d.target, i);
+					String src = UTF16.valueOf(cp);
+					// follow the replacement chain; stop on a repeated string so a cyclic
+					// mapping cannot loop forever
+					Set seen = new TreeSet();
+					while (seen.add(src)) {
+						Data rep = (Data) m.get(src);
+						if (rep == null) break;
+						src = rep.target;
+					}
+					result.append(src);
+				}
+				String newTarget = Default.nfkd().normalize(result.toString());
+				// compare with the ORIGINAL target; comparing newTarget with itself was always true
+				boolean changed = !d.target.equals(newTarget);
+				s.add(d.source, newTarget, d.type + (changed ? "-rec" : ""), "");
+			}
+			return s;
+		}
+		/**
+		 * Removes an entry from both the keyed map and the ordered set.
+		 * The key is built from the entry's current source and target.
+		 * @param already the entry to remove
+		 */
+		private void remove(Data already) {
+			dataMap.remove(new String[] {already.source, already.target});
+			dataSet.remove(already);
+		}
+	}
+	/**
+	 * Entry point: reads the files under Utility.BASE_DIR + "confusables/" and writes the
+	 * results, together with log.txt, under Utility.GEN_DIR + "confusables/".
+	 * The log is closed even when generation fails.
+	 * @param args not used
+	 * @throws IOException if a file cannot be read or written
+	 */
+	public static void main(String[] args) throws IOException {
+		String indir = Utility.BASE_DIR + "confusables/";
+		String outdir = Utility.GEN_DIR + "confusables/";
+		log = BagFormatter.openUTF8Writer(outdir, "log.txt");
+		try {
+			//fixMichel(indir, outdir);
+			generateConfusables(indir, outdir);
+		} finally {
+			// close on the failure path too, so buffered log output is not lost
+			log.close();
+		}
+		System.out.println("Done");
+	}
+	/**
+	 * Filters michel/tr36comments-annex.txt into new-tr36comments-annex.txt.
+	 * A line with fewer than two tab-separated fields is copied unchanged; any other line is
+	 * copied only when the string decoded from its first (hex) field is NFKD-normalized.
+	 * Both files are closed even when processing fails.
+	 * @param indir input directory; "michel/" is appended to it
+	 * @param outdir output directory
+	 * @throws IOException if a file cannot be opened, read or written
+	 */
+	private static void fixMichel(String indir, String outdir) throws IOException {
+		BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
+		try {
+			PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
+			try {
+				while (true) {
+					String line = Utility.readDataLine(in);
+					if (line == null) break;
+					String[] pieces = Utility.split(line,'\t');
+					if (pieces.length < 2) {
+						out.println(line);
+						continue;
+					}
+					String source = Utility.fromHex(pieces[0].trim());
+					if (Default.nfkd().isNormalized(source)) {
+						out.println(line);
+					}
+				}
+			} finally {
+				out.close();
+			}
+		} finally {
+			in.close();
+		}
+	}
+	/**
+	 * Processes every plain file in the input directory.
+	 * Each file is read into its own DataSet and written back as "new-" + name; the union of
+	 * all files is written as confusables-raw.txt, and its cleaned form as confusables.txt
+	 * with the header file prepended.
+	 * Files are handled in sorted name order: File.list() makes no ordering promise, and the
+	 * order in which files are merged decides the order of the "/"-joined type labels.
+	 * @param indir input directory; expected to end with a path separator, because file names
+	 *        are appended to it directly
+	 * @param outdir output directory
+	 * @throws IOException if the input directory cannot be listed or a file cannot be read or written
+	 */
+	private static void generateConfusables(String indir, String outdir) throws IOException {
+		File dir = new File(indir);
+		String[] names = dir.list();
+		if (names == null) {
+			// File.list() returns null when the path is not a directory or cannot be read
+			throw new IOException("Cannot list input directory: " + indir);
+		}
+		java.util.Arrays.sort(names);
+		DataSet total = new DataSet();
+		for (int i = 0; i < names.length; ++i) {
+			if (new File(indir + names[i]).isDirectory()) continue;
+			System.out.println(names[i]);
+			DataSet ds = new DataSet();
+			ds.addFile(indir, names[i]);
+			ds.write(outdir, "new-" + names[i], false);
+			total.add(ds);
+		}
+		total.write(outdir, "confusables-raw.txt", false);
+		DataSet clean = total.clean();
+		clean.write(outdir, "confusables.txt", true);
+	}
+	/*
+		BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
+		Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), 
+				new UTF16.StringComparator()}));
+		while (true) {
+			String line = Utility.readDataLine(in);
+			if (line == null) break;
+			if (line.length() == 0) continue;
+			String[] pieces = Utility.split(line,';');
+			if (pieces.length < 2) {
+				System.out.println("Error on: " + line);
+				continue;
+			}
+			String source = Utility.fromHex(pieces[0].trim());
+			String target = Utility.fromHex(pieces[1].trim());
+			String nsource = Default.nfkd().normalize(source);
+			String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
+			if (!first.equals(target)) {
+				set.add(new String[]{source, target});
+			}
+		}
+		in.close();
+
+	}
+	public static void gen() throws IOException {
+		Map m = new TreeMap();
+		BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
+		while (true) {
+			String line = in.readLine();
+			if (line == null) break;
+			String[] pieces = Utility.split(line,';');
+			if (pieces.length < 3) {
+				System.out.println("Error on: " + line);
+				continue;
+			}
+			int codepoint = Integer.parseInt(pieces[1], 16);
+			int cat = Default.ucd().getCategory(codepoint);
+			if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
+			if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
+			String result = Utility.fromHex(pieces[0]);
+			if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
+			int count = Integer.parseInt(pieces[2]);
+			String source = UTF16.valueOf(codepoint);
+			add(m, source, result, count);
+		}
+		in.close();
+
+		in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
+		while (true) {
+			String line = in.readLine();
+			if (line == null) break;
+			line = line.trim();
+			int pos = line.indexOf("#");
+			if (pos >= 0) line = line.substring(0,pos).trim();
+			if (line.length() == 0) continue;
+			if (line.startsWith("@")) continue;
+			String[] pieces = Utility.split(line,';');
+			if (pieces.length < 2) {
+				System.out.println("Error on: " + line);
+				continue;
+			}
+			String source = pieces[0].trim();
+			for (int i = 1; i < pieces.length; ++i) {
+				add(m, source, pieces[i].trim(), -1);
+			}
+		}
+		in.close();
+
+		boolean gotOne;
+		// close the set
+		do {
+			gotOne = false;
+			for (Iterator it = m.keySet().iterator(); it.hasNext();) {
+				String source = (String) it.next();
+				Data2 data = (Data2) m.get(source);
+				Data2 data2 = (Data2) m.get(data.target);
+				if (data2 == null) continue;
+				data.target = data2.target;
+				gotOne = true;
+				break;
+			}
+		} while (gotOne);
+		// put into different sorting order
+		Set s = new TreeSet();
+		for (Iterator it = m.keySet().iterator(); it.hasNext();) {
+			String source = (String) it.next();
+			Data2 data = (Data2) m.get(source);
+			s.add(new Data(source, data.target, data.count));
+		}
+		// write it out
+		PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
+		String[] replacements = {"%date%", Default.getDate()};
+		Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", 
+				Utility.UTF8_WINDOWS, out, replacements);
+		for (Iterator it = s.iterator(); it.hasNext();) {
+			Data d = (Data) it.next();
+			if (d == null) continue;
+			out.println(formatLine(d.source, d.target, d.count));
+		}
+		
+		out.close();
+		System.out.println("Done");
+	}
+	/**
+	 * 
+	 */
+	// Formats one mapping as "hex ; hex ; count # (source ARROW target) name ARROW name".
+	private static String formatLine(String source, String target, int count) {
+		String hexPart = Utility.hex(source) + " ; " + Utility.hex(target," ");
+		String literalPart = "(" + source + " " + ARROW + " " + target + ") ";
+		String namePart = Default.ucd().getName(source)
+				+ " " + ARROW + " " + Default.ucd().getName(target);
+		return hexPart + " ; " + count + " # " + literalPart + namePart;
+	}
+	/**
+	 * Records source -> target in the map, oriented by preferSecondAsSource.
+	 * Empty strings are ignored. When the source already maps to a different target, the
+	 * conflict is printed and target -> existing target is added instead.
+	 * @param m map from source string to Data2
+	 * @param source one side of the pair
+	 * @param target the other side of the pair
+	 * @param count value stored with a newly created mapping
+	 */
+	private static void add(Map m, String source, String target, int count) {
+		if (source.length() == 0 || target.length() == 0) {
+			return;
+		}
+		if (preferSecondAsSource(source, target)) {
+			String swapped = source;
+			source = target;
+			target = swapped;
+		}
+		Data2 existing = (Data2) m.get(source);
+		if (existing == null) {
+			m.put(source, new Data2(target, count));
+			return;
+		}
+		if (target.equals(existing.target)) {
+			return;
+		}
+		System.out.println("conflict");
+		System.out.println(formatLine(source, target, count));
+		System.out.println(formatLine(source, existing.target, existing.count));
+		// skip adding this, and instead add result -> other.target
+		add(m, target, existing.target, count);
+	}
+	
+	/**
+	 * Decides whether b rather than a should be the source of a pair.
+	 * @return true when a has more code points than b, or when both have the same number of
+	 *         code points and a sorts before b in String.compareTo order
+	 */
+	static private boolean preferSecondAsSource(String a, String b) {
+		int lengthA = UTF16.countCodePoint(a);
+		int lengthB = UTF16.countCodePoint(b);
+		if (lengthA == lengthB) {
+			// same length: the lower string gives way
+			return a.compareTo(b) < 0;
+		}
+		// different lengths: the longer string gives way
+		return lengthA > lengthB;
+	}
+	
+	// Describes a string as its code, the literal text in parentheses, and its name.
+	static String getCodeCharName(String a) {
+		String code = Default.ucd().getCode(a);
+		String name = Default.ucd().getName(a);
+		return code + "(  " + a + "  ) " + name;
+	}
+	
+}
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java
@ -0,0 +1,125 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
+* $Date: 2005/05/27 21:40:51 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import com.ibm.text.utility.*;
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import java.util.*;
+import java.io.*;
+
+public final class GenerateNamedSequences implements UCD_Types {
+    
+    static final boolean DEBUG = false;
+    
+    /**
+     * Builds an HTML img element that points at the unicode.org varglyph CGI for a pair of codes.
+     * @param code0 first code, as text
+     * @param code1 second code, as text
+     * @param shape shaping environment, or "" for none; at most its first four characters go
+     *        into the image URL, so a shape shorter than four characters is used whole
+     * @param description when shape is non-empty and this contains "feminine", "fem" is
+     *        appended to the abbreviated shape
+     * @return the img element
+     */
+    static public String showVarGlyphs(String code0, String code1, String shape, String description) {
+        if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
+        
+        String abbShape = "";
+        if (shape.length() != 0) {
+            // clamp the end index: substring(0,4) throws for shapes shorter than four characters
+            abbShape = '-' + shape.substring(0, Math.min(4, shape.length()));
+            if (description.indexOf("feminine") >= 0) abbShape += "fem";
+        }
+        
+        return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape 
+            + "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
+    }
+    
+/*
+#   Field 0: the variation sequence
+#   Field 1: the description of the desired appearance
+#   Field 2: where the appearance is only different in particular shaping environments
+#	this field lists them. The possible values are: isolated, initial, medial, final.
+#	If more than one is present, there are spaces between them.
+*/
+    static public void generate() throws IOException {
+        
+        
+        // read the data and compose the table
+        
+        String table = "<table><tr><th width='10%'>Rep Glyph</th><th>Hex Sequence</th><th>Name</th><th>Copyable</th></tr>";
+        
+        String[] splits = new String[4];
+        String[] codes = new String[20];
+        String[] shapes = new String[4];
+        
+        BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1);
+        Transliterator unicodexml = Transliterator.getInstance("hex/xml");
+        while (true) {
+            String line = Utility.readDataLine(in);
+            if (line == null) break;
+            line = line.trim();
+            if (line.length() == 0) continue;
+            
+            int count = Utility.split(line, ';', splits);
+            String name = splits[0];
+            int codeCount = Utility.split(splits[1], ' ', codes);
+            StringBuffer codeBuffer = new StringBuffer();
+            for (int i = 0; i < codeCount; ++i) {
+            	UTF16.append(codeBuffer, Integer.parseInt(codes[i],16));
+            }
+            String codeWithHyphens = splits[1].replaceAll("\\s", "-");
+            String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+");
+            String codeString = unicodexml.transliterate(codeBuffer.toString());
+            
+            // <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
+            
+            //table += "<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n";
+            String imageName = "images/U" + codeWithHyphens + ".gif";
+            if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) {
+                String codeNoSpaces2 = splits[1].replaceAll("\\s", "");
+            	imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif";
+            }
+            table += "<tr>"
+               		+ "<td class='copy'><img alt='(" + codeAlt + ")' src='" + imageName + "'><br><tt>"
+					+ splits[1] + "</tt></td>"
+ 					+ "<td>" + splits[1] + "</td>"
+					+ "</td><td>" + name + "</td>" 
+              		+ "<td class='copy'>" + codeString + "</td>"
+					+ "</tr>\n";
+            System.out.println(splits[1] + "\t" + codeString);
+        }
+        in.close();            
+        table += "</table>";
+     
+        // now write out the results
+        
+        String directory = "DerivedData/";
+        String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true);
+        PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
+        /*
+        String[] batName = {""};
+        String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
+        
+        String version = Default.ucd().getVersion();
+        int lastDot = version.lastIndexOf('.');
+        String updateDirectory = version.substring(0,lastDot) + "-Update";
+        int updateV = version.charAt(version.length()-1) - '0';
+        if (updateV != 0) updateDirectory += (char)('1' + updateV);
+        if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
+        */
+        
+        String[] replacementList = {
+            "@revision@", Default.ucd().getVersion(),
+            //"@updateDirectory@", updateDirectory,
+            "@date@", Default.getDate(),
+            "@table@", table};
+                
+        Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList);
+     
+        out.close();
+        //Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
+    }
+}
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java
@ -0,0 +1,515 @@
+/*
+ * Created on May 3, 2005
+ * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
+ * For terms of use, see http://www.unicode.org/terms_of_use.html
+ */
+package com.ibm.text.UCD;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.CollectionUtilities;
+import com.ibm.icu.dev.test.util.UnicodeLabel;
+import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.IDNA;
+import com.ibm.icu.text.StringPrepParseException;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.text.UTF16.StringComparator;
+import com.ibm.icu.util.ULocale;
+import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator;
+import com.ibm.text.UCD.TestData.RegexMatcher;
+import com.ibm.text.utility.Utility;
+
+
+class GenerateStringPrep implements UCD_Types {
+	
+	public static void main (String[] args) throws IOException {
+		//checkChars(false);
+		new GenerateStringPrep().genStringPrep();
+		System.out.println("Done");
+	}
+	
+	// Code points per script, indexed by the script code from ucd.getScript; filled in genStringPrep.
+	UnicodeSet[] coreChars = new UnicodeSet[100];
+	// Code points that are not NFD-normalized; filled in genStringPrep.
+	UnicodeSet decomposable = new UnicodeSet();
+	// Values listed in the "FOR REVIEW" section of idn-chars.txt; not populated in the code visible here.
+	UnicodeMap suspect = new UnicodeMap();
+	
+	// Property sources: one created with an empty version string, one for version 3.2.0.
+	ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+	ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0");
+	//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
+	UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
+	// Extra characters treated as word characters; filled by the initializer block below.
+	UnicodeSet wordChars = new UnicodeSet();
+	{
+		// disabled: would seed the set from MODIFIER LETTER names restricted to gc=Sk
+		if (false) {
+			wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
+			wordChars.retainAll(ups.getSet("gc=Sk"));
+		}
+		// fixed list of individual code points
+		wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
+		" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
+		" \\u055A \\u02B9 \\u02BA]"));
+		//wordChars.removeAll(xid_continue);
+	}
+	
+	// Each derived set is built on a private copy, so the in-place removeAll/complement calls
+	// never modify an object handed out by the property source.
+	// Pattern_Syntax characters other than the extra word characters.
+	UnicodeSet patternProp = new UnicodeSet(ups.getSet("Pattern_Syntax=true")).removeAll(wordChars);
+	// Everything whose NFKC_Quickcheck value is not NO.
+	UnicodeSet isNFKC = new UnicodeSet(ups.getSet("NFKC_Quickcheck=NO")).complement();
+	// Me and Mn characters, minus the default-ignorable code points.
+	UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me"))
+		.addAll(ups.getSet("gc=Mn"))
+		.removeAll(ups.getSet("Default_Ignorable_Code_Point=true"));
+	
+	// Everything outside XID_Continue, minus the extra word characters.
+	UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
+	
+	//UnicodeSet[] decompChars = new UnicodeSet[100];
+	UCD ucd = Default.ucd();
+
+	// English collator used as the first comparator of uca.
+	static Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
+	// Static initializer: the strength is set once at class initialization, before uca is
+	// built. As an instance initializer it only ran when an object was constructed.
+	static {
+		uca0.setStrength(Collator.IDENTICAL);
+	}
+	// Compares with uca0 first, then with UTF16.StringComparator.
+	static GenerateHanTransliterator.MultiComparator uca 
+		= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
+				uca0, new UTF16.StringComparator()});
+
+	// Characters with Bidi_Class AL or R.
+	UnicodeSet bidiR = new UnicodeSet(
+			"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
+
+	// Characters with Bidi_Class L.
+	UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
+	// Code points equal to their own full uppercase mapping; filled in genStringPrep.
+	UnicodeSet hasNoUpper = new UnicodeSet();
+	// hasNoUpper without the extra word characters; assigned in genStringPrep.
+	UnicodeSet hasNoUpperMinus = new UnicodeSet();
+	// Formatter for the generated listings; configured, and later replaced, in genStringPrep.
+	BagFormatter bf = new BagFormatter();
+	// Not populated in the code visible here.
+	UnicodeSet inIDN = new UnicodeSet();
+	// Code points equal to their own full case folding; filled in genStringPrep.
+	UnicodeSet isCaseFolded = new UnicodeSet();
+
+	/**
+	 * Classifies every code point and writes the IDN and script listings.
+	 * <p>Step 1 scans U+0000..U+10FFFF, skipping general categories Cn, Co and Cs, and fills
+	 * decomposable, idnaTypeSet, hasNoUpper, isCaseFolded and the per-script coreChars sets.
+	 * <p>Step 2 writes idn-chars.html, script-chars.html and idn-chars.txt into GEN_DIR through
+	 * showCodes, then a second text file, idn_vs_cfnfkcid.txt, comparing CF_NFKC_ID with the
+	 * OK IDNA set.
+	 * <p>NOTE(review): none of the writers is closed if an exception is thrown part-way.
+	 * @throws IOException if an output file cannot be opened or the HTML header cannot be copied
+	 */
+	void genStringPrep() throws IOException {
+		//showScriptToBlock();
+		bf.setShowLiteral(BagFormatter.toHTMLControl);
+		bf.setUnicodePropertyFactory(ups);
+		//bf.setValueSource(UnicodeLabel.NULL);
+		// disabled debugging output
+		if (false) {
+			
+			System.out.println("word chars: " + bf.showSetNames(wordChars));
+			System.out.println("pat: " + bf.showSetNames(patternProp));
+			System.out.println("xid: " + bf.showSetNames(not_xid_continue));
+		}
+		for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+			Utility.dot(cp);
+			int cat = Default.ucd().getCategory(cp);
+			// skip categories Cn, Co and Cs
+			if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
+			if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
+			// get IDNA
+			int idnaType = getIDNAType(cp);
+			idnaTypeSet[idnaType].add(cp);
+			
+			String str = UTF16.valueOf(cp);
+			// unchanged by full uppercasing / full case folding
+			if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
+			if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp);
+			
+			// scripts
+			int script = ucd.getScript(cp);
+			if (coreChars[script] == null)
+				coreChars[script] = new UnicodeSet();
+			coreChars[script].add(cp);
+		}
+		// fix characters with no uppercase
+		hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
+		// NOTE(review): this prints hasNoUpper, not the hasNoUpperMinus set computed just above - confirm intent
+		System.out.println(bf.showSetNames(hasNoUpper));
+		
+		Utility.fixDot();
+		PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
+		PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
+		PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
+		// the text file starts with U+FEFF on a line of its own
+		textOut.println('\uFEFF');
+		textOut.println("For documentation, see idn-chars.html");
+		
+		Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut, 
+				new String[] {"%date%", Default.getDate()});
+		/*
+		out
+				.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
+		out.println("<title>IDN Characters</title><style>");
+		out.println("<!--");
+		out.println(".script       { font-size: 150%; background-color: #CCCCCC }");
+		out.println(".Atomic       { background-color: #CCCCFF }");
+		out.println(".Atomic-no-uppercase       { background-color: #CCFFCC }");
+		out.println(".Non-XID       { background-color: #FFCCCC }");
+		out.println(".Decomposable       { background-color: #FFFFCC }");
+		out.println(".Pattern_Syntax       { background-color: #FFCCFF }");
+		
+		out.println("th           { text-align: left }");
+		out.println("-->");
+		out.println("</style></head><body><table>");
+		*/
+		htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
+		htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
+
+		// every script except Common and Inherited first; those two are emitted last
+		for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
+			if (scriptCode == COMMON_SCRIPT
+					|| scriptCode == INHERITED_SCRIPT)
+				continue;
+			showCodes(htmlOut, textOut, scriptCode, htmlOut2);
+		}
+		showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
+		showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
+
+		showCodes(htmlOut, textOut, non_spacing);
+		htmlOut.println("</table></body></html>");
+		htmlOut.close();
+		htmlOut2.println("</table></body></html>");
+		htmlOut2.close();
+		bf.setMergeRanges(false);
+
+		textOut.println();
+		textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
+		textOut.println();
+		bf.setValueSource("word-chars");
+		bf.showSetNames(textOut, wordChars);
+		
+		textOut.println();
+		textOut.println("# *** FOR REVIEW ***");
+		bf.setLabelSource(UnicodeLabel.NULL);
+		// one section per distinct value of the suspect map, in sorted order
+		for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
+			textOut.println();
+			String value = (String)it.next();
+			bf.setValueSource(value);
+			bf.showSetNames(textOut, suspect.getSet(value));
+		}
+		textOut.close();
+		// second report, written with a fresh formatter
+		textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt");
+		bf = new BagFormatter();
+		bf.setUnicodePropertyFactory(ups);
+		textOut.println();
+		textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***");
+		// complement of the 3.2.0 source's gc=cn set
+		UnicodeSet U32 = ups32.getSet("gc=cn").complement();
+		UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32);		
+		bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]);
+		textOut.close();
+
+	}
+	
+	/**
+	 * Debugging aid: prints each distinct "script TAB block" combination, one per line.
+	 * Always ends by throwing IllegalArgumentException.
+	 */
+	private void showScriptToBlock() {
+		UnicodeMap scriptMap = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
+		UnicodeMap blockMap = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
+		// joins a script value and a block value with a tab
+		UnicodeMap.Composer joinWithTab = new UnicodeMap.Composer() {
+			public Object compose(Object a, Object b) {
+				return a + "\t" + b;
+			}
+		};
+		UnicodeMap combined = ((UnicodeMap)scriptMap.clone()).composeWith(blockMap, joinWithTab);
+		Iterator values = combined.getAvailableValues(new TreeSet()).iterator();
+		while (values.hasNext()) {
+			System.out.println(values.next());
+		}
+		throw new IllegalArgumentException();
+	}
+	
+	// Lookup from script name to image file name, built from the script_to_gif table below.
+	// showCodes() uses it to pick the <img> shown in a script's header row.
+	Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
+	
+	// Each row is {script name, image file name}. The trailing comment on a row names a
+	// Unicode block (presumably the block the image was taken from -- TODO confirm).
+	// NOTE(review): "Han" is listed twice with different images; which one ends up in
+	// scriptToGif depends on how CollectionUtilities.asMap treats duplicate keys -- confirm.
+	static String[][] script_to_gif = {
+			
+		{"Common","common.gif"}, //Miscellaneous_Symbols
+		{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
+		{"Arabic","arabic.gif"}, //Arabic
+		{"Armenian","armenian.gif"}, //Armenian
+		{"Bengali","bengali.gif"}, //Bengali
+		{"Bopomofo","bopomofo.gif"}, //Bopomofo
+		{"Braille","braillesymbols.gif"}, //Braille_Patterns
+		{"Buginese","buginese.gif"}, //Buginese
+		{"Buhid","buhid.gif"}, //Buhid
+		{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
+		{"Cherokee","cherokee.gif"}, //Cherokee
+		{"Coptic","coptic.gif"}, //Coptic
+		{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
+		{"Cyrillic","cyrillic.gif"}, //Cyrillic
+		{"Deseret","deseret.gif"}, //Deseret
+		{"Devanagari","devanagari.gif"}, //Devanagari
+		{"Ethiopic","ethiopic.gif"}, //Ethiopic
+		{"Georgian","georgian.gif"}, //Georgian
+		{"Glagolitic","glagolitic.gif"}, //Glagolitic
+		{"Gothic","gothic.gif"}, //Gothic
+		{"Greek","greek.gif"}, //Greek_and_Coptic
+		{"Gujarati","gujarati.gif"}, //Gujarati
+		{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
+		{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
+		{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
+		{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
+		{"Hanunoo","hanunoo.gif"}, //Hanunoo
+		{"Hebrew","hebrew.gif"}, //Hebrew
+		{"Hiragana","hiragana.gif"}, //Hiragana
+		{"Kannada","kannada.gif"}, //Kannada
+		{"Katakana","katakana.gif"}, //Katakana
+		{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
+		{"Khmer","khmer.gif"}, //Khmer
+		{"Lao","lao.gif"}, //Lao
+		{"Latin","latin.gif"}, //Basic_Latin
+		{"Limbu","limbu.gif"}, //Limbu
+		{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
+		{"Malayalam","malayalam.gif"}, //Malayalam
+		{"Mongolian","mongolian.gif"}, //Mongolian
+		{"Myanmar","myanmar.gif"}, //Myanmar
+		{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
+		{"Ogham","ogham.gif"}, //Ogham
+		{"Old_Italic","olditalic.gif"}, //Old_Italic
+		{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
+		{"Oriya","oriya.gif"}, //Oriya
+		{"Osmanya","osmanya.gif"}, //Osmanya
+		{"Runic","runic.gif"}, //Runic
+		{"Shavian","shavian.gif"}, //Shavian
+		{"Sinhala","sinhala.gif"}, //Sinhala
+		{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
+		{"Syriac","syriac.gif"}, //Syriac
+		{"Tagalog","tagalog.gif"}, //Tagalog
+		{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
+		{"Tai_Le","taile.gif"}, //Tai_Le
+		{"Tamil","tamil.gif"}, //Tamil
+		{"Telugu","telugu.gif"}, //Telugu
+		{"Thaana","thaana.gif"}, //Thaana
+		{"Thai","thai.gif"}, //Thai
+		{"Tibetan","tibetan.gif"}, //Tibetan
+		{"Tifinagh","tifinagh.gif"}, //Tifinagh
+		{"Ugaritic","ugaritic.gif"}, //Ugaritic
+		{"Yi","yi.gif"}, //Yi_Syllables
+
+	};
+	
+	// One set of code points per IDNA classification, indexed by the
+	// OK / DELETED / ILLEGAL / REMAPPED constants declared below.
+	UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
+	{
+		// Instance initializer: give every slot an empty set so no entry is null.
+		for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
+	}
+	// Classification codes returned by getIDNAType(); IDNA_TYPE_LIMIT is the number of
+	// codes and is used as the size of idnaTypeSet.
+	static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
+	/**
+	 * Classifies a single code point by running it through IDNA ToASCII and then
+	 * ToUnicode and comparing the round-tripped text with the input.
+	 *
+	 * @param cp the code point to classify
+	 * @return DELETED if ToASCII yields an empty result; ILLEGAL if either conversion
+	 *         throws; REMAPPED if the round trip differs from the input; otherwise OK
+	 */
+	private int getIDNAType(int cp) {
+		// The shared input buffer must hold exactly the one code point under test.
+		inbuffer.setLength(0);
+		UTF16.append(inbuffer, cp);
+		try {
+			intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES
+			if (intermediate.length() == 0) {
+				return DELETED;
+			}
+			outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES);
+		} catch (StringPrepParseException e) {
+			return ILLEGAL;
+		} catch (Exception e) {
+			// Any other failure is reported, then treated the same as a prohibited character.
+			System.out.println("Failure at: " + Utility.hex(cp));
+			return ILLEGAL;
+		}
+		return TestData.equals(inbuffer, outbuffer) ? OK : REMAPPED;
+	}
+	// Scratch buffers used by getIDNAType(): inbuffer holds the code point under test,
+	// intermediate the ToASCII result and outbuffer the ToUnicode result.
+	StringBuffer inbuffer = new StringBuffer();
+	StringBuffer intermediate, outbuffer;
+
+	// Set built from the [:Lowercase:] pattern.
+	// NOTE(review): not referenced in the visible part of this class -- confirm it is still needed.
+	UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
+
+	/**
+	 * Writes the report section for one script: a header row to both HTML outputs,
+	 * a header line to the text output, and then one table per character category.
+	 * Returns immediately if no characters were collected for the script
+	 * (coreChars[scriptCode] is null).
+	 * <p>
+	 * The script's characters are partitioned by successive calls to extract(). Each
+	 * call removes the extracted characters from the set passed as its second argument,
+	 * so the order of the calls decides which category a character ends up in.
+	 * Characters in the no-uppercase and non-XID groups are also recorded in the
+	 * suspect map, keyed to their general category ID (or "XX" for certain names).
+	 *
+	 * @param htmlOut main HTML output; receives the header row and every category table
+	 * @param textOut text output; receives the script header and the category listings
+	 * @param scriptCode script index; used to index coreChars and to look up the script name
+	 * @param htmlOut2 second HTML output; receives only the script header row
+	 */
+	private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
+		if (coreChars[scriptCode] == null) return;
+		String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
+		script = Utility.getUnskeleton(script.toLowerCase(),true);
+		System.out.println(script);
+		
+		htmlOut.println();
+		// NOTE(review): if scriptToGif has no entry for this script name, get() presumably
+		// returns null and toLowerCase() throws NullPointerException -- confirm every script has an image.
+		String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
+		+ "'> Script: " + script + "</th></tr>";
+		htmlOut.println(scriptLine);
+		htmlOut2.println(scriptLine);
+		textOut.println();
+		textOut.println("#*** Script: " + script + " ***");
+		// Work on a copy: the extract() calls below remove characters from it.
+		UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
+		
+		// First pull out the characters by IDNA classification.
+		UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
+		UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
+		UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
+		
+		// Split the remapped characters further; each extract() shrinks its source set.
+		UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
+		UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
+		
+		// Then partition what is left of core.
+		UnicodeSet decomp = extract(decomposable, core);
+		UnicodeSet pattern = extract(patternProp, core);
+		UnicodeSet non_id = extract(not_xid_continue, core);
+		
+		// Only split off no-uppercase characters when some remaining character is outside hasNoUpper.
+		UnicodeSet bicameralNoupper = new UnicodeSet();
+		if (!hasNoUpper.containsAll(core)) {
+			bicameralNoupper = extract(hasNoUpperMinus, core);
+		}
+
+		// Record the no-uppercase and non-XID characters in the suspect map for later review.
+		UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
+		for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
+			String cat = Default.ucd().getCategoryID(it.codepoint);
+			String name = Default.ucd().getName(it.codepoint);
+			// Characters whose name contains one of these substrings are filed under "XX"
+			// instead of their general category.
+			if (name.indexOf("MUSICAL SYMBOL") >= 0 
+					|| name.indexOf("DINGBA") >= 0 
+					|| name.indexOf("RADICAL ") >= 0 
+					 						) cat = "XX";
+			suspect.put(it.codepoint, cat);
+		}
+		
+		// Emit one table per non-empty category, all ordered with the uca comparator.
+		if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca);
+		if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca);
+		if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca);
+		if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca);
+		if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca);
+
+		if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca);
+		if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca);
+		if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca);
+		if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca);
+		if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca);
+	}
+	
+	/**
+	 * Writes the header row for the Inherited script to the HTML output and then one
+	 * "Visible_Combining_Marks_&lt;type&gt;" table per position type read by getPositions(),
+	 * restricted to the characters in non_spacing. Types with no such characters are skipped.
+	 *
+	 * @param htmlOut HTML output; receives the header row and the tables
+	 * @param textOut text output; receives the listings written by printlnSet
+	 * @param uset not referenced in this method body
+	 * @throws IOException if the positions file cannot be read
+	 */
+	private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException {
+		String script = Utility.getUnskeleton(
+				Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT).toLowerCase(),true);
+		String gifName = ((String)scriptToGif.get(script)).toLowerCase();
+		htmlOut.println("<tr><th class='script'><img src='images/" + gifName + "'> Script: " + script + "</th></tr>");
+
+		UnicodeMap positions = getPositions();
+		// Walk the position types in uca order.
+		Iterator types = positions.getAvailableValues(new TreeSet(uca)).iterator();
+		while (types.hasNext()) {
+			String type = (String) types.next();
+			UnicodeSet marks = positions.getSet(type).retainAll(non_spacing);
+			if (marks.size() != 0) {
+				printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type, marks, INHERITED_SCRIPT, positionComparator);
+			}
+		}
+	}
+	
+	/**
+	 * Reads positions.txt and builds a map from code point to position type.
+	 * <p>
+	 * A line starting with "@" sets the current type to the rest of that line; the type
+	 * is "Undetermined" until the first such line. Empty lines are skipped. Any other
+	 * line is split on ';' and its first field is parsed as hex; the first code point of
+	 * the result is mapped to the current type.
+	 * <p>
+	 * The reader is always closed before returning, including when reading fails.
+	 * NOTE(review): the input directory is a hard-coded absolute Windows path.
+	 *
+	 * @return map from code point to the type named by the preceding "@" line
+	 * @throws IOException if the file cannot be opened, read or closed
+	 */
+	private UnicodeMap getPositions() throws IOException {
+		UnicodeMap result = new UnicodeMap();
+		BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt");
+		try {
+			String type="Undetermined";
+			while (true) {
+				String line = Utility.readDataLine(in);
+				if (line == null) break; // end of input
+				if (line.length() == 0) continue;
+				if (line.startsWith("@")) {
+					// Section marker: everything after '@' is the type for the lines that follow.
+					type = line.substring(1);
+					continue;
+				}
+				String[] pieces = Utility.split(line, ';');
+				String code = Utility.fromHex(pieces[0]);
+				result.put(UTF16.charAt(code,0), type);
+			}
+		} finally {
+			// Release the file handle on every path; the original never closed the reader.
+			in.close();
+		}
+		return result;
+	}
+
+	// Orders strings by the name that Default.ucd().getName() reports for them.
+	// Both arguments must be Strings.
+	static Comparator positionComparator = new Comparator() {
+		public int compare(Object o1, Object o2) {
+			final String left = (String)o1;
+			final String right = (String)o2;
+			String leftName = Default.ucd().getName(left);
+			String rightName = Default.ucd().getName(right);
+			return leftName.compareTo(rightName);
+		}
+	};
+
+	/**
+	 * Moves the characters that core shares with other out of core.
+	 *
+	 * @param other the set to intersect with; not modified
+	 * @param core the set to take characters from; the shared characters are removed from it
+	 * @return a new set holding the characters that were in both core and other
+	 */
+	private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
+		UnicodeSet shared = new UnicodeSet(core);
+		shared.retainAll(other);
+		core.removeAll(shared);
+		return shared;
+	}
+
+	/**
+	 * Writes one category table. The HTML output gets a header row with the title and
+	 * the set size, followed by a cell holding every character formatted by formatCode().
+	 * The text output gets a "# title" line followed by the listing produced by
+	 * bf.showSetNames(). Does nothing if unicodeset is null.
+	 * <p>
+	 * For the Han and Hangul scripts the HTML cell shows ranges (first and last character
+	 * of each range) in set order. For every other script the characters are shown one by
+	 * one in the order given by comparator.
+	 *
+	 * @param htmlOut HTML output
+	 * @param textOut text output
+	 * @param script script name; combined with title as the value source for the text listing
+	 * @param title category name; used as the CSS class, the link target and the visible label
+	 * @param unicodeset characters to show; may be null
+	 * @param scriptCode script index; selects the range layout for Han and Hangul
+	 * @param comparator ordering for the characters of all other scripts
+	 * @throws IllegalArgumentException if comparator treats two characters of the set as equal
+	 */
+	private  void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
+			String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) {
+		if (unicodeset == null)
+			return;
+		int size = unicodeset.size();
+		// Mark the cell right-to-left only when the set has some bidiR characters and no bidiL ones.
+		String dir = unicodeset.containsSome(bidiR)
+				&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
+		htmlOut.println("<tr><th class='" + title + "'><a href='#" +
+				title + "'>" + title + "</a> ("
+				+ TestData.nf.format(size) + ")</th></tr>");
+		htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
+		// <a href="#Atomic">categorization</a>
+		textOut.println();
+		textOut.println("# " + title);
+		bf.setValueSource(script + " ; " + title);
+		UnicodeSetIterator usi = new UnicodeSetIterator();
+		if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
+			// Range layout: one entry per range instead of one per character.
+			usi.reset(unicodeset);
+			while (usi.nextRange()) {
+				if (usi.codepoint == usi.codepointEnd) {
+					htmlOut.print(formatCode(UTF16
+							.valueOf(usi.codepoint)));
+				} else {
+					htmlOut.print(formatCode(UTF16
+							.valueOf(usi.codepoint))
+							+ ".. "
+							+ formatCode(UTF16
+									.valueOf(usi.codepointEnd)));
+				}
+			}
+			bf.showSetNames(textOut, unicodeset);
+		} else {
+			// Sort the individual characters with the supplied comparator.
+			Set reordered = new TreeSet(comparator);
+			usi.reset(unicodeset);
+			while (usi.next()) {
+				String x = usi.getString();
+				boolean foo = reordered.add(x);
+				// add() returning false means the comparator considers x equal to an earlier
+				// entry, which would silently drop x from the output.
+				if (!foo)
+					throw new IllegalArgumentException("Collision with "
+							+ Default.ucd().getCodeAndName(x));
+			}
+			for (Iterator it = reordered.iterator(); it.hasNext();) {
+				Object key = it.next();
+				htmlOut.print(formatCode((String)key));
+			}
+			bf.showSetNames(textOut, reordered);
+		}
+		htmlOut.println("</td></tr>");
+	}
+
+	/**
+	 * Wraps a character in an HTML span whose title is its code and name.
+	 * The text is padded with no-break spaces on both sides; when the first code point
+	 * has category Me or Mn the padding is doubled and a dotted circle (U+25CC) is
+	 * placed directly before the text.
+	 *
+	 * @param string the text to show; its first code point decides the padding
+	 * @return the span markup followed by a single space
+	 */
+	private String formatCode(String string) {
+		int cat = ucd.getCategory(UTF16.charAt(string,0));
+		boolean isMark = cat == Me || cat == Mn;
+		String leading = isMark ? "\u00A0\u00A0\u25cc" : "\u00A0";
+		String trailing = isMark ? "\u00A0\u00A0" : "\u00A0";
+		StringBuffer html = new StringBuffer();
+		html.append("<span title='").append(ucd.getCodeAndName(string)).append("'>");
+		html.append(leading);
+		html.append(BagFormatter.toHTMLControl.transliterate(string));
+		html.append(trailing);
+		html.append("</span> ");
+		return html.toString();
+	}
+}
--- a/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html
+++ b/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html
@ -0,0 +1,153 @@
+<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+<html>
+
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<meta http-equiv="Content-Language" content="en-us">
+<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
+<meta name="ProgId" content="FrontPage.Editor.Document">
+<meta name="keywords" content="unicode, named sequences">
+<meta name="description" content="Describes and displays named sequences">
+<title>Named Sequences</title>
+<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
+<style>
+<!--
+.copy	{ text-align: center; font-size: 150% }
+th, td	{ vertical-align: middle }
+tt		{ font-size: 8pt }
+table	{ padding: 2pt }
+-->
+</style>
+</head>
+
+<body bgcolor="#ffffff">
+
+<table class="header">
+  <tr>
+    <td class="icon"><a href="http://www.unicode.org">
+    <img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode 
+    Character Database</a></td>
+  </tr>
+  <tr>
+    <td class="gray">&nbsp;</td>
+  </tr>
+</table>
+<div style="margin:1em">
+  <table border="1" cellpadding="0" cellspacing="1" style="border-collapse: collapse" bordercolor="#111111" width="100%" id="AutoNumber1">
+    <tr>
+      <td width="100%">
+      <p style="text-align: right">L2-XXX</p>
+      <p><i>To: UTC<br>
+      From: Mark Davis<br>
+      Date: 2005-04-28</i></p>
+      <p><i>One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html, 
+      following the pattern of StandardizedVariants.html. This document was generated along those 
+      lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable 
+      style modifications, of course) as a chart someplace accessible under
+      <a href="http://unicode.org/charts/">http://unicode.org/charts/</a>.</i></p>
+      <p><i>Alternatively, we could also combine this with the StandardizedVariants.html to provide 
+      a unified chart of sequences, again someplace under <a href="http://unicode.org/charts/">
+      http://unicode.org/charts/</a>.</i></p>
+      <p><i><b>Note:</b> we don&#39;t have some of the glyphs quite right yet, but it should be 
+      sufficient for discussing the format. One of the innovations is having a separate column of 
+      text for copy&amp;paste; that also needs discussion.</i></td>
+    </tr>
+  </table>
+  <h1><i><font color="#990000">&nbsp;PROPOSED WORKING DRAFT<br>
+  </font></i>Named Sequences</h1>
+  <table class="wide">
+    <tr>
+      <td valign="top" width="144">Revision</td>
+      <td valign="top">@revision@</td>
+    </tr>
+    <tr>
+      <td valign="top" width="144">Authors</td>
+      <td valign="top">Members of the Editorial Committee</td>
+    </tr>
+    <tr>
+      <td valign="top" width="144">Date</td>
+      <td valign="top">@date@</td>
+    </tr>
+    <tr>
+      <td valign="top" width="144">This Version</td>
+      <td valign="top">
+      <a href="http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html">
+      http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html</a></td>
+    </tr>
+    <tr>
+      <td valign="top" width="144">Previous Version</td>
+      <td valign="top">n/a</td>
+    </tr>
+    <tr>
+      <td valign="top" width="144">Latest Version</td>
+      <td valign="top">n/a</td>
+    </tr>
+  </table>
+  <h3><br>
+  <i>Summary</i></h3>
+  <blockquote>
+    <p>This file provides a visual display of the named sequences derived from NamedSequences.txt. <i>The 
+    proposal is to add this, </i></p>
+  </blockquote>
+  <h3><i>Status</i></h3>
+  <blockquote>
+    <p><i>The file and the files described herein are part of the
+    <a href="http://www.unicode.org/ucd">Unicode Character Database</a> (UCD) and are governed by 
+    the <a href="#Terms of Use">UCD Terms of Use</a> stated at the end.</i></p>
+  </blockquote>
+  <hr width="50%">
+  <h2>Introduction</h2>
+  <p>The tables here exhaustively list the valid, registered named sequences. The columns include a 
+  representative glyph, the sequence of code points in hex, and the name of the sequence. In 
+  addition, there is a last column entitled <i>Copyable</i>, which contains the literal text forming 
+  the sequence. That text can be copied and pasted elsewhere. The display of the text in this 
+  column is up to the capabilities of the browser and the set of available fonts. For more 
+  information, see <a href="http://www.unicode.org/help/display_problems.html">Display Problems?</a>.</p>
+  <blockquote>
+    <p><a name="fonts"><b>Note: </b></a>The representative glyphs used to show the named sequences 
+    are often derived from different physical fonts than the representative glyphs in the standard. 
+    They may therefore exhibit minor differences in size, proportion, style, or weight.</p>
+  </blockquote>
+  <p>@table@</p>
+  <hr width="50%">
+  <h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
+  <h3><i>Disclaimer</i></h3>
+  <blockquote>
+    <p><i>The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to 
+    fitness for any particular purpose. No warranties of any kind are expressed or implied. The 
+    recipient agrees to determine applicability of information provided. If this file has been 
+    purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be 
+    exchange of defective media within 90 days of receipt.</i></p>
+    <p><i>This disclaimer is applicable for all other data files accompanying the Unicode Character 
+    Database, some of which have been compiled by the Unicode Consortium, and some of which have 
+    been supplied by other sources.</i></p>
+  </blockquote>
+  <h3><i>Limitations on Rights to Redistribute This Data</i></h3>
+  <blockquote>
+    <p><i>Recipient is granted the right to make copies in any form for internal distribution and to 
+    freely use the information supplied in the creation of products supporting the Unicode<sup>TM</sup> 
+    Standard. The files in the Unicode Character Database can be redistributed to third parties or 
+    other organizations (whether for profit or not) as long as this notice and the disclaimer notice 
+    are retained. Information can be extracted from these files and used in documentation or 
+    programs, as long as there is an accompanying notice indicating the source.</i></p>
+  </blockquote>
+  <hr width="50%">
+  <div align="center">
+    <center>
+    <table cellspacing="0" cellpadding="0" border="0">
+      <tr>
+        <td><a href="http://www.unicode.org/unicode/copyright.html">
+        <img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
+      </tr>
+    </table>
+    <script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
+    </script>
+    </center>
+  </div>
+  <blockquote>
+  </blockquote>
+</div>
+
+</body>
+
+</html>
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/05/02 15:39:53 $
-* $Revision: 1.22 $
+* $Date: 2005/05/27 21:38:51 $
+* $Revision: 1.23 $
 *
 *******************************************************************************
 */
@ -46,8 +46,6 @@ public class TestData implements UCD_Types {
    
 	public static void main (String[] args) throws IOException {
 		//checkChars(false);
-		new GenStringPrep().genStringPrep();
-		if (true) return;
        
        System.out.println("main: " + Default.getDate());
        upf = ICUPropertyFactory.make();
@ -152,404 +150,6 @@ public class TestData implements UCD_Types {
 	}
 	Matcher m;
 	
-	static class GenStringPrep {
-		
-		UnicodeSet[] coreChars = new UnicodeSet[100];
-		UnicodeSet decomposable = new UnicodeSet();
-		UnicodeMap suspect = new UnicodeMap();
-		
-		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
-		//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
-		UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
-		UnicodeSet wordChars = new UnicodeSet();
-		{
-			if (false) {
-				wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
-				wordChars.retainAll(ups.getSet("gc=Sk"));
-			}
-			wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
-			" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
-			" \\u055A \\u02B9 \\u02BA]"));
-			//wordChars.removeAll(xid_continue);
-		}
-		
-		UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
-		UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
-		
-		UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
-		
-		//UnicodeSet[] decompChars = new UnicodeSet[100];
-		UCD ucd = Default.ucd();
-
-		Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
-		{
-			uca0.setStrength(Collator.IDENTICAL);
-		}
-		GenerateHanTransliterator.MultiComparator uca 
-			= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
-					uca0, new UTF16.StringComparator()});
-
-		UnicodeSet bidiR = new UnicodeSet(
-				"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
-
-		UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
-		UnicodeSet hasNoUpper = new UnicodeSet();
-		UnicodeSet hasNoUpperMinus = new UnicodeSet();
-		BagFormatter bf = new BagFormatter();
-		UnicodeSet inIDN = new UnicodeSet();
-
-		void genStringPrep() throws IOException {
-			//showScriptToBlock();
-			bf.setShowLiteral(BagFormatter.toHTMLControl);
-			//bf.setValueSource(UnicodeLabel.NULL);
-			if (false) {
-				
-				System.out.println("word chars: " + bf.showSetNames(wordChars));
-				System.out.println("pat: " + bf.showSetNames(patternProp));
-				System.out.println("xid: " + bf.showSetNames(not_xid_continue));
-			}
-			for (int cp = 0; cp <= 0x10FFFF; ++cp) {
-				Utility.dot(cp);
-				int cat = Default.ucd().getCategory(cp);
-				if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
-				if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
-				int idnaType = getIDNAType(cp);
-				idnaTypeSet[idnaType].add(cp);
-				String str = UTF16.valueOf(cp);
-				if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
-				int script = ucd.getScript(cp);
-				if (coreChars[script] == null)
-					coreChars[script] = new UnicodeSet();
-				coreChars[script].add(cp);
-			}
-			// fix characters with no uppercase
-			hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
-			System.out.println(bf.showSetNames(hasNoUpper));
-			
-			Utility.fixDot();
-			PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
-			PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
-			PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
-			textOut.println('\uFEFF');
-			textOut.println("For documentation, see idn-chars.html");
-			
-			Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut, 
-					new String[] {"%date%", Default.getDate()});
-			/*
-			out
-					.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
-			out.println("<title>IDN Characters</title><style>");
-			out.println("<!--");
-			out.println(".script       { font-size: 150%; background-color: #CCCCCC }");
-			out.println(".Atomic       { background-color: #CCCCFF }");
-			out.println(".Atomic-no-uppercase       { background-color: #CCFFCC }");
-			out.println(".Non-XID       { background-color: #FFCCCC }");
-			out.println(".Decomposable       { background-color: #FFFFCC }");
-			out.println(".Pattern_Syntax       { background-color: #FFCCFF }");
-			
-			out.println("th           { text-align: left }");
-			out.println("-->");
-			out.println("</style></head><body><table>");
-			*/
-			htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
-			htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
-
-			for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
-				if (scriptCode == COMMON_SCRIPT
-						|| scriptCode == INHERITED_SCRIPT)
-					continue;
-				showCodes(htmlOut, textOut, scriptCode, htmlOut2);
-			}
-			showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
-			showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
-			htmlOut.println("</table></body></html>");
-			htmlOut.close();
-			htmlOut2.println("</table></body></html>");
-			htmlOut2.close();
-			bf.setMergeRanges(false);
-
-			textOut.println();
-			textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
-			textOut.println();
-			bf.setValueSource("word-chars");
-			bf.showSetNames(textOut, wordChars);
-			
-			textOut.println();
-			textOut.println("# *** FOR REVIEW ***");
-			bf.setLabelSource(UnicodeLabel.NULL);
-			for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
-				textOut.println();
-				String value = (String)it.next();
-				bf.setValueSource(value);
-				bf.showSetNames(textOut, suspect.getSet(value));
-			}
-			textOut.close();
-		}
-		
-		/**
-		 * 
-		 */
-		private void showScriptToBlock() {
-			UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
-			UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
-			UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
-				public Object compose(Object a, Object b) {
-					return a + "\t" + b;
-				}
-			};
-			UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
-			for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
-				System.out.println(it.next());
-			}
-			throw new IllegalArgumentException();
-		}
-		
-		Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
-		
-		static String[][] script_to_gif = {
-				
-			{"Common","common.gif"}, //Miscellaneous_Symbols
-			{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
-			{"Arabic","arabic.gif"}, //Arabic
-			{"Armenian","armenian.gif"}, //Armenian
-			{"Bengali","bengali.gif"}, //Bengali
-			{"Bopomofo","bopomofo.gif"}, //Bopomofo
-			{"Braille","braillesymbols.gif"}, //Braille_Patterns
-			{"Buginese","buginese.gif"}, //Buginese
-			{"Buhid","buhid.gif"}, //Buhid
-			{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
-			{"Cherokee","cherokee.gif"}, //Cherokee
-			{"Coptic","coptic.gif"}, //Coptic
-			{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
-			{"Cyrillic","cyrillic.gif"}, //Cyrillic
-			{"Deseret","deseret.gif"}, //Deseret
-			{"Devanagari","devanagari.gif"}, //Devanagari
-			{"Ethiopic","ethiopic.gif"}, //Ethiopic
-			{"Georgian","georgian.gif"}, //Georgian
-			{"Glagolitic","glagolitic.gif"}, //Glagolitic
-			{"Gothic","gothic.gif"}, //Gothic
-			{"Greek","greek.gif"}, //Greek_and_Coptic
-			{"Gujarati","gujarati.gif"}, //Gujarati
-			{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
-			{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
-			{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
-			{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
-			{"Hanunoo","hanunoo.gif"}, //Hanunoo
-			{"Hebrew","hebrew.gif"}, //Hebrew
-			{"Hiragana","hiragana.gif"}, //Hiragana
-			{"Kannada","kannada.gif"}, //Kannada
-			{"Katakana","katakana.gif"}, //Katakana
-			{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
-			{"Khmer","khmer.gif"}, //Khmer
-			{"Lao","lao.gif"}, //Lao
-			{"Latin","latin.gif"}, //Basic_Latin
-			{"Limbu","limbu.gif"}, //Limbu
-			{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
-			{"Malayalam","malayalam.gif"}, //Malayalam
-			{"Mongolian","mongolian.gif"}, //Mongolian
-			{"Myanmar","myanmar.gif"}, //Myanmar
-			{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
-			{"Ogham","ogham.gif"}, //Ogham
-			{"Old_Italic","olditalic.gif"}, //Old_Italic
-			{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
-			{"Oriya","oriya.gif"}, //Oriya
-			{"Osmanya","osmanya.gif"}, //Osmanya
-			{"Runic","runic.gif"}, //Runic
-			{"Shavian","shavian.gif"}, //Shavian
-			{"Sinhala","sinhala.gif"}, //Sinhala
-			{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
-			{"Syriac","syriac.gif"}, //Syriac
-			{"Tagalog","tagalog.gif"}, //Tagalog
-			{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
-			{"Tai_Le","taile.gif"}, //Tai_Le
-			{"Tamil","tamil.gif"}, //Tamil
-			{"Telugu","telugu.gif"}, //Telugu
-			{"Thaana","thaana.gif"}, //Thaana
-			{"Thai","thai.gif"}, //Thai
-			{"Tibetan","tibetan.gif"}, //Tibetan
-			{"Tifinagh","tifinagh.gif"}, //Tifinagh
-			{"Ugaritic","ugaritic.gif"}, //Ugaritic
-			{"Yi","yi.gif"}, //Yi_Syllables
-
-		};
-		
-		UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
-		{
-			for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
-		}
-		static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
-		/**
-		 * 
-		 */
-		private int getIDNAType(int cp) {
-			inbuffer.setLength(0);
-			UTF16.append(inbuffer, cp);
-			try {
-				intermediate = IDNA.convertToASCII(inbuffer,
-						IDNA.DEFAULT); // USE_STD3_RULES
-				if (intermediate.length() == 0)
-					return DELETED;
-				outbuffer = IDNA.convertToUnicode(intermediate,
-						IDNA.USE_STD3_RULES);
-			} catch (StringPrepParseException e) {
-				return ILLEGAL;
-			} catch (Exception e) {
-				System.out.println("Failure at: " + Utility.hex(cp));
-				return ILLEGAL;
-			}
-			if (!TestData.equals(inbuffer, outbuffer))
-				return REMAPPED;
-			return OK;
-		}
-		StringBuffer inbuffer = new StringBuffer();
-		StringBuffer intermediate, outbuffer;
-
-		UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
-
-		/**
-		 * @param htmlOut
-		 * @param textOut TODO
-		 * @param scriptCode
-		 * @param htmlOut2 TODO
-		 * @param ucd
-		 * @param coreChars
-		 * @param decompChars
-		 */
-		private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
-			if (coreChars[scriptCode] == null) return;
-			String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
-			script = Utility.getUnskeleton(script.toLowerCase(),true);
-			System.out.println(script);
-			
-			htmlOut.println();
-			String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
-			+ "'> Script: " + script + "</th></tr>";
-			htmlOut.println(scriptLine);
-			htmlOut2.println(scriptLine);
-			textOut.println();
-			textOut.println("#*** Script: " + script + " ***");
-			UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
-			
-			UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
-			UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
-			UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
-			
-			UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
-			UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
-			
-			UnicodeSet decomp = extract(decomposable, core);
-			UnicodeSet pattern = extract(patternProp, core);
-			UnicodeSet non_id = extract(not_xid_continue, core);
-			
-			UnicodeSet bicameralNoupper = new UnicodeSet();
-			if (!hasNoUpper.containsAll(core)) {
-				bicameralNoupper = extract(hasNoUpperMinus, core);
-			}
-
-			UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
-			for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
-				String cat = Default.ucd().getCategoryID(it.codepoint);
-				String name = Default.ucd().getName(it.codepoint);
-				if (name.indexOf("MUSICAL SYMBOL") >= 0 
-						|| name.indexOf("DINGBA") >= 0 
-						|| name.indexOf("RADICAL ") >= 0 
-						 						) cat = "XX";
-				suspect.put(it.codepoint, cat);
-			}
-			
-			if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
-			if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
-			if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
-			if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
-			if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);
-
-			if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
-			if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
-			if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
-			if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
-			if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
-		}
-
-		/**
-		 * 
-		 */
-		private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
-			UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
-			core.removeAll(decomp);
-			return decomp;
-		}
-
-		/**
-		 * @param htmlOut
-		 * @param textOut TODO
-		 * @param script TODO
-		 * @param unicodeset
-		 * @param scriptCode
-		 * @param uca
-		 */
-		private  void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
-				String script, String title, UnicodeSet unicodeset, int scriptCode) {
-			if (unicodeset == null)
-				return;
-			int size = unicodeset.size();
-			String dir = unicodeset.containsSome(bidiR)
-					&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
-			htmlOut.println("<tr><th class='" + title + "'><a href='#" +
-					title + "'>" + title + "</a> ("
-					+ nf.format(size) + ")</th></tr>");
-			htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
-			// <a href="#Atomic">categorization</a>
-			textOut.println();
-			textOut.println("# " + title);
-			bf.setValueSource(script + " ; " + title);
-			UnicodeSetIterator usi = new UnicodeSetIterator();
-			if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
-				usi.reset(unicodeset);
-				while (usi.nextRange()) {
-					if (usi.codepoint == usi.codepointEnd) {
-						htmlOut.print(formatCode(UTF16
-								.valueOf(usi.codepoint)));
-					} else {
-						htmlOut.print(formatCode(UTF16
-								.valueOf(usi.codepoint))
-								+ ".. "
-								+ formatCode(UTF16
-										.valueOf(usi.codepointEnd)));
-					}
-				}
-				bf.showSetNames(textOut, unicodeset);
-			} else {
-				Set reordered = new TreeSet(uca);
-				usi.reset(unicodeset);
-				while (usi.next()) {
-					String x = usi.getString();
-					boolean foo = reordered.add(x);
-					if (!foo)
-						throw new IllegalArgumentException("Collision with "
-								+ Default.ucd().getCodeAndName(x));
-				}
-				for (Iterator it = reordered.iterator(); it.hasNext();) {
-					Object key = it.next();
-					htmlOut.print(formatCode((String)key));
-				}
-				bf.showSetNames(textOut, reordered);
-			}
-			htmlOut.println("</td></tr>");
-		}
-
-		/**
-		 * @param string
-		 * @return
-		 */
-		private String formatCode(String string) {
-			int cat = ucd.getCategory(UTF16.charAt(string,0));
-			return "<span title='" + ucd.getCodeAndName(string) + "'>"
-			+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
-			+ BagFormatter.toHTMLControl.transliterate(string)
-			+ " </span>";
-		}
-	}
-	
 	/**
 	 * @param inbuffer
 	 * @param outbuffer
--- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
@ -240,7 +240,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
 				{"Control", "CN"},
 				{"Extend", "EX"},
 				{"Other", "XX"},
-		}).swapFirst2ValueAliases());
+		}, true).swapFirst2ValueAliases());

        add(new UnicodeProperty.UnicodeMapProperty() {
        	{
@ -283,7 +283,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
 				{"Numeric", "NU"},
 				{"ExtendNumLet", "EX"},
 				{"Other", "XX"},
-		}).swapFirst2ValueAliases());
+		}, true).swapFirst2ValueAliases());

        add(new UnicodeProperty.UnicodeMapProperty() {
        	{
@ -335,7 +335,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
 				{"STerm", "ST"},
 				{"Close", "CL"},
 				{"Other", "XX"},
-		}).swapFirst2ValueAliases());
+		}, false).swapFirst2ValueAliases());
    }
    
    static String[] YES_NO_MAYBE = {"N", "M", "Y"};
--- a/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt
@ -0,0 +1,34 @@
+# Confusables.txt
+# Generated: %date%, MED
+# This is a draft list of visually confusable characters, for use in conjunction with the
+# recommendations in http://www.unicode.org/reports/tr36/
+#
+# To fold using this list, first perform NFKD (if not already performed),
+# then map each source character to the target character(s), then perform NFKD again.
+#
+# The format is the standard Unicode semicolon-delimited hex.
+# <source> ; <target> ; <internal_info> # <comment>
+#
+# The characters may be visually distinguishable in many fonts, or at larger sizes.
+# Some anomalies are also introduced by 'closure'. That is, there may be a sequence of
+# characters where each is visually confusable with the next, but the start and end are
+# visually distinguishable. But when the set is closed, these will all map together.
+#
+# This is unlike normalization data. There may be no connection between characters other
+# than visual confusability. This data should not be used except in assessing visual confusability.
+#
+# This list is not limited to Unicode Identifier characters (XID_Continue) although the primary
+# application will be to such characters. It is also not limited to lowercase characters,
+# although the recommendations are to lowercase for security.
+#
+# Note that some characters have unusual characteristics, and are not yet accounted for.
+# For example, U+302E (?) HANGUL SINGLE DOT TONE MARK and U+302F (?) HANGUL DOUBLE DOT TONE MARK
+# appear to the left of the previous character. So what looks like "a:b" can actually be "ab\u302F".
+#
+# WARNING: The data is not final; it is very draft at this point, put together from different
+# sources that need to be reviewed for accuracy and completeness of the mappings. 
+# There are still clear errors in the data; do not use this in any implementations.
+# Ignore the internal_info field; it will be removed.
+#
+# Thanks especially to Eric van der Poel for collecting information about fonts using shared glyphs.
+# =================================
--- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
+++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
@ -86,6 +86,10 @@ Within each subcategory characters are sorted according to the default
    </tr>
  </table>
 </blockquote>
+<p>Characters that are normally invisible are represented in the chart by their Unicode number, such as "U+FE00".</p>
+<p>At the end of this document, there is an additional section that lists all <a href='#Visible_Combining_Marks_0'>visible non-spacing marks</a>.
+These are sorted first by combining character class (modified), then by script, then by code point.</p>
+<p>For comparison of Indic characters, see <a href='indic-trans.html'>indic-trans.html</a>.</p>
 <h3>Additional <a name="Word_Characters">Word Characters</a></h3>
 <p>This is a draft list of characters based on <i>Section 4 Word Boundaries</i> of
 <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the 
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2005/03/30 17:19:32 $
-* $Revision: 1.48 $
+* $Date: 2005/05/27 21:39:03 $
+* $Revision: 1.49 $
 *
 *******************************************************************************
 */
@ -336,6 +336,10 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
    }

    public static String fromHex(String p) {
+    	return fromHex(p, false);
+    }
+    
+    public static String fromHex(String p, boolean acceptChars) {
        StringBuffer output = new StringBuffer();
        int value = 0;
        int count = 0;
@ -357,13 +361,31 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
                default:
                    int type = Character.getType(ch);
                    if (type != Character.SPACE_SEPARATOR) {
+                    	if (acceptChars) {
+                            if (count >= 4 && count <= 6) {
+                                UTF32.append32(output, value);
+                                count = 0;
+                                value = 0;
+                            } else if (count != 0) {
+                            	output.append(p.substring(i-count, i)); // TODO fix supplementary characters
+                            }
+                            UTF32.append32(output, ch);
+                            continue main;
+                   		
+                    	}
                        throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
                            new Object[] {String.valueOf(ch), new Integer(i), p});
                    }
                    // fall through!!
                case ' ': case ',': case ';': // do SPACE here, just for speed
                    if (count != 0) {
-                        UTF32.append32(output, value);
+                    	if (count < 4 || count > 6) {
+                    		if (acceptChars) output.append(p.substring(i-count, i));
+                    		else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
+                                    new Object[] {String.valueOf(ch), new Integer(i), p});
+                    	} else {
+                    		UTF32.append32(output, value);
+                    	}
                    }
                    count = 0;
                    value = 0;
@ -378,7 +400,13 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
            count++;
        }
        if (count != 0) {
-            UTF32.append32(output, value);
+           	if (count < 4 || count > 6) {
+           		if (acceptChars) output.append(p.substring(p.length()-count, p.length()));
+        		else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
+                        new Object[] {"EOS", new Integer(p.length()), p});
+        	} else {
+        		UTF32.append32(output, value);
+        	}
        }
        return output.toString();
    }