ICU-0 U4.1

X-SVN-Rev: 17421
2005-03-30 17:19:32 +00:00 · 2005-03-30 17:19:32 +00:00 · 31eafca234
commit 31eafca234
parent 98a1c52e09
8 changed files with 441 additions and 257 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
-* $Date: 2004/04/17 18:21:39 $
-* $Revision: 1.12 $
+* $Date: 2005/03/30 17:19:32 $
+* $Revision: 1.13 $
 *
 *******************************************************************************
 */
@ -17,6 +17,7 @@ import java.util.*;
 import java.io.*;

 import com.ibm.text.utility.*;
+import com.ibm.icu.dev.test.util.UnicodeProperty;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

@ -30,6 +31,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
    
    OldUnicodeMap sampleMap = null;
    OldUnicodeMap map = new OldUnicodeMap();
+    UnicodeProperty prop;
   
    // ====================== Main ===========================
    
@ -46,6 +48,34 @@ abstract public class GenerateBreakTest implements UCD_Types {
        this.ucd = ucd;
        nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
        nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
+        	/*
+        public void fillMap(String propName) {
+        	List list = y.getAvailableValues();
+        	for (Iterator it = list.iterator(); it.hasNext();) {
+        		String label = (String) it.next();
+        		map.add(label, y.getSet(label));
+        	}
+        }
+        */
+    }
+    
+	ToolUnicodePropertySource unicodePropertySource = ToolUnicodePropertySource.make("");
+
+	Set labels = new HashSet();
+    
+    /**
+     * Records {@code label} as used and adds the set of code points that the
+     * current property ({@code prop}) assigns to that value into {@code map}.
+     *
+     * @param label a value name of the current property
+     * @return whatever {@code map.add} returns for the new entry
+     *         (presumably its index in the map; verify against OldUnicodeMap)
+     * @throws IllegalArgumentException if the property has no set for
+     *         {@code label}, or the set is empty
+     */
+    int addToMap(String label) {
+    	// The label is remembered even when it turns out to be bad.
+    	labels.add(label);
+    	final UnicodeSet valueSet = prop.getSet(label);
+    	final boolean unusable = valueSet == null || valueSet.size() == 0;
+    	if (unusable) {
+    		throw new IllegalArgumentException("Bad value: " + prop.getName() + ", " + label);
+    	}
+    	return map.add(label, valueSet);
+    }
+
+    /**
+     * Adds the final value of the current property and then checks that the
+     * labels added so far match the values the property reports.
+     *
+     * @param label the last property value name to add
+     * @return the value returned by {@code addToMap(label)}
+     * @throws IllegalArgumentException if the set of added labels differs from
+     *         {@code prop.getAvailableValues()}; the message lists the available
+     *         values that were never added
+     */
+    int addToMapLast(String label) {
+    	int result = addToMap(label);
+    	Set values = new HashSet(prop.getAvailableValues());
+    	if (!values.equals(labels)) {
+    		// Set.removeAll returns a boolean, so build the difference first and
+    		// report the resulting set instead of the call's return value.
+    		Set missing = new HashSet(values);
+    		missing.removeAll(labels);
+    		throw new IllegalArgumentException("Missing Property Values: " + prop.getName()
+    				+ ": " + missing);
+    	}
+		return result;
    }

    // COMMON STUFF for Hangul
@ -280,24 +310,30 @@ abstract public class GenerateBreakTest implements UCD_Types {
        //printLine(out, samples[LB_ZW], "", samples[LB_CL]);
        //printLine(out, samples[LB_ZW], " ", samples[LB_CL]);

-        PrintWriter out = Utility.openPrintWriter("TR29\\" 
+        UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("auxiliary\\", fileName + "BreakTest");
+        PrintWriter out = fc.out;
+
+/*        PrintWriter out = Utility.openPrintWriter("auxiliary\\" 
            + fileName + "BreakTest-"
            + ucd.getVersion()
            + ".html", Utility.UTF8_WINDOWS);
+*/        
+        out.println("<!doctype HTML PUBLIC '-//W3C//DTD HTML 4.0 Transitional//EN' 'http://www.w3.org/TR/REC-html40/loose.dtd'>");
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        out.println("<title>" + fileName + " Break Chart</title>");
-        out.println("<style>");
+        out.println("<style type='text/css'>");
        out.println("td, th { vertical-align: top }");
        out.println("</style></head>");


        out.println("<body bgcolor='#FFFFFF'>");
        out.println("<h2>" + fileName + " Break Chart</h2>");
-        out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "; <b>Date:</b> " + ucd.getDate() + "</p>");
+        out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
+        out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
        generateTable(out);
        

-        if (sampleMap != null) {
+        if (false) {
            out.println("<h3>Character Type Breakdown</h3>");
            out.println("<table border='1' cellspacing='0' width='100%'>");
            for (int i = 0; i < sampleMap.size(); ++i) {
@ -308,7 +344,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            out.println("</table>");
        }

-        out.close();
+        fc.close();
        
        generateTest(false);
        
@ -318,14 +354,18 @@ abstract public class GenerateBreakTest implements UCD_Types {
        String[] testCase = new String[50];
        // do main test

-        PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest" 
+        UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("auxiliary\\", fileName + "BreakTest" 
+                + (shortVersion ? "_SHORT" : ""));
+        PrintWriter out = fc.out;
+/*        PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest" 
            + (shortVersion ? "_SHORT" : "")
            + "-" + ucd.getVersion()
            + ".txt", Utility.UTF8_WINDOWS);
+*/        
        int counter = 0;

+        out.println("#");
        out.println("# Default " + fileName + " Break Test");
-        out.println("# Generated: " + ucd.getDate() + ", MED");
        out.println("#");
        out.println("# Format:");
        out.println("# <string> (# <comment>)? ");
@ -361,7 +401,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            printLine(out, extraSingleSamples[ii], true, false);
        }
        out.println("# Lines: " + counter);
-        out.close();
+        fc.close();
    }

    public void sampleDescription(PrintWriter out) {} // intentionally empty here: writes nothing to out
@ -461,7 +501,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            if (after == null) continue;

            String h = getTypeID(after);
-            types += "<th " + width + " title='" + getInfo(after) + "'><a class='lbclass' href='#" + h + "'>" + h + "</th>";
+            types += "<th " + width + " class='lbclass' title='" + getInfo(after) + "'>" + h + "</th>";
            
            
            //codes += "<th " + width + " title='" + getInfo(after) + "'>" + Utility.hex(after) + "</th>";
@ -480,8 +520,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            if (before == null) continue;

            String h = getTypeID(before);
-            String line = "<tr><th title='" + ucd.getCodeAndName(before) + "'><a class='lbclass' href='#" + h + "'>" 
-                + h + "</th>";
+            String line = "<tr><th class='lbclass' title='" + ucd.getCodeAndName(before) + "'>" + h + "</th>";

            for (int type2 = 0; type2 < tableLimit; ++type2) {
                
@ -555,7 +594,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
        String status;
        if (html) {
            status = hasBreak ? " style='border-right: 1px solid blue'" : "";
-            string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;<span>");
+            string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;</span>");
        } else {
            status = hasBreak ? BREAK : NOBREAK;
            string.append(status);
@ -574,7 +613,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
                    + "'>"
                    + Utility.quoteXML(Utility.getDisplay(cp), true)
                    + "</span>");
-                string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;<span>");
+                string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;</span>");
            } else {
                if (string.length() > 0) {
                    string.append(' ');
@ -743,28 +782,23 @@ abstract public class GenerateBreakTest implements UCD_Types {

        GenerateGraphemeBreakTest(UCD ucd) {
            super(ucd);
-            fileName = "GraphemeCluster";
+            fileName = "Grapheme";
            sampleMap = map;
        }

-        
+        Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break");
+
        final int
-            CR =    map.add("CR",    new UnicodeSet(0xD, 0xD)),
-            LF =    map.add("LF",    new UnicodeSet(0xA, 0xA)),
-            Control = map.add("Control", 
-                        getSet(ucd, CATEGORY, Cc)
-                .addAll(getSet(ucd, CATEGORY, Cf))
-                .addAll(getSet(ucd, CATEGORY, Zp))
-                .addAll(getSet(ucd, CATEGORY, Zl))
-                .removeAll(map.getSetFromIndex(CR))
-                .removeAll(map.getSetFromIndex(LF))),
-            Extend = map.add("Extend", getSet(ucd, DERIVED, GraphemeExtend)),
-            L =     map.add("L",     getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.L)),
-            V =     map.add("V",     getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.V)),
-            T =     map.add("T",     getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.T)),
-            LV =    map.add("LV",    getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LV)),
-            LVT =   map.add("LVT",   getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LVT)),
-            Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);            
+            CR =    addToMap("CR"),
+            LF =    addToMap("LF"),
+            Control = addToMap("Control"),
+            Extend = addToMap("Extend"),
+            L =     addToMap("L"),
+            V =     addToMap("V"),
+            T =     addToMap("T"),
+            LV =    addToMap("LV"),
+            LVT =   addToMap("LVT"),
+            Other = addToMapLast("Other");            
                
        // stuff that subclasses need to override
        public String getTypeID(int cp) {
@ -860,35 +894,23 @@ abstract public class GenerateBreakTest implements UCD_Types {

        }
        
+        Object foo = prop = unicodePropertySource.getProperty("Word_Break");
+
        //static String LENGTH = "[\u30FC\uFF70]";
        //static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]";
        //static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
        //static String HIRAGANA_ITERATION = "[\u309D\u309E]";
        
        final int
-            Format =    map.add("Format",    getSet(ucd, CATEGORY, Cf).remove(0x00AD)),
-            Katakana =    map.add("Katakana",    getSet(ucd, SCRIPT, KATAKANA_SCRIPT)
-                .addAll(new UnicodeSet("[\u30FC\uFF70\uFF9E\uFF9F]"))
-                //.addAll(new UnicodeSet(HALFWIDTH_KATAKANA))
-                //.addAll(new UnicodeSet(KATAKANA_ITERATION))
-                ),
-            ALetter = map.add("ALetter", 
-                        getSet(ucd, DERIVED, PropAlphabetic)
-                .add(0x05F3, 0x05F3)
-                .removeAll(map.getSetFromIndex(Katakana))
-                .removeAll(getSet(ucd, BINARY_PROPERTIES, Ideographic))
-                .removeAll(getSet(ucd, SCRIPT, THAI_SCRIPT))
-                .removeAll(getSet(ucd, SCRIPT, LAO_SCRIPT))
-                .removeAll(getSet(ucd, SCRIPT, HIRAGANA_SCRIPT))
-                ),
-            MidLetter = map.add("MidLetter", 
-                new UnicodeSet("[\\u0027\\u00AD\\u00B7\\u05f4\\u05F4\\u2019\\u2027]")),
-            MidNumLet =     map.add("MidNumLet",
-                new UnicodeSet("[\\u002E\\u003A]")),
-            MidNum =     map.add("MidNum",     getSet(ucd, LINE_BREAK, LB_IN)
-                .removeAll(map.getSetFromIndex(MidNumLet))),
-            Numeric =     map.add("Numeric",     getSet(ucd, LINE_BREAK, LB_NU)),
-            Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);      
+            Format =    addToMap("Format"),
+            Katakana =    addToMap("Katakana"),
+            ALetter = addToMap("ALetter"),
+            MidLetter = addToMap("MidLetter"),
+            //MidNumLet =     addToMap("MidNumLet"),
+            MidNum =     addToMap("MidNum"),
+            Numeric =     addToMap("Numeric"),
+            ExtendNumLet =     addToMap("ExtendNumLet"),
+            Other = addToMapLast("Other");      

        // stuff that subclasses need to override
        public String getTypeID(int cp) {
@ -948,11 +970,11 @@ abstract public class GenerateBreakTest implements UCD_Types {

            // Don’t break letters across certain punctuation

-            setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
-            if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
+            setRule("6: ALetter × MidLetter ALetter");
+            if (before == ALetter && after == MidLetter && after2 == ALetter) return false;

            setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
-            if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
+            if (before2 == ALetter && before == MidLetter && after == ALetter) return false;

            // Don’t break within sequences of digits, or digits adjacent to letters.

@ -968,15 +990,22 @@ abstract public class GenerateBreakTest implements UCD_Types {

            // Don’t break within sequences like: '-3.2'
            setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
-            if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false;
+            if (before2 == Numeric && before == MidNum && after == Numeric) return false;

            setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
-            if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false;
+            if (before == Numeric && after == MidNum && after2 == Numeric) return false;

            // Don't break between Katakana

            setRule("13: Katakana × Katakana");
            if (before == Katakana && after == Katakana) return false;
+            
+            // Do not break from extenders
+            setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet)  	×  	ExtendNumLet");
+            if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) && after == ExtendNumLet) return false;
+
+            setRule("13b: ExtendNumLet 	× 	(ALetter | Numeric | Katakana)");
+            if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) return false;

            // Otherwise break always.
            setRule("14: Any ÷ Any");
@ -1344,7 +1373,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            setRule("9: OP SP* ×");
            if (lastNonSpace == LB_OP) return false;

-            // LB 10  Don’t break within ‘”[’, , even with intervening spaces.
+            // LB 10  Don’t break within ‘”[’, , even with intervening spaces.
            // QU SP* × OP
            setRule("10: QU SP* × OP");
            if (lastNonSpace == LB_QU && after == LB_OP) return false;
@ -1377,7 +1406,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            setRule("13: × GL ; GL ×");
            if (after == LB_GL || before == LB_GL) return false;

-            // LB 14  Don’t break before or after ‘”’
+            // LB 14  Don’t break before or after ‘”’
            setRule("14: × QU ; QU ×");
            if (before == LB_QU || after == LB_QU) return false;

@ -1450,7 +1479,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            if (before == LB_HY) return true;
            if (after == LB_BB) return true;

-            // LB 19  Don’t break between alphabetics (“at”)
+            // LB 19  Don’t break between alphabetics (“at”)
            // AL × AL

            setRule("19: AL × AL");
@ -1515,36 +1544,20 @@ abstract public class GenerateBreakTest implements UCD_Types {

        }

+        Object foo = prop = unicodePropertySource.getProperty("Sentence_Break");

        final int
-            Sep =    map.add("Sep",    new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]")),
-            Format =    map.add("Format",    getSet(ucd, CATEGORY, Cf)),
-            Sp = map.add("Sp", getSet(ucd, BINARY_PROPERTIES, White_space)
-                .removeAll(map.getSetFromIndex(Sep))),
-            Lower = map.add("Lower", getSet(ucd, DERIVED, PropLowercase)),
-            Upper = map.add("Upper", getSet(ucd, CATEGORY, Lt)
-                .addAll(getSet(ucd, DERIVED, PropUppercase))),
-            OLetter = map.add("OLetter", 
-                        getSet(ucd, DERIVED, PropAlphabetic)
-                .add(0x05F3, 0x05F3)
-                .removeAll(map.getSetFromIndex(Lower))
-                .removeAll(map.getSetFromIndex(Upper))
-                ),
-            Numeric =     map.add("Numeric",     getSet(ucd, LINE_BREAK, LB_NU)),
-            ATerm =     map.add("ATerm", new UnicodeSet(0x002E,0x002E)),
-            Term =    map.add("Term", new UnicodeSet(
-                "[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367"
-                + "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049"
-                + "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]")),
-            Close =     map.add("Close",     
-                getSet(ucd, CATEGORY, Po)
-                .addAll(getSet(ucd, CATEGORY, Pe))
-                .addAll(getSet(ucd, LINE_BREAK, LB_QU))
-                .removeAll(map.getSetFromIndex(ATerm))
-                .removeAll(map.getSetFromIndex(Term))
-                .remove(0x05F3)
-                ),
-            Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);            
+            Sep =    addToMap("Sep"),
+            Format =    addToMap("Format"),
+            Sp = addToMap("Sp"),
+            Lower = addToMap("Lower"),
+            Upper = addToMap("Upper"),
+            OLetter = addToMap("OLetter"),
+            Numeric =     addToMap("Numeric"),
+            ATerm =     addToMap("ATerm"),
+            STerm =    addToMap("STerm"),
+            Close =     addToMap("Close"),
+            Other = addToMapLast("Other");            
                
        // stuff that subclasses need to override
        public String getTypeID(int cp) {
@ -1726,8 +1739,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
                }
                if (t == ATerm) {
                    lookAfter = ATerm;
-                } else if (t == Term) {
-                    lookAfter = Term;
+                } else if (t == STerm) {
+                    lookAfter = STerm;
                }
                break;
            }
@ -1776,7 +1789,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
                        setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
                        return false;
                    }
-                    if (lookAfter == Term) break;
+                    if (lookAfter == STerm) break;
                }
                    
                // at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
@ -34,6 +34,10 @@ import com.ibm.icu.text.SymbolTable;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeMatcher;
 import com.ibm.icu.text.UnicodeSet;
+import com.ibm.text.UCD.GenerateBreakTest.GenerateGraphemeBreakTest;
+import com.ibm.text.UCD.GenerateBreakTest.GenerateLineBreakTest;
+import com.ibm.text.UCD.GenerateBreakTest.GenerateSentenceBreakTest;
+import com.ibm.text.UCD.GenerateBreakTest.GenerateWordBreakTest;
 import com.ibm.text.UCD.MakeUnicodeFiles.Format.PrintStyle;
 import com.ibm.text.utility.UnicodeDataFile;
 import com.ibm.text.utility.Utility;
@ -511,6 +515,14 @@ public class MakeUnicodeFiles {
            GenerateCaseFolding.generateSpecialCasing(false);
        } else if (filename.equals("StandardizedVariants")) {
            GenerateStandardizedVariants.generate();
+        } else if (filename.equals("GraphemeBreakTest")) {
+            new GenerateGraphemeBreakTest(Default.ucd()).run();
+        } else if (filename.equals("WordBreakTest")) {
+            new GenerateWordBreakTest(Default.ucd()).run();
+        } else if (filename.equals("LineBreakTest")) {
+            new GenerateLineBreakTest(Default.ucd()).run();
+        } else if (filename.equals("SentenceBreakTest")) {
+           new GenerateSentenceBreakTest(Default.ucd()).run();
        } else {
            generatePropertyFile(filename);
        }
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
@ -1,5 +1,5 @@
-Generate:
-DeltaVersion: 13
+Generate: DerivedBidiClass
+DeltaVersion: 14
 CopyrightYear: 2005

 File: auxiliary/GraphemeBreakProperty
@ -14,6 +14,18 @@ File: auxiliary/SentenceBreakProperty
 Property: Sentence_Break
 Format:	skipValue=Other

+File: auxiliary/GraphemeBreakTest
+Property: SPECIAL
+
+File: auxiliary/WordBreakTest
+Property: SPECIAL
+
+File: auxiliary/LineBreakTest
+Property: SPECIAL
+
+File: auxiliary/SentenceBreakTest
+Property: SPECIAL
+
 File:	Blocks
 Property: Block
 # Note:   When comparing block names, casing, whitespace, hyphens,
@ -58,12 +70,14 @@ Value:	4.1
 File:	extracted/DerivedBidiClass
 Property:	Bidi_Class
 # Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
-# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either types R or AL.
+# Unlike other properties, unassigned code points in blocks 
+# reserved for right-to-left scripts are given either types R or AL.
 # The unassigned characters that default to R are:
-#   Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF
+#   Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF
+#   \uFB1D-\uFB4F \U00010840-\U000109FF \U00010A60-\U00010FFF
 # The unassigned characters that default to AL are:
-#   Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
-#   and the range \u0750-\u077F, minus the Noncharacter_Code_Points
+#   Arabic, Syriac, Arabic_Supplement, Thaana, Arabic_Presentation_Forms_A,
+#   Arabic_Presentation_Forms_B, minus the Noncharacter_Code_Points
 # For all other cases:
 Format:	valueStyle=short skipUnassigned=Left_To_Right

--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/03/26 05:40:05 $
-* $Revision: 1.19 $
+* $Date: 2005/03/30 17:19:32 $
+* $Revision: 1.20 $
 *
 *******************************************************************************
 */
@ -148,54 +148,58 @@ public class TestData implements UCD_Types {
 			log.close();
 		}
 	}
+	Matcher m;
 	
 	static class GenStringPrep {
 		UnicodeSet[] coreChars = new UnicodeSet[100];
 		UnicodeSet decomposable = new UnicodeSet();
-		UnicodeSet pattern = new UnicodeSet();
+		
 		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
 		//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
-		UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
+		UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())
+		.retainAll(ups.getSet("gc=Sk"))
+		.addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" +
+		" \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]"));
+		
+		UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
+		
+		UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars);
+		
 		//UnicodeSet[] decompChars = new UnicodeSet[100];
 		UCD ucd = Default.ucd();

-		Collator uca = Collator.getInstance(ULocale.ENGLISH);
+		Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
 		{
-			uca.setStrength(Collator.IDENTICAL);
+			uca0.setStrength(Collator.IDENTICAL);
 		}
+		GenerateHanTransliterator.MultiComparator uca 
+			= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
+					uca0, new UTF16.StringComparator()});

 		UnicodeSet bidiR = new UnicodeSet(
 				"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");

 		UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
 		UnicodeSet hasUpper = new UnicodeSet();
-
+		BagFormatter bf = new BagFormatter();
+		UnicodeSet inIDN = new UnicodeSet();

 		void genStringPrep() throws IOException {
-			//BagFormatter bf = new BagFormatter();
-			//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
-			StringBuffer inbuffer = new StringBuffer();
-			StringBuffer intermediate, outbuffer;
+			bf.setShowLiteral(BagFormatter.toHTMLControl);
+			//bf.setValueSource(UnicodeLabel.NULL);
+			if (false) {
+				
+				System.out.println("word chars: " + bf.showSetNames(wordChars));
+				System.out.println("pat: " + bf.showSetNames(patternProp));
+				System.out.println("xid: " + bf.showSetNames(not_xid_continue));
+			}
 			for (int cp = 0; cp <= 0x10FFFF; ++cp) {
 				Utility.dot(cp);
+				int cat = Default.ucd().getCategory(cp);
+				if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
 				if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
-				inbuffer.setLength(0);
-				UTF16.append(inbuffer, cp);
-				try {
-					intermediate = IDNA.convertToASCII(inbuffer,
-							IDNA.USE_STD3_RULES);
-					if (intermediate.length() == 0)
-						continue;
-					outbuffer = IDNA.convertToUnicode(intermediate,
-							IDNA.USE_STD3_RULES);
-				} catch (StringPrepParseException e) {
-					continue;
-				} catch (Exception e) {
-					System.out.println("Failure at: " + Utility.hex(cp));
-					continue;
-				}
-				if (!TestData.equals(inbuffer, outbuffer))
-					continue;
+				int idnaType = getIDNAType(cp);
+				idnaTypeSet[idnaType].add(cp);
 				int script = ucd.getScript(cp);
 				if (coreChars[script] == null)
 					coreChars[script] = new UnicodeSet();
@ -208,8 +212,12 @@ public class TestData implements UCD_Types {
 			}
 			
 			Utility.fixDot();
-			PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
-					"idn-chars.html");
+			PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
+			PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
+			textOut.println('\uFEFF');
+			textOut.println("For documentation, see idn-chars.html");
+			Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
+			/*
 			out
 					.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
 			out.println("<title>IDN Characters</title><style>");
@ -217,44 +225,87 @@ public class TestData implements UCD_Types {
 			out.println(".script       { font-size: 150%; background-color: #CCCCCC }");
 			out.println(".Atomic       { background-color: #CCCCFF }");
 			out.println(".Atomic-no-uppercase       { background-color: #CCFFCC }");
-			out.println(".Non-ID       { background-color: #FFCCCC }");
+			out.println(".Non-XID       { background-color: #FFCCCC }");
 			out.println(".Decomposable       { background-color: #FFFFCC }");
+			out.println(".Pattern_Syntax       { background-color: #FFCCFF }");
+			
 			out.println("th           { text-align: left }");
 			out.println("-->");
 			out.println("</style></head><body><table>");
+			*/
+			htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");

 			for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
 				if (scriptCode == COMMON_SCRIPT
 						|| scriptCode == INHERITED_SCRIPT)
 					continue;
-				showCodes(out, scriptCode);
+				showCodes(htmlOut, textOut, scriptCode);
 			}
-			showCodes(out, COMMON_SCRIPT);
-			showCodes(out, INHERITED_SCRIPT);
-			out.println("</table></body></html>");
-			out.close();
+			showCodes(htmlOut, textOut, COMMON_SCRIPT);
+			showCodes(htmlOut, textOut, INHERITED_SCRIPT);
+			htmlOut.println("</table></body></html>");
+			htmlOut.close();
 		}
 		
+		UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
+		{
+			for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
+		}
+		static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
+		/**
+		 * Classifies one code point by converting it to ASCII with IDNA and back.
+		 * Uses the shared scratch fields {@code inbuffer}, {@code intermediate}
+		 * and {@code outbuffer}.
+		 *
+		 * @param cp the code point to classify
+		 * @return {@code DELETED} if the ASCII form is empty, {@code ILLEGAL} if
+		 *         either conversion throws, {@code REMAPPED} if the round trip
+		 *         does not reproduce the input, otherwise {@code OK}
+		 */
+		private int getIDNAType(int cp) {
+			// Reset the shared input buffer so it holds only this code point.
+			inbuffer.setLength(0);
+			UTF16.append(inbuffer, cp);
+			try {
+				// NOTE(review): to-ASCII uses DEFAULT while to-Unicode uses
+				// USE_STD3_RULES; confirm the mismatch is intended.
+				intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES
+				if (intermediate.length() == 0) {
+					return DELETED;
+				}
+				outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES);
+			} catch (StringPrepParseException e) {
+				return ILLEGAL;
+			} catch (Exception e) {
+				// Any other failure is logged and also treated as illegal.
+				System.out.println("Failure at: " + Utility.hex(cp));
+				return ILLEGAL;
+			}
+			return TestData.equals(inbuffer, outbuffer) ? OK : REMAPPED;
+		}
+		StringBuffer inbuffer = new StringBuffer();
+		StringBuffer intermediate, outbuffer;
+
 		UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");

 		/**
-		 * @param out
+		 * @param htmlOut
+		 * @param textOut TODO
+		 * @param scriptCode
 		 * @param ucd
 		 * @param coreChars
 		 * @param decompChars
-		 * @param scriptCode
 		 */
-		private void showCodes(PrintWriter out, int scriptCode) {
+		private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
 			if (coreChars[scriptCode] == null) return;
 			System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
 			String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
-			out.println();
-			out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
+			htmlOut.println();
+			htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
+			textOut.println();
+			textOut.println("#*** Script: " + script + " ***");
 			UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
-			UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
-			core.removeAll(decomp);
-			UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
-			core.removeAll(non_id);
+			
+			UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
+			UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
+			UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
+			
+			UnicodeSet decomp = extract(decomposable, core);
+			UnicodeSet pattern = extract(patternProp, core);
+			UnicodeSet non_id = extract(not_xid_continue, core);
+			
 			UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
 			core.removeAll(otherCore);
 			if (core.size() == 0) {
@ -262,58 +313,81 @@ public class TestData implements UCD_Types {
 				core = otherCore;
 				otherCore = temp;
 			}
-			printlnSet(out, "Atomic", core, scriptCode);
-			if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
-			if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
-			if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
+			
+			if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
+			if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode);
+			if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
+			if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
+			if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
+
+			if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode);
+			if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
+			if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
 		}

 		/**
-		 * @param out
-		 * @param unicodeset
-		 * @param uca
-		 * @param scriptCode
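+		 * Removes from core every member that is also in other, and returns the removed members as a new set.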
 		 */
-		private  void printlnSet(PrintWriter out, String title,
-				UnicodeSet unicodeset, int scriptCode) {
+		private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
+			UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
+			core.removeAll(decomp);
+			return decomp;
+		}
+
+		/**
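+		 * @param htmlOut writer for the HTML table rows
+		 * @param textOut writer for the plain-text listing
+		 * @param script script name; combined with title as the value source for the text listing
+		 * @param title subcategory label; also used as the HTML class name
+		 * @param unicodeset characters to list; nothing is written when null
+		 * @param scriptCode script index; HAN_SCRIPT and HANGUL_SCRIPT are listed as ranges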
+		 */
+		private  void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
+				String script, String title, UnicodeSet unicodeset, int scriptCode) {
 			if (unicodeset == null)
 				return;
 			int size = unicodeset.size();
 			String dir = unicodeset.containsSome(bidiR)
 					&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
-			out.println("<tr><th class='" + title + "'>" + title + " ("
+			htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
 					+ nf.format(size) + ")</th></tr>");
-			out.print("<tr><td class='" + title + "'" + dir + ">");
+			htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
+			textOut.println();
+			textOut.println("# " + title);
+			bf.setValueSource(script + " ; " + title);
 			UnicodeSetIterator usi = new UnicodeSetIterator();
 			if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
 				usi.reset(unicodeset);
 				while (usi.nextRange()) {
 					if (usi.codepoint == usi.codepointEnd) {
-						out.print(formatCode(UTF16
+						htmlOut.print(formatCode(UTF16
 								.valueOf(usi.codepoint)));
 					} else {
-						out.print(formatCode(UTF16
+						htmlOut.print(formatCode(UTF16
 								.valueOf(usi.codepoint))
 								+ ".. "
 								+ formatCode(UTF16
 										.valueOf(usi.codepointEnd)));
 					}
 				}
+				bf.showSetNames(textOut, unicodeset);
 			} else {
 				Set reordered = new TreeSet(uca);
 				usi.reset(unicodeset);
 				while (usi.next()) {
-					boolean foo = reordered.add(usi.getString());
+					String x = usi.getString();
+					boolean foo = reordered.add(x);
 					if (!foo)
 						throw new IllegalArgumentException("Collision with "
-								+ Default.ucd().getCodeAndName(usi.getString()));
+								+ Default.ucd().getCodeAndName(x));
 				}
 				for (Iterator it = reordered.iterator(); it.hasNext();) {
-					out.print(formatCode((String) it
-							.next()));
+					Object key = it.next();
+					htmlOut.print(formatCode((String)key));
 				}
+				bf.showSetNames(textOut, reordered);
 			}
-			out.println("</td></tr>");
+			htmlOut.println("</td></tr>");
 		}

 		/**
@ -324,7 +398,7 @@ public class TestData implements UCD_Types {
 			int cat = ucd.getCategory(UTF16.charAt(string,0));
 			return "<span title='" + ucd.getCodeAndName(string) + "'>"
 			+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
-			+ BagFormatter.toHTML.transliterate(string)
+			+ BagFormatter.toHTMLControl.transliterate(string)
 			+ " </span>";
 		}
 	}
--- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
@ -123,7 +123,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
            public int getMaxWidth(boolean isShort) {
                return 15;
            }
-        }.setValues(LONG_YES_NO, YES_NO)
+        }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
 		.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));
        
        add(new UnicodeProperty.SimpleProperty() {
@ -135,7 +135,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
            public int getMaxWidth(boolean isShort) {
                return 15;
            }
-        }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
+        }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
 		.setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));
        
        add(new UnicodeProperty.SimpleProperty() {
@ -147,7 +147,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
            public int getMaxWidth(boolean isShort) {
                return 15;
            }
-        }.setValues(LONG_YES_NO, YES_NO)
+        }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
 		.setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));
        
        add(new UnicodeProperty.SimpleProperty() {
@ -159,7 +159,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
            public int getMaxWidth(boolean isShort) {
                return 15;
            }
-        }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
+        }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
 		.setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));


@ -235,7 +235,12 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
        		unicodeMap.putAll(hangul.getSet("LVT"),"LVT");
        		unicodeMap.setMissing("Other");
        	}
-        }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version));
+        }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version)
+		.addValueAliases(new String[][] {
+				{"Control", "CN"},
+				{"Extend", "EX"},
+				{"Other", "XX"},
+		}).swapFirst2ValueAliases());

        add(new UnicodeProperty.UnicodeMapProperty() {
        	{
@ -268,7 +273,17 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
        		unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
        		unicodeMap.setMissing("Other");
        	}
-        }.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version));
+        }.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version)
+		.addValueAliases(new String[][] {
+				{"Format", "FO"},
+				{"Katakana", "KA"},
+				{"ALetter", "LE"},
+				{"MidLetter", "ML"},
+				{"MidNum", "MN"},
+				{"Numeric", "NU"},
+				{"ExtendNumLet", "EX"},
+				{"Other", "XX"},
+		}).swapFirst2ValueAliases());

        add(new UnicodeProperty.UnicodeMapProperty() {
        	{
@ -307,7 +322,20 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
        		unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
        		unicodeMap.setMissing("Other");
        	}
-        }.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version));
+        }.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version)
+		.addValueAliases(new String[][] {
+				{"Sep", "SE"},
+				{"Format", "FO"},
+				{"Sp", "SP"},
+				{"Lower", "LO"},
+				{"Upper", "UP"},
+				{"OLetter", "LE"},
+				{"Numeric", "NU"},
+				{"ATerm", "AT"},
+				{"STerm", "ST"},
+				{"Close", "CL"},
+				{"Other", "XX"},
+		}).swapFirst2ValueAliases());
    }
    
    static String[] YES_NO_MAYBE = {"N", "M", "Y"};
--- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
+++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
@ -14,58 +14,83 @@
 .Non-XID       { background-color: #FFCCCC }
 .Decomposable       { background-color: #FFFFCC }
 .Pattern_Syntax       { background-color: #FFCCFF }
+.IDN-Remapped       { background-color: #FF6666 }
+.IDN-Deleted       { background-color: #66FF66 }
+.IDN-Illegal       { background-color: #6666FF }
 th           { text-align: left }
 -->
 </style>
 </head>

-<body>
+<body style="margin: 2em">

 <h1>IDN Character Categorization</h1>
-<p>$Date: 2005/03/29 18:31:15 $, MED</p>
+<p><i>$Date: 2005/03/30 17:19:32 $, MED</i></p>
 <p>This page lists all of the valid output IDN characters broken down by category. By &quot;output&quot; IDN 
 characters, we mean ones that can result from nameprep. Characters are grouped first by script, and 
 then by subcategory. Within each subcategory characters are sorted according to the default
-<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tooltips provide the character code 
+<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tool-tips provide the character code 
 and name (in enabled browsers).</p>
-<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse" bordercolor="#111111" id="AutoNumber1">
-  <tr>
-    <th>Subcategory</th>
-    <th>Description</th>
-  </tr>
-  <tr>
-    <td class="Atomic">Atomic</td>
-    <td>Characters that don&#39;t fall into any of the following subcategories</td>
-  </tr>
-  <tr>
-    <td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
-    <td>For bicameral scripts, Atomic characters without an uppercase.</td>
-  </tr>
-  <tr>
-    <td class="Pattern_Syntax">Pattern_Syntax</td>
-    <td>Characters recommended as a basis for syntax, as in
-    <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>. 
-    Excludes the word characters in <i>Section 4 Word Boundaries</i> of
-    <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the 
-    Word_Break property and notes at the end of the section.&nbsp;&nbsp; </td>
-  </tr>
-  <tr>
-    <td class="Non-XID">Non-XID</td>
-    <td>Characters recommended as a basis for identifiers, as in
-    <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a> 
-    (XID_Continue). Excludes the word characters in <i>Section 4 Word Boundaries</i> of
-    <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the 
-    Word_Break property and notes at the end of the section.</td>
-  </tr>
-  <tr>
-    <td class="Decomposable">Decomposable</td>
-    <td>Characters with NFC decompositions.</td>
-  </tr>
-</table>
-<table>
-</table>
-<h2>Categorization</h2>
-
-</body>
-
-</html>
+<blockquote>
+  <table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
+    <caption><b><font size="4">Key</font></b></caption>
+    <tr>
+      <th>Subcategory</th>
+      <th>Description</th>
+    </tr>
+    <tr>
+      <td class="Atomic">Atomic</td>
+      <td>Characters that don&#39;t fall into any of the following subcategories</td>
+    </tr>
+    <tr>
+      <td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
+      <td>For bicameral scripts, Atomic characters without an uppercase.</td>
+    </tr>
+    <tr>
+      <td class="Pattern_Syntax">Pattern_Syntax</td>
+      <td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word 
+      characters in <i>Section 4 Word Boundaries</i> of
+      <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the 
+      Word_Break property and notes at the end of the section.</p>
+      <p>See <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and 
+      Pattern Syntax</a>. </td>
+    </tr>
+    <tr>
+      <td class="Non-XID">Non-XID</td>
+      <td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the 
+      word characters in <i>Section 4 Word Boundaries</i> of
+      <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the 
+      Word_Break property and notes at the end of the section.<p>See
+      <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern 
+      Syntax</a> (XID_Continue).</td>
+    </tr>
+    <tr>
+      <td class="Decomposable">Decomposable</td>
+      <td>Characters with NFC decompositions.</td>
+    </tr>
+    <tr>
+      <td class="IDN-Remapped">IDN-Remapped</td>
+      <td>Characters remapped by IDN.</td>
+    </tr>
+    <tr>
+      <td class="IDN-Deleted">IDN-Deleted</td>
+      <td>Characters deleted by IDN.</td>
+    </tr>
+    <tr>
+      <td class="IDN-Illegal">IDN-Illegal </td>
+      <td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
+    </tr>
+  </table>
+</blockquote>
+<p>The information in the categorization is also available in a plain-text file, at
+<a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for 
+sorting and filtering to view the data in different ways. The format is:</p>
+<blockquote>
+  <p>code ; script ; subcategory # general-category (character) character-name</p>
+</blockquote>
+<p><i>Examples:</i></p>
+<pre>0061          ; LATIN ; Atomic # L&amp; (a) LATIN SMALL LETTER A
+2015          ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
+058A          ; ARMENIAN ; Atomic-no-uppercase # Pd (֊) ARMENIAN HYPHEN
+20AC          ; COMMON ; Non-XID # Sc (€) EURO SIGN</pre>
+<h2>Categorization</h2>
--- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
+++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
@ -17,43 +17,53 @@ public class UnicodeDataFile {
    private String mostRecent;
    private String filename;
    private UnicodeDataFile(){};
+    private String fileType = ".txt";
    
    public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
-        UnicodeDataFile result = new UnicodeDataFile();
-        result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
-        result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
-        String[] batName = {""};
-        result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
-        result.batName = batName[0];
-    	result.filename = filename;
+        return new UnicodeDataFile(directory, filename, false);
+    }
+    
+    public static UnicodeDataFile openHTMLAndWriteHeader(String directory, String filename) throws IOException {
+        return new UnicodeDataFile(directory, filename, true);
+    }
+    
+    private UnicodeDataFile (String directory, String filename, boolean isHTML) throws IOException {
+    	fileType = isHTML ? ".html" : ".txt";
+    	String newSuffix = UnicodeDataFile.getFileSuffix(true, fileType);
+        newFile = directory + filename + newSuffix;
+        out = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
+        String[] batName2 = {""};
+        mostRecent = UnicodeDataFile.generateBat(directory, filename, newSuffix, fileType, batName2);
+        batName = batName2[0];
+    	this.filename = filename; // must qualify with this: the parameter shadows the field that close() reads
         
-        result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
-        result.out.println(generateDateLine());
-        result.out.println("#");        
-        result.out.println("# Unicode Character Database");        
-        result.out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
-        result.out.println(
-            "# For terms of use, see http://www.unicode.org/terms_of_use.html");
-        result.out.println("# For documentation, see UCD.html");
+    	if (!isHTML) {
+	        out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
+	        out.println(generateDateLine());
+	        out.println("#");        
+	        out.println("# Unicode Character Database");        
+	        out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
+	        out.println(
+	            "# For terms of use, see http://www.unicode.org/terms_of_use.html");
+	        out.println("# For documentation, see UCD.html");
+    	}
         try {
-            Utility.appendFile(filename + "Header.txt", Utility.LATIN1, result.out);
+            Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out);
         } catch (FileNotFoundException e) {
             /*
-            result.out.println("# Unicode Character Database: Derived Property Data");
-            result.out.println("# Generated algorithmically from the Unicode Character Database");
-            result.out.println("# For documentation, see UCD.html");
-            result.out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
-            result.out.println("#       if they have default property values.");
-            result.out.println("# ================================================");
+            out.println("# Unicode Character Database: Derived Property Data");
+            out.println("# Generated algorithmically from the Unicode Character Database");
+            out.println("# For documentation, see UCD.html");
+            out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
+            out.println("#       if they have default property values.");
+            out.println("# ================================================");
             */
         }
-        
-        return result;
     }
    
    public void close() throws IOException {
        try {
-            Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out);
+            Utility.appendFile(filename + "Footer" + fileType, Utility.UTF8_UNIX, out);
        } catch (FileNotFoundException e) {}
        out.close();           
        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
@ -64,21 +74,20 @@ public class UnicodeDataFile {
    }

    public static String getHTMLFileSuffix(boolean withDVersion) {
-        return "-"
-            + Default.ucd().getVersion()
-            + ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
-                ? ("d" + MakeUnicodeFiles.dVersion)
-                : "")
-            + ".html";
+        return getFileSuffix(withDVersion, ".html");
    }

    public static String getFileSuffix(boolean withDVersion) {
+    	return getFileSuffix(withDVersion, ".txt");
+    }
+
+    public static String getFileSuffix(boolean withDVersion, String suffix) {
        return "-"
            + Default.ucd().getVersion()
            + ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
                ? ("d" + MakeUnicodeFiles.dVersion)
                : "")
-            + ".txt";
+            + suffix;
    }

    //Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
@ -126,8 +135,8 @@ public class UnicodeDataFile {
        */
        // static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
        
-    public static String generateBat(String directory, String fileRoot, String suffix, String[] outputBatName) throws IOException {
-        String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true);
+    public static String generateBat(String directory, String fileRoot, String suffix, String fileType, String[] outputBatName) throws IOException {
+        String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true, fileType);
        if (mostRecent != null) {
            outputBatName[0] = UnicodeDataFile.generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
                mostRecent, directory + fileRoot + suffix);
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2005/03/04 02:50:26 $
-* $Revision: 1.47 $
+* $Date: 2005/03/30 17:19:32 $
+* $Revision: 1.48 $
 *
 *******************************************************************************
 */
@ -1021,7 +1021,12 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
    }

    public static String getMostRecentUnicodeDataFile(String filename, String version, 
-      boolean acceptLatest, boolean show) throws IOException {
+    	      boolean acceptLatest, boolean show) throws IOException {
+    	return getMostRecentUnicodeDataFile(filename, version, acceptLatest, show, ".txt");
+    }
+    
+    public static String getMostRecentUnicodeDataFile(String filename, String version, 
+      boolean acceptLatest, boolean show, String fileType) throws IOException {
        // get all the files in the directory

        int compValue = acceptLatest ? 0 : 1;
@ -1030,7 +1035,7 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES

            String directoryName = UCD_Types.UCD_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
            if (show) System.out.println("Trying: '" + directoryName + "', '" + filename + "'");
-            String result = searchDirectory(new File(directoryName), filename, show);
+            String result = searchDirectory(new File(directoryName), filename, show, fileType);
            if (result != null) return result;
            
        }
@ -1048,16 +1053,20 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
    }
    
    public static String searchDirectory(File directory, String filename, boolean show) throws IOException {
+    	return searchDirectory(directory, filename, show, ".txt");
+    }
+    
+    public static String searchDirectory(File directory, String filename, boolean show, String fileType) throws IOException {
        Iterator it = getDirectoryContentsLastFirst(directory).iterator();
        while (it.hasNext()) {
            String fn = (String) it.next();
            File foo = new File(directory + File.separator + fn);
            // System.out.println("\tChecking: '" + foo.getCanonicalPath() + "'");
            if (foo.isDirectory()) {
-                String attempt = searchDirectory(foo, filename, show);
+                String attempt = searchDirectory(foo, filename, show, fileType);
                if (attempt != null) return attempt;
            }
-            if (fn.endsWith(".txt") && fn.startsWith(filename)) {
+            if (fn.endsWith(fileType) && fn.startsWith(filename)) {
                if (show) System.out.println("\tFound: '" + fn + "'");
                return foo.getCanonicalPath();
            }