ICU-0 uca fixes

X-SVN-Rev: 17533
2005-05-02 15:39:54 +00:00 · 2005-05-02 15:39:54 +00:00 · c6350d9d97
commit c6350d9d97
parent b83dda29e5
8 changed files with 274 additions and 108 deletions
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2005/04/06 08:48:16 $ 
-* $Revision: 1.24 $
+* $Date: 2005/05/02 15:39:54 $ 
+* $Revision: 1.25 $
 *
 *******************************************************************************
 */
@ -930,7 +930,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
     */
    static final char EMPTY = '\uFFFF';
    char rearrangeBuffer = EMPTY;
-    UnicodeSet rearrangeList = null;
+    UnicodeSet rearrangeList = new UnicodeSet();
    int hangulBufferPosition = 0;
    StringBuffer hangulBuffer = new StringBuffer();

@ -1102,6 +1102,22 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
    UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154);
    UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F);
    UnicodeSet homelessSecondaries;
+    static final UnicodeSet moreSamples = new UnicodeSet(); // extra two-code-unit sample strings; iterated via moreSampleIterator and returned as additional samples
+    static { // fills moreSamples once, when the class is initialized
+    	moreSamples.add("\u09C7\u09BE");
+    	moreSamples.add("\u09C7\u09D7");
+    	moreSamples.add("\u1025\u102E");
+    	moreSamples.add("\u0DD9\u0DCF");
+    	moreSamples.add("\u0DD9\u0DDF");
+    	moreSamples.add("\u1100\u1161");
+    	moreSamples.add("\u1100\u1175");
+    	moreSamples.add("\u1112\u1161");
+    	moreSamples.add("\u1112\u1175");
+    	moreSamples.add("\uAC00\u1161");
+    	moreSamples.add("\uAC00\u1175");
+    	moreSamples.add("\uD788\u1161");
+    	moreSamples.add("\uD788\u1175");
+    }

    // static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198);
    //  0x0153..0x017F
@ -1121,6 +1137,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        int skip = 1;
        boolean doSamples = false;
        AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
+        UnicodeSetIterator moreSampleIterator = new UnicodeSetIterator(moreSamples);
+     
        
        /**
         * use FIXED_CE as the limit
@ -1231,6 +1249,12 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
                return result;
            }
            
+            if (moreSampleIterator.next()) {
+            	result = moreSampleIterator.getString();
+                if (DEBUG) System.out.println("More Samples: " + ucd.getCodeAndName(result));
+                return result;
+           }
+            
            // extra samples
            if (currentRange < SAMPLE_RANGES.length) {
                try {
@ -1329,9 +1353,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        
        // In UAX 3.1, the rearrange list is moved to UCD.
        
+        if (ucaData.lessThan410) {        	
        	rearrangeList = UnifiedBinaryProperty.make(UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd)
            .getSet();
-        
+        }
            
        while (true) try {
            inputLine = in.readLine();
@ -1465,7 +1490,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
            UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd);
        UnicodeSet desiredSet = ubp.getSet();
        
-        if (!rearrangeList.equals(desiredSet)) {
+        if (ucaData.lessThan410 && !rearrangeList.equals(desiredSet)) {
            throw new IllegalArgumentException("Rearrangement should be " + desiredSet.toPattern(true)
                + ", but is " + rearrangeList.toPattern(true));
        }
--- a/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $ 
-* $Date: 2004/03/11 19:03:19 $ 
-* $Revision: 1.2 $
+* $Date: 2005/05/02 15:39:54 $ 
+* $Revision: 1.3 $
 *
 *******************************************************************************
 */
@ -29,6 +29,7 @@ import com.ibm.icu.text.UnicodeSet;
 public class UCA_Data implements UCA_Types {
    static final boolean DEBUG = false;
    static final boolean DEBUG_SHOW_ADD = false;
+    static final boolean lessThan410 = false;
    
    private Normalizer toD;
    private UCD ucd;
@ -197,7 +198,7 @@ public class UCA_Data implements UCA_Types {
                int increment = UTF16.getCharCount(cp2);
                
                // CHECK if last char was completely ignorable
-                if (isCompletelyIgnoreable(cp2)) {
+                if (lessThan410 && isCompletelyIgnoreable(cp2)) {
                    index += increment; // just skip char don't set probe, value
                    continue;
                }
@ -231,7 +232,7 @@ public class UCA_Data implements UCA_Types {
                lastCan = can;                  // remember for next time
                
                // CHECK if last char was completely ignorable. If so, skip it.
-                if (isCompletelyIgnoreable(cp2)) {
+                if (lessThan410 && isCompletelyIgnoreable(cp2)) {
                    continue;
                }
                
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2005/04/06 08:48:17 $ 
-* $Revision: 1.40 $
+* $Date: 2005/05/02 15:39:54 $ 
+* $Revision: 1.41 $
 *
 *******************************************************************************
 */
@ -440,9 +440,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
            if (!shortPrint) {
                log.print(Utility.hex(source));
                log.print(
-                    ";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
+                    ";\t# (" + quoteOperand(clipped) + ") " + ucd.getName(clipped) + "\t" + UCA.toString(key));
            } else {
-                log.print(Utility.hex(source) + ";\t" + Utility.hex(clipped));
+                log.print(Utility.hex(source));
            }
            log.println();
        }
@ -537,16 +537,16 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
        //Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
        //Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
        switch (strength) {
-            case 1: log.println("<h2>3. Primaries Incompatible with Decompositions</h2>"); break;
-            case 2: log.println("<h2>4. Secondaries Incompatible with Decompositions</h2>"); break;
-            case 3: log.println("<h2>5. Tertiaries Incompatible with Decompositions</h2>"); 
-                log.println("<p>Note: Tertiary differences are not really errors; these are just warnings</p>"); 
+            case 1: log.println("<h2>3. Primaries Incompatible with NFKD</h2>"); break;
+            case 2: log.println("<h2>4. Secondaries Incompatible with NFKD</h2>"); break;
+            case 3: log.println("<h2>5. Tertiaries Incompatible with NFKD</h2>"); 
             break;
            default: throw new IllegalArgumentException("bad strength: " + strength);
        }
+        log.println("<p>Note: Differences are not really errors; but they should be checked over for inadvertant problems</p>"); 
        log.println("<p>Warning: only checking characters defined in base: " + ucd_uca_base.getVersion() + "</p>");
        log.println("<table border='1' cellspacing='0' cellpadding='2'>");
-        log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
+        log.println("<tr><th>Code</td><th>Sort Key</th><th>NFKD Sort Key</th><th>Name</th></tr>");
        
        int errorCount = 0;
        
@ -1991,7 +1991,7 @@ F900..FAFF; CJK Compatibility Ideographs
                    relation = getStrengthDifference(ces, len, ces2, len2);
                    	
                    reset = quoteOperand(UTF16.valueOf(resetCp));
-                    resetComment = ucd.getCodeAndName(resetCp);
+                    if (!shortPrint) resetComment = ucd.getCodeAndName(resetCp);
                    // lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
                    xmlReset = 2;
                }
@ -2523,6 +2523,7 @@ F900..FAFF; CJK Compatibility Ideographs
    static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
    
    static UnicodeSet needsQuoting = null;
+    static UnicodeSet needsUnicodeForm = null;
        
    static final String quoteOperand(String s) {
        if (needsQuoting == null) {
@ -2533,8 +2534,13 @@ F900..FAFF; CJK Compatibility Ideographs
              || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))
              */
            needsQuoting = new UnicodeSet(
-                "[[:whitespace:][:c:][:z:][[:ascii:]-[a-zA-Z0-9]]]");
+            "[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); // 
+            //"[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:]
+            //for (int i = 0; i <= 0x10FFFF; ++i) {
+            //	if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i);
+            //}
            // needsQuoting.remove();
+            needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
        }
    	s = Default.nfc().normalize(s);
        quoteOperandBuffer.setLength(0);
@ -2558,7 +2564,8 @@ F900..FAFF; CJK Compatibility Ideographs
                        quoteOperandBuffer.append('\'');
                        inQuote = true;
                    }
-                    if (cp > 0xFFFF) {
+                    if (!needsUnicodeForm.contains(cp)) quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
+                    else if (cp > 0xFFFF) {
                        quoteOperandBuffer.append("\\U").append(Utility.hex(cp,8));
                    } else if (cp <= 0x20 || cp > 0x7E) {
                        quoteOperandBuffer.append("\\u").append(Utility.hex(cp));
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
@ -515,6 +515,8 @@ public class MakeUnicodeFiles {
            GenerateCaseFolding.generateSpecialCasing(false);
        } else if (filename.equals("StandardizedVariants")) {
            GenerateStandardizedVariants.generate();
+        } else if (filename.equals("NamedSequences")) {
+        	GenerateNamedSequences.generate();
        } else if (filename.equals("GraphemeBreakTest")) {
            new GenerateGraphemeBreakTest(Default.ucd()).run();
        } else if (filename.equals("WordBreakTest")) {
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
@ -1,4 +1,4 @@
-Generate: DerivedBidiClass
+Generate: NamedSequences
 DeltaVersion: 14
 CopyrightYear: 2005

@ -361,6 +361,9 @@ Property: SPECIAL
 File:	StandardizedVariants
 Property: SPECIAL

+File:	NamedSequences
+Property: SPECIAL
+
 HackName:	noBreak
 HackName:	Arabic_Presentation_Forms-A
 HackName:	Arabic_Presentation_Forms-B
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/04/06 08:48:17 $
-* $Revision: 1.21 $
+* $Date: 2005/05/02 15:39:53 $
+* $Revision: 1.22 $
 *
 *******************************************************************************
 */
@ -19,6 +19,7 @@ import java.text.DateFormat;
 import java.text.SimpleDateFormat;

 import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.CollectionUtilities;
 import com.ibm.icu.dev.test.util.ICUPropertyFactory;
 import com.ibm.icu.dev.test.util.UnicodeLabel;
 import com.ibm.icu.dev.test.util.UnicodeMap;
@ -152,6 +153,7 @@ public class TestData implements UCD_Types {
 	Matcher m;
 	
 	static class GenStringPrep {
+		
 		UnicodeSet[] coreChars = new UnicodeSet[100];
 		UnicodeSet decomposable = new UnicodeSet();
 		UnicodeMap suspect = new UnicodeMap();
@ -159,11 +161,15 @@ public class TestData implements UCD_Types {
 		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
 		//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
 		UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
-		UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher());
+		UnicodeSet wordChars = new UnicodeSet();
 		{
+			if (false) {
+				wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
 				wordChars.retainAll(ups.getSet("gc=Sk"));
+			}
 			wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
-			" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0]"));
+			" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
+			" \\u055A \\u02B9 \\u02BA]"));
 			//wordChars.removeAll(xid_continue);
 		}
 		
@ -193,6 +199,7 @@ public class TestData implements UCD_Types {
 		UnicodeSet inIDN = new UnicodeSet();

 		void genStringPrep() throws IOException {
+			//showScriptToBlock();
 			bf.setShowLiteral(BagFormatter.toHTMLControl);
 			//bf.setValueSource(UnicodeLabel.NULL);
 			if (false) {
@ -221,10 +228,13 @@ public class TestData implements UCD_Types {
 			
 			Utility.fixDot();
 			PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
+			PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
 			PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
 			textOut.println('\uFEFF');
 			textOut.println("For documentation, see idn-chars.html");
-			Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
+			
+			Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut, 
+					new String[] {"%date%", Default.getDate()});
 			/*
 			out
 					.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
@ -241,27 +251,31 @@ public class TestData implements UCD_Types {
 			out.println("-->");
 			out.println("</style></head><body><table>");
 			*/
-			htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");
+			htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
+			htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");

 			for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
 				if (scriptCode == COMMON_SCRIPT
 						|| scriptCode == INHERITED_SCRIPT)
 					continue;
-				showCodes(htmlOut, textOut, scriptCode);
+				showCodes(htmlOut, textOut, scriptCode, htmlOut2);
 			}
-			showCodes(htmlOut, textOut, COMMON_SCRIPT);
-			showCodes(htmlOut, textOut, INHERITED_SCRIPT);
+			showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
+			showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
 			htmlOut.println("</table></body></html>");
 			htmlOut.close();
+			htmlOut2.println("</table></body></html>");
+			htmlOut2.close();
 			bf.setMergeRanges(false);

 			textOut.println();
-			textOut.println("# *** WORD CHARACTERS ADDED ***");
+			textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
+			textOut.println();
 			bf.setValueSource("word-chars");
 			bf.showSetNames(textOut, wordChars);
 			
 			textOut.println();
-			textOut.println("# *** FOR REVIEW (collected from above) ***");
+			textOut.println("# *** FOR REVIEW ***");
 			bf.setLabelSource(UnicodeLabel.NULL);
 			for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
 				textOut.println();
@ -272,6 +286,93 @@ public class TestData implements UCD_Types {
 			textOut.close();
 		}
 		
+		/**
+		 * Debugging aid: prints every value of the script map composed with the block map to System.out, then always throws to stop the run.
+		 */
+		private void showScriptToBlock() {
+			UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
+			UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
+			UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
+				public Object compose(Object a, Object b) {
+					return a + "\t" + b; // joins the two values with a tab
+				}
+			};
+			UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose); // works on a clone of the script map
+			for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) { // presumably fills the TreeSet, so values print in sorted order - confirm
+				System.out.println(it.next());
+			}
+			throw new IllegalArgumentException(); // deliberate: aborts right after the dump; the call in genStringPrep is commented out
+		}
+		
+		Map scriptToGif = CollectionUtilities.asMap(script_to_gif); // lookup built from script_to_gif (presumably first column -> second; confirm against asMap); showCodes reads the image file name from it
+		
+		static String[][] script_to_gif = { // rows of {script name, image file name}; each trailing comment apparently names the Unicode block the image came from
+				
+			{"Common","common.gif"}, //Miscellaneous_Symbols
+			{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
+			{"Arabic","arabic.gif"}, //Arabic
+			{"Armenian","armenian.gif"}, //Armenian
+			{"Bengali","bengali.gif"}, //Bengali
+			{"Bopomofo","bopomofo.gif"}, //Bopomofo
+			{"Braille","braillesymbols.gif"}, //Braille_Patterns
+			{"Buginese","buginese.gif"}, //Buginese
+			{"Buhid","buhid.gif"}, //Buhid
+			{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
+			{"Cherokee","cherokee.gif"}, //Cherokee
+			{"Coptic","coptic.gif"}, //Coptic
+			{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
+			{"Cyrillic","cyrillic.gif"}, //Cyrillic
+			{"Deseret","deseret.gif"}, //Deseret
+			{"Devanagari","devanagari.gif"}, //Devanagari
+			{"Ethiopic","ethiopic.gif"}, //Ethiopic
+			{"Georgian","georgian.gif"}, //Georgian
+			{"Glagolitic","glagolitic.gif"}, //Glagolitic
+			{"Gothic","gothic.gif"}, //Gothic
+			{"Greek","greek.gif"}, //Greek_and_Coptic
+			{"Gujarati","gujarati.gif"}, //Gujarati
+			{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
+			{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
+			{"Han","kangxiradicals.gif"}, //Kangxi_Radicals -- NOTE(review): second row with key "Han"; which image ends up in scriptToGif depends on CollectionUtilities.asMap - confirm
+			{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
+			{"Hanunoo","hanunoo.gif"}, //Hanunoo
+			{"Hebrew","hebrew.gif"}, //Hebrew
+			{"Hiragana","hiragana.gif"}, //Hiragana
+			{"Kannada","kannada.gif"}, //Kannada
+			{"Katakana","katakana.gif"}, //Katakana
+			{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
+			{"Khmer","khmer.gif"}, //Khmer
+			{"Lao","lao.gif"}, //Lao
+			{"Latin","latin.gif"}, //Basic_Latin
+			{"Limbu","limbu.gif"}, //Limbu
+			{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
+			{"Malayalam","malayalam.gif"}, //Malayalam
+			{"Mongolian","mongolian.gif"}, //Mongolian
+			{"Myanmar","myanmar.gif"}, //Myanmar
+			{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
+			{"Ogham","ogham.gif"}, //Ogham
+			{"Old_Italic","olditalic.gif"}, //Old_Italic
+			{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
+			{"Oriya","oriya.gif"}, //Oriya
+			{"Osmanya","osmanya.gif"}, //Osmanya
+			{"Runic","runic.gif"}, //Runic
+			{"Shavian","shavian.gif"}, //Shavian
+			{"Sinhala","sinhala.gif"}, //Sinhala
+			{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
+			{"Syriac","syriac.gif"}, //Syriac
+			{"Tagalog","tagalog.gif"}, //Tagalog
+			{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
+			{"Tai_Le","taile.gif"}, //Tai_Le
+			{"Tamil","tamil.gif"}, //Tamil
+			{"Telugu","telugu.gif"}, //Telugu
+			{"Thaana","thaana.gif"}, //Thaana
+			{"Thai","thai.gif"}, //Thai
+			{"Tibetan","tibetan.gif"}, //Tibetan
+			{"Tifinagh","tifinagh.gif"}, //Tifinagh
+			{"Ugaritic","ugaritic.gif"}, //Ugaritic
+			{"Yi","yi.gif"}, //Yi_Syllables
+
+		};
+		
 		UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
 		{
 			for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
@ -309,16 +410,22 @@ public class TestData implements UCD_Types {
 		 * @param htmlOut
 		 * @param textOut TODO
 		 * @param scriptCode
+		 * @param htmlOut2 TODO
 		 * @param ucd
 		 * @param coreChars
 		 * @param decompChars
 		 */
-		private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
+		private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
 			if (coreChars[scriptCode] == null) return;
-			System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
 			String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
+			script = Utility.getUnskeleton(script.toLowerCase(),true);
+			System.out.println(script);
+			
 			htmlOut.println();
-			htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
+			String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
+			+ "'> Script: " + script + "</th></tr>";
+			htmlOut.println(scriptLine);
+			htmlOut2.println(scriptLine);
 			textOut.println();
 			textOut.println("#*** Script: " + script + " ***");
 			UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
@ -354,13 +461,13 @@ public class TestData implements UCD_Types {
 			if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
 			if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
 			if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
-			if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
+			if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);

 			if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
-			if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Decomposable", remappedIsNFKCDecomp, scriptCode);
+			if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
 			if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
 			if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
-			if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
+			if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
 		}

 		/**
@ -387,9 +494,11 @@ public class TestData implements UCD_Types {
 			int size = unicodeset.size();
 			String dir = unicodeset.containsSome(bidiR)
 					&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
-			htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
+			htmlOut.println("<tr><th class='" + title + "'><a href='#" +
+					title + "'>" + title + "</a> ("
 					+ nf.format(size) + ")</th></tr>");
 			htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
+			// <a href="#Atomic">categorization</a>
 			textOut.println();
 			textOut.println("# " + title);
 			bf.setValueSource(script + " ; " + title);
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2005/03/10 02:37:20 $
-* $Revision: 1.38 $
+* $Date: 2005/05/02 15:39:53 $
+* $Revision: 1.39 $
 *
 *******************************************************************************
 */
@ -31,6 +31,7 @@ import com.ibm.text.utility.*;
 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodeProperty;
+import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 public final class UCD implements UCD_Types {
@ -157,11 +158,11 @@ public final class UCD implements UCD_Types {
     * Get the character names for the code points in a string, separated by ", "
     */
    public String getName(String s, byte style) {
-        if (s.length() == 1) return getName(s.charAt(0), style);
+        if (s.length() == 1) return getName(s.charAt(0), style); // optimize BMP
        StringBuffer result = new StringBuffer();
        int cp;
-        for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
-            cp = UTF32.char32At(s, i);
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
            if (i > 0) result.append(", ");
            result.append(getName(cp, style));
        }
@ -1383,7 +1384,7 @@ to guarantee identifier closure.
        result.codePoint = codePoint;
        if (fixStrings) {
            if (result.name == null || isRemapped) result.name = constructedName;
-            if (result.shortName == null) result.shortName = Utility.replace(constructedName, UCD_Names.NAME_ABBREVIATIONS);
+            if (result.shortName == null) result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
            if (isRemapped) {
                result.decompositionMapping = result.bidiMirror
                = result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding
--- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
+++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
@ -6,93 +6,96 @@
 <meta name="ProgId" content="FrontPage.Editor.Document">
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <title>IDN Characters</title>
-<style>
-<!--
-.script       { font-size: 150%; background-color: #CCCCCC }
-.Atomic       { background-color: #CCCCFF }
-.Atomic-no-uppercase       { background-color: #CCFFCC }
-.Non-XID       { background-color: #FFCCCC }
-.Decomposable       { background-color: #FFFFCC }
-.Pattern_Syntax       { background-color: #FFCCFF }
-.IDN-Remapped-Case-Atomic       { background-color: #CCFFFF }
-.IDN-Remapped-Case-Decomposable       { background-color: #66FFFF }
-.IDN-Remapped-Compat       { background-color: #FF6666 }
-.IDN-Deleted       { background-color: #66FF66 }
-.IDN-Illegal       { background-color: #6666FF }
-th           { text-align: left }
-->
-</style>
+<link rel="stylesheet" type="text/css" href="idn-chars.css">
 </head>

-<body style="margin: 2em">
+<body>

 <h1>IDN Character Categorization</h1>
-<p><i>$Date: 2005/04/06 08:48:17 $, MED</i></p>
-<p>This page lists all of the valid output IDN characters broken down by category. By &quot;output&quot; IDN 
-characters, we mean ones that can result from nameprep. Characters are grouped first by script, and 
-then by subcategory. Within each subcategory characters are sorted according to the default
-<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tool-tips provide the character code 
-and name (in enabled browsers).</p>
+<p><i>%date%, MED</i></p>
+<p>This page lists all Unicode characters relevant to IDN in a <a href="#Categorization">chart</a>, 
+broken down by category. Characters are grouped first by script, and then by subcategory.</p>
+<p>The &quot;output&quot; IDN characters are ones that can result from nameprep, while the &quot;input&quot; characters 
+are those that are allowed in input, but transformed (remapped or deleted). Tool-tips provide the 
+character code and name (in enabled browsers). The following table describes the subcategories. 
+Within each subcategory characters are sorted according to the default
+<a href="http://www.unicode.org/reports/tr10/">UCA</a> order.</p>
 <blockquote>
-  <table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
+  <table border="1" cellpadding="2" cellspacing="0">
    <caption><b><font size="4">Key</font></b></caption>
    <tr>
+      <th>Type</th>
      <th>Subcategory</th>
      <th>Description</th>
    </tr>
    <tr>
-      <td class="Atomic">Atomic</td>
+      <th rowspan="5">Output</th>
+      <td class="Atomic"><a name="Atomic">Atomic</a></td>
      <td>Characters that don&#39;t fall into any of the following subcategories</td>
    </tr>
    <tr>
-      <td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
-      <td>For bicameral scripts, Atomic characters without an uppercase.</td>
+      <td class="Atomic-no-uppercase"><a name="Atomic-no-uppercase">Atomic-no-uppercase</a></td>
+      <td>For bicameral scripts, Atomic characters without an uppercase. These need to be examined 
+      to see which are used in modern languages.</td>
    </tr>
    <tr>
-      <td class="Pattern_Syntax">Pattern_Syntax</td>
-      <td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word 
-      characters in <i>Section 4 Word Boundaries</i> of
-      <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the 
-      Word_Break property and notes at the end of the section.</p>
-      <p>See <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and 
-      Pattern Syntax</a>. </td>
+      <td class="Pattern_Syntax"><a name="Pattern_Syntax">Pattern_Syntax</a></td>
+      <td>Characters recommended as a basis for use in pattern syntax. Excludes the
+      <a href="#Word_Characters">additional word characters</a>.</td>
    </tr>
    <tr>
-      <td class="Non-XID">Non-XID</td>
-      <td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the 
-      word characters in <i>Section 4 Word Boundaries</i> of
-      <a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the 
-      Word_Break property and notes at the end of the section.<p>See
-      <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern 
-      Syntax</a> (XID_Continue).</td>
+      <td class="Non-XID"><a name="Non-XID">Non-XID</a></td>
+      <td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and
+      <a href="#Word_Characters">additional word characters</a>.</td>
    </tr>
    <tr>
-      <td class="Decomposable">Decomposable</td>
-      <td>Characters with NFC decompositions.</td>
+      <td class="NFD-Decomposable"><a name="NFD-Decomposable">NFD-Decomposable</a></td>
+      <td>Characters with NFD (canonical) decompositions. These are broken out separately because 
+      certain spoofing techniques are applied to them <i>via their decompositions.</i></td>
    </tr>
    <tr>
-      <td class="IDN-Remapped-Case-Atomic">IDN-Remapped</td>
-      <td>Characters remapped by IDN due to case folding</td>
+      <th rowspan="4">Input</th>
+      <td class="IDN-Remapped-Case-Atomic"><a name="IDN-Remapped-Case-Atomic">
+      IDN-Remapped-Case-Atomic</a></td>
+      <td>Atomic characters remapped by IDN due to case folding [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> 
+      Section 3.2].</td>
    </tr>
    <tr>
-      <td class="IDN-Remapped-Case-Decomposable">IDN-Remapped</td>
-      <td>Characters remapped by IDN due to case folding, that are decomposable.</td>
-    </tr>
-    IDN-Remapped-Case-Decomposable
-    <tr>
-      <td class="IDN-Remapped-Compat">IDN-Remapped</td>
-      <td>Characters remapped by IDN due to compatibility mapping.</td>
+      <td class="IDN-Remapped-Case-NFD-Decomposable"><a name="IDN-Remapped-Case-NFD-Decomposable">
+      IDN-Remapped-Case-NFD-Decomposable</a></td>
+      <td>Characters that are NFD (canonical) decomposable and that are remapped by IDN due to case 
+      folding [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> Section 3.2].</td>
    </tr>
    <tr>
-      <td class="IDN-Deleted">IDN-Deleted</td>
-      <td>Characters deleted by IDN.</td>
+      <td class="IDN-Remapped-Compat"><a name="IDN-Remapped-Compat">IDN-Remapped-Compat</a></td>
+      <td>Characters remapped by IDN due to compatibility (NFKD) mapping. [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> 
+      Section 4]</td>
    </tr>
    <tr>
-      <td class="IDN-Illegal">IDN-Illegal </td>
-      <td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
+      <td class="IDN-Deleted"><a name="IDN-Deleted">IDN-Deleted</a></td>
+      <td>Characters deleted by IDN, that is, mapped to nothing [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> 
+      Section 3.1]</td>
+    </tr>
+    <tr>
+      <th>Prohibited</th>
+      <td class="IDN-Prohibited"><a name="IDN-Prohibited">IDN-Prohibited </a></td>
+      <td>Characters prohibited in IDN [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> 
+      Section 5] (Note: most of these are due to IDN&#39;s using an old version of Unicode. IDN does 
+      treat unassigned characters differently than explicitly prohibited characters, but for our 
+      purposes this distinction doesn&#39;t matter.)</td>
    </tr>
  </table>
 </blockquote>
+<h3>Additional <a name="Word_Characters">Word Characters</a></h3>
+<p>This is a draft list of characters based on <i>Section 4 Word Boundaries</i> of
+<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the 
+Word_Break property and notes at the end of the section. While not currently a part of the 
+recommended characters for programming identifiers (XID_Continue), these characters have been 
+identified as being necessary for more &quot;natural language&quot; identifiers, since some words in some 
+modern languages could not be constructed without them. See also
+<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>. 
+These characters are listed in the plain text file, as described below.</p>
+<h2>Plain-Text Version</h2>
 <p>The information in the categorization is also available in a plain-text file, at
 <a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for 
 sorting and filtering to view the data in different ways. The format is:</p>
@ -101,7 +104,22 @@ sorting and filtering to view the data in different ways. The format is:</p>
 </blockquote>
 <p><i>Examples:</i></p>
 <pre>0061          ; LATIN ; Atomic # ; L&amp; (a) LATIN SMALL LETTER A
+<code>026B          ; LATIN ; Atomic-no-uppercase # L&amp; (ɫ) LATIN SMALL LETTER L WITH MIDDLE TILDE</code>
 2015          ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
 058A          ; ARMENIAN ; Atomic-no-uppercase # ; Pd (֊) ARMENIAN HYPHEN
 20AC          ; COMMON ; Non-XID # ; Sc (€) EURO SIGN</pre>
-<h2>Categorization</h2>
+<p>At the end of <a href="idn-chars.txt">idn-chars.txt</a> is a section called ADDITIONAL WORD 
+CHARACTERS, defined as described above. Below that is a section of FOR REVIEW characters, 
+sorted by Unicode general category (an additional category of XX is added for the odd characters 
+whose names include: <span style="font-variant: small-caps">MUSICAL SYMBOL, DINGBAT, or RADICAL</span>.) 
+We need review of that list to check for characters that are needed for words in modern languages, 
+that is, that should be moved up into the ADDITIONAL WORD CHARACTERS list. Each character in the FOR 
+REVIEW list is collected because it either: </p>
+<ol>
+  <li>would not otherwise count as part of an XID, or</li>
+  <li>is part of a bicameral script and doesn&#39;t have an uppercase (eg, the situation for U+026B 
+  above)</li>
+</ol>
+<p>In either case there is prima facie reason for some level of scrutiny, if the goal is to be 
+initially conservative in repertoire.</p>
+<h2><a name="Categorization">Categorization</a></h2>