ICU-1472 fixes for halfwidth katakana (also adds iteration mark rules)

X-SVN-Rev: 6835
2001-11-13 20:03:51 +00:00 · 2001-11-13 20:03:51 +00:00 · 1abc1b2372
commit 1abc1b2372
parent 3d4e45e02d
4 changed files with 376 additions and 62 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/translit/WriteCharts.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/WriteCharts.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/WriteCharts.java,v $
- * $Date: 2001/11/13 02:50:11 $
- * $Revision: 1.6 $
+ * $Date: 2001/11/13 20:03:51 $
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
@ -21,9 +21,11 @@ import java.io.*;

 public class WriteCharts {
    public static void main(String[] args) throws IOException {
-        if (false) testSet();
+        if (false) {
+            printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
+        }
        String testSet = "";
-        if (args.length == 0) args = all;
+        if (args.length == 0) args = getAllScripts();
        for (int i = 0; i < args.length; ++i) {
    // Enumeration enum = Transliterator.getAvailableIDs();
            if (args[i].startsWith("[")) {
@ -35,8 +37,9 @@ public class WriteCharts {
        }
    }
    
-    public static void testSet() {
-        UnicodeSet s = new UnicodeSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
+    public static void printSet(String source) {
+        UnicodeSet s = new UnicodeSet(source);
+        System.out.println("Printout for '" + source + "'");
        int count = s.getRangeCount();
        for (int i = 0; i < count; ++i) {
            int start = s.getRangeStart(i);
@ -45,14 +48,63 @@ public class WriteCharts {
        }
    }
    
-    static final String[] all = {
-        "Cyrillic-Latin", "Greek-Latin", 
-        "el-Latin",
-        "Devanagari-Tamil", "Devanagari-Latin", 
-        "Katakana-Latin", "Hiragana-Latin", "Hangul-Latin"
+    public static String[] getAllScripts() {
+        Set set = new TreeSet();
+        int scripts[];
+        Enumeration sources = Transliterator.getAvailableSources();
+        while(sources.hasMoreElements()) {
+            String source = (String) sources.nextElement();
+            scripts = UScript.getCode(source);
+            int sourceScript = scripts[0];
+            if (sourceScript == UScript.INVALID_CODE) {
+                System.out.println("[Skipping " + source + "]");
+                continue;
+            }
+            System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
+            Enumeration targets = Transliterator.getAvailableTargets(source);
+            while(targets.hasMoreElements()) {
+                String target = (String) targets.nextElement();
+                scripts = UScript.getCode(target);
+                int targetScript = scripts[0];
+                if (targetScript == UScript.INVALID_CODE
+                        || targetScript < sourceScript) {
+                    // skip targets with no valid script code, and avoid doing both directions of a pair
+                    System.out.println("[Skipping '" + source + "-" + target + "']");
+                    continue;
+                }
+                System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
+                Enumeration variants = Transliterator.getAvailableVariants(source, target);
+                while(variants.hasMoreElements()) {
+                    String variant = (String) variants.nextElement();
+                    String id = source + "-" + target;
+                    if (variant.length() != 0) {
+                        id += "/" + variant;
+                        if (true) {
+                            System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
+                            continue;
+                        }
+                    }
+                    System.out.println("\t\t\t\tAdding: '" + id + "'");
+                    set.add(id);
+                }
+            }
+        }
+        String[] results = new String[set.size()];
+        set.toArray(results);
+        return results;
    };
    
+    public static String showScripts(int[] scripts) {
+        StringBuffer results = new StringBuffer();
+        for (int i = 0; i < scripts.length; ++i) {
+            if (i != 0) results.append(", ");
+            results.append(UScript.getName(scripts[i]));
+        }
+        return results.toString();
+    }
+    
    public static void print(String testSet, String rawId) throws IOException {
+        System.out.println("Processing " + rawId);
        Transliterator t = Transliterator.getInstance(rawId);
        String id = t.getID();
        
@ -73,11 +125,15 @@ public class WriteCharts {
                return;
            } else {
                testSet = "[:" + source + ":]";
+                if (source.equalsIgnoreCase("katakana")) {
+                    testSet = "[" + testSet + "\u30FC]";
+                    printSet(testSet);
+                }
            }
        }
        UnicodeSet sourceSet = new UnicodeSet(testSet);

-        // check that the source is a script
+        // check that the target is a script
        int[] scripts = UScript.getCode(target);
        if (scripts.length != 1) {
            target = "[:Latin:]";
@ -88,7 +144,7 @@ public class WriteCharts {
        
        Transliterator inverse = t.getInverse();
        
-        Transliterator hex = Transliterator.getInstance("Any-Hex");
+        //Transliterator hex = Transliterator.getInstance("Any-Hex");
        
                
        // iterate through script
@ -133,12 +189,41 @@ public class WriteCharts {
                    
                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.DECOMP_COMPAT, 0))
                        + "\u0000" + ss, 
-                    "<tr><td>" + ss + "<br><tt>" + hex.transliterate(ss) + "</tt></td><td>"
-                        + ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
-                        + rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>" );
+                    "<td class='s'>" + ss + "<br><tt>" + hex(ss)
+                        + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
+                        + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
+                
+                // Check Duals
+                /*
+                int maxDual = 200;
+              dual:
+                for (int i2 = 0; i2 < count; ++i2) {
+                    int end2 = sourceSet.getRangeEnd(i2);
+                    for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
+                        String ss2 = UTF16.valueOf(j2);
+                        String ts2 = t.transliterate(ss2);
+                        String rt2 = inverse.transliterate(ts2);
+                        
+                        String ss12 = ss + ss2;
+                        String ts12 = t.transliterate(ss + ss12);
+                        String rt12 = inverse.transliterate(ts12);
+                        if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;   
+                        if (--maxDual < 0) break dual;
+                        
+                        // transliteration of whole differs from that of parts
+                        group = 0x100;
+                        map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
+                                + "\u0000" + ss12, 
+                            "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
+                                + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
+                                + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
+                    }
+                }
+                */
            }
        }
        
+        
        leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
        
        count = leftOverSet.getRangeCount();
@ -161,13 +246,14 @@ public class WriteCharts {
                }
                    
                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0)) + ts, 
-                    "<tr><td>-</td><td>" + ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
-                    + rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>");
+                    "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
+                    + "</tt></td><td class='r'>"
+                    + rt + "<br><tt>" + hex(rt) + "</tt></td>");
            }
        }

        // make file name and open
-        File f = new File("chart_" + id.replace('/', '_') + ".html");
+        File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
        String filename = f.getCanonicalFile().toString();
        PrintWriter out = new PrintWriter(
            new OutputStreamWriter(
@ -183,35 +269,64 @@ public class WriteCharts {
            out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
            
            out.println("<BODY>");
-            String tableHeader = "<p><table border='1'><tr><th>Source</th><th>Target</th><th>Return</th></tr>";
+            out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
+            out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
+            out.println("The samples are mechanically generated, and only include single characters");
+            out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
+            out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
+            out.println("<a href='http://oss.software.ibm.com/cgi-bin/icu/tr'>Transliteration Demo</a></p><hr>");
+            
+            // set up the headers
+            int columnCount = 3;
+            String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
+            String headers = headerBase;
+            for (int i = columnCount - 1; i > 0; --i) headers += headerBase;
+            
+            String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
            String tableFooter = "</table></p>";
-            out.println("<h1>Round Trip</h1>");
+            out.println("<h2>Round Trip</h2>");
            out.println(tableHeader);
            
            Iterator it = map.keySet().iterator();
            char lastGroup = 0;
            count = 0;
+            int column = 0;
            while (it.hasNext()) {
                String key = (String) it.next();
                char group = key.charAt(0);
                if (group != lastGroup || count++ > 50) {
                    lastGroup = group;
                    count = 0;
+                    if (column != 0) {
+                        out.println("</tr>");
+                        column = 0;
+                    }
                    out.println(tableFooter);
                    
                    String title = "";
-                    if ((group & 0x80) != 0) out.println("<hr><h1>Completeness</h1>");
-                    else out.println("<hr><h1>Round Trip</h1>");
-                    if ((group & 16) != 0) out.println("<h2>Errors: Contains Private Use Characters</h2>");
-                    if ((group & 8) != 0) out.println("<h2>Possible Errors: Return not in Source Set</h2>");
-                    if ((group & 4) != 0) out.println("<h2>Errors: Return not equal to Source</h2>");
-                    if ((group & 2) != 0) out.println("<h2>Errors: Return not in Source Set</h2>");
-                    if ((group & 1) != 0) out.println("<h2>Errors: Target not in Target Set</h2>");
+                    if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
+                    else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
+                    else out.println("<hr><h2>Round Trip</h2>");
+                    if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
+                    if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
+                    if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
+                    if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
+                    if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
                                        
                    out.println(tableHeader);
+                    column = 0;
                }
                String value = (String) map.get(key);
+                if (column++ == 0) out.print("<tr>");
                out.println(value);
+                if (column == 3) {
+                    out.println("</tr>");
+                    column = 0;
+                }
+            }
+            if (column != 0) {
+                out.println("</tr>");
+                column = 0;
            }
            out.println(tableFooter + "</BODY></HTML>");
            
@ -220,6 +335,17 @@ public class WriteCharts {
        }
    }
    
+    public static String hex(String s) {
+        int cp;
+        StringBuffer results = new StringBuffer();
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
+            if (i != 0) results.append(' ');
+            results.append(Integer.toHexString(cp));
+        }
+        return results.toString().toUpperCase();
+    }
+    
    static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
    
    /*
--- a/icu4j/src/com/ibm/icu/impl/data/Transliterator_Latin_Katakana.txt
+++ b/icu4j/src/com/ibm/icu/impl/data/Transliterator_Latin_Katakana.txt
@ -3,8 +3,8 @@
 # Corporation and others. All Rights Reserved.
 #--------------------------------------------------------------------
 # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Latin_Katakana.txt,v $
-# $Date: 2001/11/13 02:57:28 $
-# $Revision: 1.10 $
+# $Date: 2001/11/13 20:03:22 $
+# $Revision: 1.11 $
 #--------------------------------------------------------------------

 #:: [\u0020-\u00FF [:Latin:][:Mark:]] ;   # note: a global filter is more efficient, but MUST include all source chars
@ -62,6 +62,7 @@
 # Variables

 $vowel = [aeiou] ;
+$consonant = [bcdfghjklmnpqrstvwxyz] ;
 $macron = \u0304 ;

 # Variables used for doubled-consonants with tsu
@ -387,6 +388,30 @@ $macron <> ー ;
 '~ye' > ェ ;
 '~yo' <> ョ ;

+# iteration marks
+# TODO: make more accurate
+
+j $1 < sh (y* $vowel) {ヽ$voice ;
+dj $1 < ch (y* $vowel) {ヽ$voice ;
+dz $1 < ts (y* $vowel) {ヽ$voice ;
+
+g $1 < k (y* $vowel) {ヽ$voice ;
+z $1 < s (y* $vowel) {ヽ$voice ;
+d $1 < t (y* $vowel) {ヽ$voice ;
+h $1 < b (y* $vowel) {ヽ$voice ;
+v $1 < w (y* $vowel) {ヽ$voice ;
+
+sh $1 < sh (y* $vowel) {ヽ$voice ;
+j $1 < j (y* $vowel) {ヽ$voice ;
+ch $1 < ch (y* $vowel) {ヽ$voice ;
+dj $1 < dj(y* $vowel) {ヽ$voice ;
+ts $1 < ts (y* $vowel) {ヽ$voice ;
+dz $1 < dz (y* $vowel) {ヽ$voice ;
+
+$1 < ($consonant y* $vowel) {ヽ$voice? ;
+$1 < (.) {ヽ $voice? ; # otherwise repeat last character
+ < ヽ $voice? ; # delete if no characters found
+
 # h- rule: lengthens vowel if not followed by a vowel

 [aeiou] } h > ー ;
@ -435,6 +460,12 @@ f > フ;
 j  > ジ;
 w > ウ;

+ß > | ss ;
+æ > | e ;
+ð > | d ;
+ø > | u ;
+þ > | th ;
+
 # simple substitutions using backup

 c > | k ;
@ -449,7 +480,7 @@ x > | ks ;

 '~' > ; # delete stray tildes between letters
 [:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
-[[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
+[\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete U+02BE and any non-spacing marks that we didn't use

 :: NFC (NFKD) ;  # use NFKD to get the halfwidth katakana characters

--- a/icu4j/src/com/ibm/test/translit/WriteCharts.java
+++ b/icu4j/src/com/ibm/test/translit/WriteCharts.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/WriteCharts.java,v $
- * $Date: 2001/11/13 02:50:11 $
- * $Revision: 1.6 $
+ * $Date: 2001/11/13 20:03:51 $
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
@ -21,9 +21,11 @@ import java.io.*;

 public class WriteCharts {
    public static void main(String[] args) throws IOException {
-        if (false) testSet();
+        if (false) {
+            printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
+        }
        String testSet = "";
-        if (args.length == 0) args = all;
+        if (args.length == 0) args = getAllScripts();
        for (int i = 0; i < args.length; ++i) {
    // Enumeration enum = Transliterator.getAvailableIDs();
            if (args[i].startsWith("[")) {
@ -35,8 +37,9 @@ public class WriteCharts {
        }
    }
    
-    public static void testSet() {
-        UnicodeSet s = new UnicodeSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
+    public static void printSet(String source) {
+        UnicodeSet s = new UnicodeSet(source);
+        System.out.println("Printout for '" + source + "'");
        int count = s.getRangeCount();
        for (int i = 0; i < count; ++i) {
            int start = s.getRangeStart(i);
@ -45,14 +48,63 @@ public class WriteCharts {
        }
    }
    
-    static final String[] all = {
-        "Cyrillic-Latin", "Greek-Latin", 
-        "el-Latin",
-        "Devanagari-Tamil", "Devanagari-Latin", 
-        "Katakana-Latin", "Hiragana-Latin", "Hangul-Latin"
+    public static String[] getAllScripts() {
+        Set set = new TreeSet();
+        int scripts[];
+        Enumeration sources = Transliterator.getAvailableSources();
+        while(sources.hasMoreElements()) {
+            String source = (String) sources.nextElement();
+            scripts = UScript.getCode(source);
+            int sourceScript = scripts[0];
+            if (sourceScript == UScript.INVALID_CODE) {
+                System.out.println("[Skipping " + source + "]");
+                continue;
+            }
+            System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
+            Enumeration targets = Transliterator.getAvailableTargets(source);
+            while(targets.hasMoreElements()) {
+                String target = (String) targets.nextElement();
+                scripts = UScript.getCode(target);
+                int targetScript = scripts[0];
+                if (targetScript == UScript.INVALID_CODE
+                        || targetScript < sourceScript) {
+                    // skip targets with no valid script code, and avoid doing both directions of a pair
+                    System.out.println("[Skipping '" + source + "-" + target + "']");
+                    continue;
+                }
+                System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
+                Enumeration variants = Transliterator.getAvailableVariants(source, target);
+                while(variants.hasMoreElements()) {
+                    String variant = (String) variants.nextElement();
+                    String id = source + "-" + target;
+                    if (variant.length() != 0) {
+                        id += "/" + variant;
+                        if (true) {
+                            System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
+                            continue;
+                        }
+                    }
+                    System.out.println("\t\t\t\tAdding: '" + id + "'");
+                    set.add(id);
+                }
+            }
+        }
+        String[] results = new String[set.size()];
+        set.toArray(results);
+        return results;
    };
    
+    public static String showScripts(int[] scripts) {
+        StringBuffer results = new StringBuffer();
+        for (int i = 0; i < scripts.length; ++i) {
+            if (i != 0) results.append(", ");
+            results.append(UScript.getName(scripts[i]));
+        }
+        return results.toString();
+    }
+    
    public static void print(String testSet, String rawId) throws IOException {
+        System.out.println("Processing " + rawId);
        Transliterator t = Transliterator.getInstance(rawId);
        String id = t.getID();
        
@ -73,11 +125,15 @@ public class WriteCharts {
                return;
            } else {
                testSet = "[:" + source + ":]";
+                if (source.equalsIgnoreCase("katakana")) {
+                    testSet = "[" + testSet + "\u30FC]";
+                    printSet(testSet);
+                }
            }
        }
        UnicodeSet sourceSet = new UnicodeSet(testSet);

-        // check that the source is a script
+        // check that the target is a script
        int[] scripts = UScript.getCode(target);
        if (scripts.length != 1) {
            target = "[:Latin:]";
@ -88,7 +144,7 @@ public class WriteCharts {
        
        Transliterator inverse = t.getInverse();
        
-        Transliterator hex = Transliterator.getInstance("Any-Hex");
+        //Transliterator hex = Transliterator.getInstance("Any-Hex");
        
                
        // iterate through script
@ -133,12 +189,41 @@ public class WriteCharts {
                    
                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.DECOMP_COMPAT, 0))
                        + "\u0000" + ss, 
-                    "<tr><td>" + ss + "<br><tt>" + hex.transliterate(ss) + "</tt></td><td>"
-                        + ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
-                        + rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>" );
+                    "<td class='s'>" + ss + "<br><tt>" + hex(ss)
+                        + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
+                        + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
+                
+                // Check Duals
+                /*
+                int maxDual = 200;
+              dual:
+                for (int i2 = 0; i2 < count; ++i2) {
+                    int end2 = sourceSet.getRangeEnd(i2);
+                    for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
+                        String ss2 = UTF16.valueOf(j2);
+                        String ts2 = t.transliterate(ss2);
+                        String rt2 = inverse.transliterate(ts2);
+                        
+                        String ss12 = ss + ss2;
+                        String ts12 = t.transliterate(ss + ss12);
+                        String rt12 = inverse.transliterate(ts12);
+                        if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;   
+                        if (--maxDual < 0) break dual;
+                        
+                        // transliteration of whole differs from that of parts
+                        group = 0x100;
+                        map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
+                                + "\u0000" + ss12, 
+                            "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
+                                + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
+                                + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
+                    }
+                }
+                */
            }
        }
        
+        
        leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
        
        count = leftOverSet.getRangeCount();
@ -161,13 +246,14 @@ public class WriteCharts {
                }
                    
                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0)) + ts, 
-                    "<tr><td>-</td><td>" + ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
-                    + rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>");
+                    "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
+                    + "</tt></td><td class='r'>"
+                    + rt + "<br><tt>" + hex(rt) + "</tt></td>");
            }
        }

        // make file name and open
-        File f = new File("chart_" + id.replace('/', '_') + ".html");
+        File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
        String filename = f.getCanonicalFile().toString();
        PrintWriter out = new PrintWriter(
            new OutputStreamWriter(
@ -183,35 +269,64 @@ public class WriteCharts {
            out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
            
            out.println("<BODY>");
-            String tableHeader = "<p><table border='1'><tr><th>Source</th><th>Target</th><th>Return</th></tr>";
+            out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
+            out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
+            out.println("The samples are mechanically generated, and only include single characters");
+            out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
+            out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
+            out.println("<a href='http://oss.software.ibm.com/cgi-bin/icu/tr'>Transliteration Demo</a></p><hr>");
+            
+            // set up the headers
+            int columnCount = 3;
+            String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
+            String headers = headerBase;
+            for (int i = columnCount - 1; i > 0; --i) headers += headerBase;
+            
+            String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
            String tableFooter = "</table></p>";
-            out.println("<h1>Round Trip</h1>");
+            out.println("<h2>Round Trip</h2>");
            out.println(tableHeader);
            
            Iterator it = map.keySet().iterator();
            char lastGroup = 0;
            count = 0;
+            int column = 0;
            while (it.hasNext()) {
                String key = (String) it.next();
                char group = key.charAt(0);
                if (group != lastGroup || count++ > 50) {
                    lastGroup = group;
                    count = 0;
+                    if (column != 0) {
+                        out.println("</tr>");
+                        column = 0;
+                    }
                    out.println(tableFooter);
                    
                    String title = "";
-                    if ((group & 0x80) != 0) out.println("<hr><h1>Completeness</h1>");
-                    else out.println("<hr><h1>Round Trip</h1>");
-                    if ((group & 16) != 0) out.println("<h2>Errors: Contains Private Use Characters</h2>");
-                    if ((group & 8) != 0) out.println("<h2>Possible Errors: Return not in Source Set</h2>");
-                    if ((group & 4) != 0) out.println("<h2>Errors: Return not equal to Source</h2>");
-                    if ((group & 2) != 0) out.println("<h2>Errors: Return not in Source Set</h2>");
-                    if ((group & 1) != 0) out.println("<h2>Errors: Target not in Target Set</h2>");
+                    if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
+                    else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
+                    else out.println("<hr><h2>Round Trip</h2>");
+                    if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
+                    if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
+                    if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
+                    if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
+                    if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
                                        
                    out.println(tableHeader);
+                    column = 0;
                }
                String value = (String) map.get(key);
+                if (column++ == 0) out.print("<tr>");
                out.println(value);
+                if (column == 3) {
+                    out.println("</tr>");
+                    column = 0;
+                }
+            }
+            if (column != 0) {
+                out.println("</tr>");
+                column = 0;
            }
            out.println(tableFooter + "</BODY></HTML>");
            
@ -220,6 +335,17 @@ public class WriteCharts {
        }
    }
    
+    public static String hex(String s) {
+        int cp;
+        StringBuffer results = new StringBuffer();
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
+            if (i != 0) results.append(' ');
+            results.append(Integer.toHexString(cp));
+        }
+        return results.toString().toUpperCase();
+    }
+    
    static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
    
    /*
--- a/icu4j/src/com/ibm/text/resources/Transliterator_Latin_Katakana.txt
+++ b/icu4j/src/com/ibm/text/resources/Transliterator_Latin_Katakana.txt
@ -3,8 +3,8 @@
 # Corporation and others. All Rights Reserved.
 #--------------------------------------------------------------------
 # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/Transliterator_Latin_Katakana.txt,v $
-# $Date: 2001/11/13 02:57:28 $
-# $Revision: 1.10 $
+# $Date: 2001/11/13 20:03:22 $
+# $Revision: 1.11 $
 #--------------------------------------------------------------------

 #:: [\u0020-\u00FF [:Latin:][:Mark:]] ;   # note: a global filter is more efficient, but MUST include all source chars
@ -62,6 +62,7 @@
 # Variables

 $vowel = [aeiou] ;
+$consonant = [bcdfghjklmnpqrstvwxyz] ;
 $macron = \u0304 ;

 # Variables used for doubled-consonants with tsu
@ -387,6 +388,30 @@ $macron <> ー ;
 '~ye' > ェ ;
 '~yo' <> ョ ;

+# iteration marks
+# TODO: make more accurate
+
+j $1 < sh (y* $vowel) {ヽ$voice ;
+dj $1 < ch (y* $vowel) {ヽ$voice ;
+dz $1 < ts (y* $vowel) {ヽ$voice ;
+
+g $1 < k (y* $vowel) {ヽ$voice ;
+z $1 < s (y* $vowel) {ヽ$voice ;
+d $1 < t (y* $vowel) {ヽ$voice ;
+h $1 < b (y* $vowel) {ヽ$voice ;
+v $1 < w (y* $vowel) {ヽ$voice ;
+
+sh $1 < sh (y* $vowel) {ヽ$voice ;
+j $1 < j (y* $vowel) {ヽ$voice ;
+ch $1 < ch (y* $vowel) {ヽ$voice ;
+dj $1 < dj(y* $vowel) {ヽ$voice ;
+ts $1 < ts (y* $vowel) {ヽ$voice ;
+dz $1 < dz (y* $vowel) {ヽ$voice ;
+
+$1 < ($consonant y* $vowel) {ヽ$voice? ;
+$1 < (.) {ヽ $voice? ; # otherwise repeat last character
+ < ヽ $voice? ; # delete if no characters found
+
 # h- rule: lengthens vowel if not followed by a vowel

 [aeiou] } h > ー ;
@ -435,6 +460,12 @@ f > フ;
 j  > ジ;
 w > ウ;

+ß > | ss ;
+æ > | e ;
+ð > | d ;
+ø > | u ;
+þ > | th ;
+
 # simple substitutions using backup

 c > | k ;
@ -449,7 +480,7 @@ x > | ks ;

 '~' > ; # delete stray tildes between letters
 [:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
-[[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
+[\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete U+02BE and any non-spacing marks that we didn't use

 :: NFC (NFKD) ;  # use NFKD to get the halfwidth katakana characters