Incorporated CEDICT and EDICT data for generating transliterators

X-SVN-Rev: 9278
2002-07-21 08:43:39 +00:00 · 2002-07-21 08:43:39 +00:00 · fd17229533
commit fd17229533
parent bdc6d957c4
2 changed files with 412 additions and 173 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
-* $Date: 2002/07/14 22:04:49 $
-* $Revision: 1.6 $
+* $Date: 2002/07/21 08:43:39 $
+* $Revision: 1.7 $
 *
 *******************************************************************************
 */
@ -14,14 +14,23 @@
 package com.ibm.text.UCD;
 import java.io.*;
 import com.ibm.text.utility.*;
+
 import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.Replaceable;
+import com.ibm.icu.text.ReplaceableString;
+import com.ibm.icu.text.UnicodeMatcher;
+
+
 import java.util.*;


 public final class GenerateHanTransliterator implements UCD_Types {
    
+    static final boolean DISAMBIG = false;
+    static final boolean DEBUG = false;
+    
    static class HanInfo {
        int count = 0;
        int minLen = Integer.MAX_VALUE;
@ -237,45 +246,46 @@ public final class GenerateHanTransliterator implements UCD_Types {
    	Default.setUCD();
        try {
            System.out.println("Starting");
-            log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
-            err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
-            log.print('\uFEFF');
+            System.out.println("Quoting: " + quoteNonLetters.toRules(true));
+            System.out.println("Quoting: " + quoteNonLetters.toRules(true));
+            
            
            String key; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
-            String filter; // "kJis0";
            String filename;
            
            switch (type) {
                case DEFINITION:
                    key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
-                    filter = null; // "kJis0";
-                    filename = "Transliterator_Han_Latin_Definition.txt";
+                    filename = "Raw_Transliterator_Han_Latin_Definition.txt";
                    break;
                case JAPANESE: 
                    key = "kJapaneseOn";
-                    filter = null; // "kJis0";
-                    filename = "Transliterator_ja_Latin.txt";
+                    filename = "Raw_Transliterator_ja_Latin.txt";
                    break;
                case CHINESE:
                    key = "kMandarin";
-                    filename = "Transliterator_Han_Latin.txt";
-                    filter = null;
+                    filename = "Raw_Transliterator_Han_Latin.txt";
                    break;
                default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
            }
                
-            if (type == DEFINITION) readCDICTDefinitions();
-            readUnihanData(key, filter);
+            log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
+            err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
+            log.print('\uFEFF');
+            
+            readUnihanData(key);
+            readCDICTDefinitions(type);
            
            if (false) {
                readCDICT();
                compareUnihanWithCEDICT();
            }
            
-            readFrequencyData();
+            readFrequencyData(type);
            
            out = Utility.openPrintWriter(filename, false, false);
-            out.println("# Convert CJK characters");
+            out.println("# Start RAW data for converting CJK characters");
+            /*
            out.println("# Note: adds space between them and letters.");
            out.println("{ ([:Han:]) } [:L:] > | $1 ' ';");
            out.println("[\\.\\,\\?\\!\uFF0E\uFF0C\uFF1F\uFF01\u3001\u3002[:Pe:][:Pf:]] { } [:L:] > ' ';");
@ -288,41 +298,65 @@ public final class GenerateHanTransliterator implements UCD_Types {
                out.println("[:hiragana:] { } [[:L:]-[:hiragana:]] > ' ';");
                out.println("[[:L:]-[:hiragana:]] { } [:hiragana:]> ' ';");
            }
-                        
+            */
+            
            Set gotAlready = new HashSet();
            Iterator it = rankList.iterator();
            Set lenSet = new TreeSet();
            Set backSet = new TreeSet();
            int rank = 0;
+            Map definitionCount = new HashMap();
+            
+            
            while (it.hasNext()) {
-                Comparable keyChar = (Comparable) it.next();
-                Comparable def = (Comparable) unihanMap.get(keyChar);
+                String keyChar = (String) it.next();
+                String def = (String) unihanMap.get(keyChar);
                if (def == null) continue; // skipping
                // sort longer definitions first!
+                
+                Integer countInteger = (Integer) definitionCount.get(def);
+                int defCount = (countInteger == null) ? 0 : countInteger.intValue();
+                String oldDef = def;
+                if (DISAMBIG && (defCount != 0 || def.indexOf(' ') >= 0)) {
+                    def += " " + toSub.transliterate(String.valueOf(defCount));
+                }
+                
                lenSet.add(new Pair(
-                    new Pair(new Integer(-keyChar.toString().length()), 
-                        new Pair(new Integer(-def.toString().length()), new Integer(rank++))),
+                    new Pair(new Integer(-UTF16.countCodePoint(keyChar)), 
+                        new Pair(new Integer(-def.length()), new Integer(rank++))),
                    new Pair(keyChar, def)));
                backSet.add(new Pair(
                    new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
                    new Pair(keyChar, def)));
+                    
+                definitionCount.put(oldDef, new Integer(defCount+1));
                gotAlready.add(keyChar);
            }
            
            // add the ones that are not ranked!
            it = unihanMap.keySet().iterator();
            while (it.hasNext()) {
-                Comparable keyChar = (Comparable) it.next();
+                String keyChar = (String) it.next();
                if (gotAlready.contains(keyChar)) continue;
                
-                Comparable def = (Comparable) unihanMap.get(keyChar);
+                String def = (String) unihanMap.get(keyChar);
+
+                Integer countInteger = (Integer) definitionCount.get(def);
+                int defCount = (countInteger == null) ? 0 : countInteger.intValue();
+                String oldDef = def;
+                if (DISAMBIG && (defCount != 0 || def.indexOf(' ') >= 0)) {
+                    def += " " + toSub.transliterate(String.valueOf(defCount));
+                }
+                
                lenSet.add(new Pair(
-                    new Pair(new Integer(-keyChar.toString().length()), 
+                    new Pair(new Integer(-UTF16.countCodePoint(keyChar)), 
                        new Pair(new Integer(-def.toString().length()), new Integer(rank++))),
                    new Pair(keyChar, def)));
                backSet.add(new Pair(
                    new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
                    new Pair(keyChar, def)));
+
+                definitionCount.put(oldDef, new Integer(defCount+1));
            }
            
            // First, find the ones that we want a definition for, based on the ranking
@ -358,27 +392,33 @@ public final class GenerateHanTransliterator implements UCD_Types {
                String keyChar = (String) p.first; 
                String def = (String) p.second;
                String rel = doReverse.contains(keyChar) ? " <> " : " > ";
-                out.println(quoteNonLetters.transliterate(keyChar) + rel + quoteNonLetters.transliterate(def) + ";");
-                //if (TESTING) System.out.println("# " + code + " > " + definition);
+                
+                out.println(quoteNonLetters.transliterate(keyChar) + rel
+                    + quoteNonLetters.transliterate(def) + ";");
+                    //if (TESTING) System.out.println("# " + code + " > " + definition);
            }
            
            out.println("\u3002 <> '.';");
+            out.println("# End RAW data for converting CJK characters");
+            
+            /*
            if (type == JAPANESE) {
                out.println(":: katakana-latin;");
                out.println(":: hiragana-latin;");
            }
            out.println(":: fullwidth-halfwidth ();");
-
+            */
            
            
            System.out.println("Total: " + totalCount);
            System.out.println("Defined Count: " + count);
+            
        } catch (Exception e) {
            System.out.println("Exception: " + e);
        } finally {
            if (log != null) log.close();
-            if (out != null) out.close();
            if (err != null) err.close();
+            if (out != null) out.close();
        }
    }
    
@ -390,7 +430,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
    static int totalCount;
    static int oldLine;
    
-    static void readFrequencyData() throws java.io.IOException {
+    static void readFrequencyData(int type) throws java.io.IOException {
        String line = "";
        try {
            
@ -400,61 +440,68 @@ public final class GenerateHanTransliterator implements UCD_Types {
            // 1 ? 17176
            
            Set combinedRank = new TreeSet();
-            
-            System.out.println("Reading chinese_frequency.txt");
-            BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", true);
+            BufferedReader br;
            int counter = 0;
-            while (true) {
-                line = Utility.readDataLine(br);
-                if (line == null) break;
-                if (line.length() == 0) continue;
-                Utility.dot(counter++);
-                int tabPos = line.indexOf('\t');
-                int rank = Integer.parseInt(line.substring(0,tabPos));
-                int cp = line.charAt(tabPos+1);
-                //if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
-                combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp)));
-            }
-            br.close();
+            Iterator it;
            
-            System.out.println("Reading japanese_frequency.txt");
-     
-            br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", true);
-            Map japaneseMap = new HashMap();
-            while (true) {
-                line = Utility.readDataLine(br);
-                if (line == null) break;
-                if (line.length() == 0) continue;
-                Utility.dot(counter++);
-                int tabPos = line.indexOf(' ');
-                
-                int tabPos2 = line.indexOf(' ', tabPos+1);
-                int freq = Integer.parseInt(line.substring(tabPos2+1));
-                
-                for (int i = tabPos+1; i < tabPos2; ++i) {
-                    int cp = line.charAt(i);
-                    int script = Default.ucd.getScript(cp);
-                    if (script != HAN_SCRIPT) {
-                        if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT) {
-                            System.out.println("Huh: " + Default.ucd.getCodeAndName(cp));
-                        }
-                        continue;
-                    }
-                    // if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
-                    Utility.addCount(japaneseMap, UTF16.valueOf(cp), -freq);
+            if (type == CHINESE) {
+                System.out.println("Reading chinese_frequency.txt");
+                br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", true);
+                counter = 0;
+                while (true) {
+                    line = Utility.readDataLine(br);
+                    if (line == null) break;
+                    if (line.length() == 0) continue;
+                    Utility.dot(counter++);
+                    int tabPos = line.indexOf('\t');
+                    int rank = Integer.parseInt(line.substring(0,tabPos));
+                    int cp = line.charAt(tabPos+1);
+                    //if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
+                    combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp)));
                }
+                br.close();
            }
-            br.close();
            
-            // get rank order japanese
-            Iterator it = japaneseMap.keySet().iterator();
-            int countJapanese = 0;
-            while (it.hasNext()) {
-                Comparable key = (Comparable) it.next();
-                Comparable val = (Comparable) japaneseMap.get(key);
-                combinedRank.add(new Pair(new Integer(++countJapanese), key));
+            if (type == JAPANESE) {
+                System.out.println("Reading japanese_frequency.txt");
+         
+                br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", true);
+                Map japaneseMap = new HashMap();
+                while (true) {
+                    line = Utility.readDataLine(br);
+                    if (line == null) break;
+                    if (line.length() == 0) continue;
+                    Utility.dot(counter++);
+                    int tabPos = line.indexOf(' ');
+                    
+                    int tabPos2 = line.indexOf(' ', tabPos+1);
+                    int freq = Integer.parseInt(line.substring(tabPos2+1));
+                    
+                    for (int i = tabPos+1; i < tabPos2; ++i) {
+                        int cp = line.charAt(i);
+                        int script = Default.ucd.getScript(cp);
+                        if (script != HAN_SCRIPT) {
+                            if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT) {
+                                System.out.println("Huh: " + Default.ucd.getCodeAndName(cp));
+                            }
+                            continue;
+                        }
+                        // if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
+                        Utility.addCount(japaneseMap, UTF16.valueOf(cp), -freq);
+                    }
+                }
+                br.close();
+                // get rank order japanese
+                it = japaneseMap.keySet().iterator();
+                int countJapanese = 0;
+                while (it.hasNext()) {
+                    Comparable key = (Comparable) it.next();
+                    Comparable val = (Comparable) japaneseMap.get(key);
+                    combinedRank.add(new Pair(new Integer(++countJapanese), key));
+                }
+     
            }
- 
+            
            
            int overallRank = 0;
            it = combinedRank.iterator();
@ -582,12 +629,16 @@ public final class GenerateHanTransliterator implements UCD_Types {
    
    // form: ???? [ai4 wu1 ji2 wu1] /love me/love my dog/
    
-    static void readCDICTDefinitions() throws IOException {
-        System.out.println("Reading cdict.txt");
-        BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true);
+    static void readCDICTDefinitions(int type) throws IOException {
+        String fname = "cdict.txt";
+        if (type == JAPANESE) fname = "edict.txt";
+        
+        System.out.println("Reading " + fname);
+        BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true);
        int counter = 0;
        String[] pieces = new String[50];
        String line = "";
+        String definition;
        try {
            while (true) {
                line = Utility.readDataLine(br);
@ -597,18 +648,26 @@ public final class GenerateHanTransliterator implements UCD_Types {
                
                
                int pinyinStart = line.indexOf('[');
-                String word = line.substring(0,pinyinStart).trim();
                int pinyinEnd = line.indexOf(']', pinyinStart+1);
                int defStart = line.indexOf('/', pinyinEnd+1);
                int defEnd = line.indexOf('/', defStart+1);
-                String definition = fixDefinition(line.substring(defStart+1, defEnd), line);
-                // word might have / in it, so do each part separately
-                int wordSlash = word.indexOf('/');
-                if (wordSlash < 0) {
+                
+                int firstData = pinyinStart >= 0 ? pinyinStart : defStart;
+                
+                String word = line.substring(0,firstData).trim();
+                
+                if (type == DEFINITION) {
+                    definition = fixDefinition(line.substring(defStart+1, defEnd), line);
                    addCheck(word, definition, line);
-                } else {
-                    addCheck(word.substring(0, wordSlash), definition, line);
-                    addCheck(word.substring(wordSlash+1), definition, line);
+                } else if (pinyinStart >= 0) {
+                    definition = line.substring(pinyinStart+1, pinyinEnd).trim();
+                    if (type == JAPANESE) {
+                        processEdict(word, definition, line);
+                    } else {
+                        definition = convertPinyin.transliterate(definition);
+                        //definition = Utility.replace(definition, " ", "\\ ");
+                        addCheck(word, definition, line);
+                    }
                }
            }
            br.close();
@ -617,10 +676,204 @@ public final class GenerateHanTransliterator implements UCD_Types {
        }
    }
    
+    /**
+     * Breaks one EDICT entry into a rule per non-kana (Kanji) run and registers each through addCheck.
+     * Words have the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH (C = CJK, H = Hiragana, K = Katakana),
+     * and we want to break those up into the following rules:
+     *   { CCC } HHHKKKCCCHH => HHH
+     *   CCCHHHKKK { CC } HHCCH => HH
+     *   CCCHHHKKKCCHH { CC } H => HH
+     * @param word the headword; may mix Kanji runs with kana
+     * @param definition the bracketed reading taken from the entry, matched against the kana in word
+     * @param line the raw dictionary line, handed on to addCheck
+     */
+    static void processEdict(String word, String definition, String line) {
+        int[] offset = {0};
+        int[] offset2 = {0};
+        // each run is at least one code unit and runs are separated by kana, so length+1 rows always suffice
+        int[][] pairList = new int[word.length() + 1][2];
+        int pairCount = 0;
+        // first gather the information as to where the CJK blocks are
+        // do this all at once, so we can refer to stuff ahead of us
+        while (true) {
+            // find next CJK block
+            // where CJK really means anything but kana
+            int found = find(word, kana, offset, offset2, word.length(), false, false);
+            if (found == UnicodeMatcher.U_MISMATCH) break; // we are done.
+            pairList[pairCount][0] = offset[0];
+            pairList[pairCount++][1] = offset2[0];
+            offset[0] = offset2[0]; // get ready for the next one
+        }
+        
+        // IF we only got one CJK block, and it goes from the start to the end, then just do it.
+        if (pairCount == 1 && pairList[0][0] == 0 && pairList[0][1] == word.length()) {
+            addCheck(word, kanaToLatin.transliterate(definition), line);
+            return;
+        }
+        
+        // IF we didn't find any Kanji, bail.
+        if (pairCount < 1) {
+            System.out.println("No Kanji on line, skipping");
+            System.out.println(hex.transliterate(word) + " > " + hex.transliterate(definition)
+                + ", " + kanaToLatin.transliterate(definition));
+            return;
+        }
+        
+        // Now generate the rules
+        
+        if (DEBUG && pairCount > 1) {
+            System.out.println("Paircount: " + pairCount);
+            System.out.println("\t" + hex.transliterate(word) + " > " + hex.transliterate(definition) + ", " + kanaToLatin.transliterate(definition));
+        }
+        
+        pairList[pairCount][0] = word.length(); // to make the algorithm easier, we add a termination
+        int delta = 0; // the current difference in positions between the definition and the word
+        
+        for (int i = 0; i < pairCount; ++i) {
+            int start = pairList[i][0];
+            int limit = pairList[i][1];
+            if (DEBUG && pairCount > 1) System.out.println(start + ", " + limit + ", " + delta);
+            
+            // that part was easy. the hard part is figuring out where this corresponds to in the definition.
+            // For now, we use a simple mechanism.
+            
+            // The word and the definition should match to this point, so we just use the start (offset by delta)
+            // We'll check just to be sure.
+            
+            int lastLimit = i == 0 ? 0 : pairList[i-1][1];
+            
+            int defStart = start + delta;
+            
+            String defPrefix = definition.substring(0, defStart);
+            String wordInfix = word.substring(lastLimit, start);
+            
+            boolean firstGood = defPrefix.endsWith(wordInfix);
+            if (!firstGood) {
+                String wordInfix2 = katakanatoHiragana.transliterate(wordInfix);
+                firstGood = defPrefix.endsWith(wordInfix2);
+            }
+            if (!firstGood) {
+                // Houston, we have a problem.
+                Utility.fixDot();
+                System.out.println("Suspect line: " + hex.transliterate(word) + " > " + hex.transliterate(definition)
+                    + ", " + kanaToLatin.transliterate(definition));
+                System.out.println("\tNo match for " + hex.transliterate(word.substring(lastLimit, start)) 
+                    + " at end of " + hex.transliterate(definition.substring(0, defStart)));
+                break; // BAIL
+            }
+            
+            // For the limit of the definition, we get the intermediate portion of the word
+            // then search for it in the definition.
+            // We could get tripped up if the end of the transliteration of the Kanji matched the start.
+            // If so, we should find out on the next pass.
+            
+            int defLimit;
+            if (limit == word.length()) {
+                defLimit = definition.length();
+            } else {
+                String afterPart = word.substring(limit, pairList[i+1][0]);
+                defLimit = definition.indexOf(afterPart, defStart+1); // we assume the CJK is at least one!
+                if (defLimit < 0) {
+                    String afterPart2 = katakanatoHiragana.transliterate(afterPart);
+                    defLimit = definition.indexOf(afterPart2, defStart+1); // we assume the CJK is at least one!
+                }
+                
+                if (defLimit < 0) {
+                    // Houston, we have a problem.
+                    Utility.fixDot();
+                    System.out.println("Suspect line: " + hex.transliterate(word) + " > " + hex.transliterate(definition)
+                        + ", " + kanaToLatin.transliterate(definition));
+                    System.out.println("\tNo match for " + hex.transliterate(afterPart) 
+                        + " in " + hex.transliterate(definition.substring(0, defStart+1)));
+                    break; // BAIL only on failure; a successful lookup must fall through and emit the rule
+                }
+            }
+            
+            String defPart = definition.substring(defStart, defLimit);
+            defPart = kanaToLatin.transliterate(defPart);
+            
+            // FOR NOW, JUNK the context before!!
+            // String contextWord = word.substring(0, start) + "{" + word.substring(start, limit) + "}" + word.substring(limit);
+            String contextWord = word.substring(start, limit);
+            if (limit != word.length()) contextWord += "}" + word.substring(limit);
+            
+            addCheck(contextWord, defPart, line);
+            if (DEBUG && pairCount > 1) System.out.println("\t" + hex.transliterate(contextWord) + " > " + hex.transliterate(defPart));
+            
+            delta = defLimit - limit;
+        }
+        
+    }
+    
+    // Useful Utilities?
+    
+    /**
+     * Walks s from offset[0] toward limit looking for the first position where m matches (or, if !positive, fails to match).
+     * s, offset, limit and incremental are handed to UnicodeMatcher.matches as-is.
+     * @param offset On input, the starting position. On output, the start of the found position (not the end!!)
+     * @param positive true to stop at the first point that matches, false to stop at the first point that doesn't.
+     * @return U_MISMATCH if limit is reached first; else the matcher's result (positive) or U_MATCH (!positive).
+     */
+    static int find(Replaceable s, UnicodeMatcher m, int[] offset, int limit, boolean incremental, boolean positive) {
+        final int step = (offset[0] > limit) ? -1 : 1; // search backwards when the start lies beyond limit
+        while (offset[0] != limit) {
+            final int startedAt = offset[0];
+            final int result = m.matches(s, offset, limit, incremental); // if successful, changes offset.
+            final boolean matched = (result != UnicodeMatcher.U_MISMATCH);
+            if (!matched && !positive) {
+                return UnicodeMatcher.U_MATCH; // found the first non-matching position
+            }
+            if (matched && positive) {
+                offset[0] = startedAt; // report where the match begins, not where it ends
+                return result;
+            }
+            if (!matched) {
+                offset[0] += step; // skip one code unit and retry
+            }
+        }
+        return UnicodeMatcher.U_MISMATCH;
+    }
+    
+    /** 
+     * Finds the first run that matches m (or, if !positive, the first run that doesn't); arguments are as for find() above.
+     * On return offset[0] is where the run begins and offset2[0] is where it stops (limit if it never stops); offset2 is untouched on U_MISMATCH.
+     * <b>Warning:</b> in a backwards search offset2 holds the <i>start</i> of the substring and offset its <i>limit</i>.
+     */
+    static int find(Replaceable s, UnicodeMatcher m, int[] offset, int[] offset2, int limit, boolean incremental, boolean positive) {
+        int type = find(s, m, offset, limit, incremental, positive);
+        if (type == UnicodeMatcher.U_MISMATCH) return type;
+        offset2[0] = offset[0];
+        find(s, m, offset2, limit, incremental, !positive); // result unused: offset2 stops at the run's end or at limit
+        return type;
+    }
+    
+    static int find(String ss, UnicodeMatcher m, int[] offset, int limit, boolean incremental, boolean positive) {
+        Replaceable wrapped = new ReplaceableString(ss); // the Replaceable overload does the work; a String needs this adapter
+        return find(wrapped, m, offset, limit, incremental, positive);
+    }
+    
+    static int find(String ss, UnicodeMatcher m, int[] offset, int[] offset2, int limit, boolean incremental, boolean positive) {
+        Replaceable wrapped = new ReplaceableString(ss); // adapter so the start/limit search can run over a plain String
+        return find(wrapped, m, offset, offset2, limit, incremental, positive);
+    }
+    
    static UnicodeSet pua = new UnicodeSet("[:private use:]");
    static UnicodeSet numbers = new UnicodeSet("[0-9]");
    
    static void addCheck(String word, String definition, String line) {
+        final int wordEnd = word.length();
+        for (int pieceStart = 0; pieceStart < wordEnd; ) { // word may hold '/'-separated alternatives; do each part separately
+            final int slashAt = word.indexOf('/', pieceStart);
+            final int pieceEnd = (slashAt >= 0) ? slashAt : wordEnd;
+            addCheck2(word.substring(pieceStart, pieceEnd), definition, line);
+            pieceStart = pieceEnd + 1;
+        }
+    }
+    
+    static void addCheck2(String word, String definition, String line) {
+        definition = Default.nfc.normalize(definition) + " ";
+        word = Default.nfc.normalize(word);
+        
        if (pua.containsSome(word) ) {
            Utility.fixDot();
            System.out.println("PUA on: " + line);
@ -711,17 +964,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
    static Map simplifiedToTraditional = new HashMap();
    static Map traditionalToSimplified = new HashMap();
  
-    static void readUnihanData(String key, String filter) throws java.io.IOException {
+    static void readUnihanData(String key) throws java.io.IOException {

        BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true); 

        int count = 0;
-        String oldCode = "";
-        String oldLine = "";
-        int oldStart = 0;
-        boolean foundFilter = (filter == null);
-        boolean foundKey = false;
-        
        int lineCounter = 0;
        
        while (true) {
@ -734,97 +981,63 @@ public final class GenerateHanTransliterator implements UCD_Types {
            line = line.trim();
            
            int tabPos = line.indexOf('\t');
-            String code = line.substring(2, tabPos);
+            int tabPos2 = line.indexOf('\t', tabPos+1);
+            
+            String scode = line.substring(2, tabPos).trim();
+            
+            int code = Integer.parseInt(scode, 16);            
+            String property = line.substring(tabPos+1, tabPos2).trim();
+            
+            String propertyValue = line.substring(tabPos2+1).trim();
+            if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
            
            // gather traditional mapping
-            if (line.indexOf("kTraditionalVariant") >= 0) {
-                int tabPos2 = line.indexOf('\t', tabPos+1);
-                int tabPos3 = line.indexOf(' ', tabPos2+1);
-                if (tabPos3 < 0) tabPos3 = line.length();
-                
-                String code2 = line.substring(tabPos2+3, tabPos3);
-                simplifiedToTraditional.put(UTF16.valueOf(Integer.parseInt(code, 16)), 
-                    UTF16.valueOf(Integer.parseInt(code2, 16)));
+            if (property.equals("kTraditionalVariant")) {
+                simplifiedToTraditional.put(UTF16.valueOf(code), propertyValue);
            }
            
-            if (line.indexOf("kSimplifiedVariant") >= 0) {
-                int tabPos2 = line.indexOf('\t', tabPos+1);
-                int tabPos3 = line.indexOf(' ', tabPos2+1);
-                if (tabPos3 < 0) tabPos3 = line.length();
-                
-                String code2 = line.substring(tabPos2+3, tabPos3);
-                traditionalToSimplified.put(UTF16.valueOf(Integer.parseInt(code, 16)), 
-                    UTF16.valueOf(Integer.parseInt(code2, 16)));
+            if (property.equals("kSimplifiedVariant")) {
+                traditionalToSimplified.put(UTF16.valueOf(code), propertyValue);
            }
            
-            
-            
-            /* if (code.compareTo("9FA0") >= 0) {
-                System.out.println("? " + line);
-            }*/
-            if (!code.equals(oldCode)) {
-            	totalCount++;
-            	
-                if (foundKey && foundFilter) {
-                    count++;
-                    /*if (true) { //*/
-                    if (TESTING && (count == 1 || (count % 100) == 0)) {
-                        System.out.println(count + ": " + oldLine);
-                    }
-                    storeDef(out, oldCode, oldLine, oldStart);
-                }
-                if (TESTING) if (count > 1000) {
-                    System.out.println("ABORTING at 1000 for testing");
-                    break;
-                }
-                oldCode = code;
-                foundKey = false;
-                foundFilter = (filter == null);
-            }
-            
-            // detect key, filter. Must be on different lines
-            if (!foundFilter && line.indexOf(filter) >= 0) {
-                foundFilter = true;
-            } else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
-                foundKey = true;
-                oldLine = line;
-                oldStart += key.length();
-            }
+            if (property.equals(key) || key.equals("kJapaneseOn") && property.equals("kJapaneseKun")) {
+                storeDef(out, code, propertyValue, line);
+            }            
        }
-        if (foundKey && foundFilter) storeDef(out, oldCode, oldLine, oldStart);
        
        in.close();
    }
    
-    static void storeDef(PrintWriter out, String code, String line, int start) {
-        if (code.length() == 0) return;
-        
+    static void storeDef(PrintWriter out, int cp, String rawDefinition, String line) {
        // skip spaces & numbers at start
-        for (;start < line.length(); ++start) {
-            char ch = line.charAt(start);
+        int start;
+        for (start = 0;start < rawDefinition.length(); ++start) {
+            char ch = rawDefinition.charAt(start);
            if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
        }

        // go up to comma or semicolon, whichever is earlier
-        int end = line.indexOf(";", start);
-        if (end < 0) end = line.length();
+        int end = rawDefinition.indexOf(";", start);
+        if (end < 0) end = rawDefinition.length();
        
-        int end2 = line.indexOf(",", start);
-        if (end2 < 0) end2 = line.length();
+        int end2 = rawDefinition.indexOf(",", start);
+        if (end2 < 0) end2 = rawDefinition.length();
        if (end > end2) end = end2;
  
+        // IF CHINESE or JAPANESE, stop at first space!!!
+        
        if (type != DEFINITION) {
-            end2 = line.indexOf(" ", start);
-            if (end2 < 0) end2 = line.length();
+            end2 = rawDefinition.indexOf(" ", start);
+            if (end2 < 0) end2 = rawDefinition.length();
            if (end > end2) end = end2;
        }
        
-        String definition = line.substring(start,end);
+        String definition = rawDefinition.substring(start,end);
        if (type == CHINESE) {
            // since data are messed up, terminate after first digit
            int end3 = findInString(definition, "12345")+1;
            if (end3 == 0) {
-                log.println("Bad pinyin data: " + line);
+                log.println("Bad pinyin data: " + rawDefinition);
                end3 = definition.length();
            }
            definition = definition.substring(0, end3);
@ -832,18 +1045,18 @@ public final class GenerateHanTransliterator implements UCD_Types {
            definition = convertPinyin.transliterate(definition);
        }
        if (type == DEFINITION) {
-            definition = removeMatched(definition,'(', ')', line);
-            definition = removeMatched(definition,'[', ']', line);
-            definition = fixDefinition(definition, line);
+            definition = removeMatched(definition,'(', ')', rawDefinition);
+            definition = removeMatched(definition,'[', ']', rawDefinition);
+            definition = fixDefinition(definition, rawDefinition);
        }
        definition = definition.trim();
        definition = Default.ucd.getCase(definition, FULL, LOWER);
-        String cp = UTF16.valueOf(Integer.parseInt(code, 16));
+
        if (definition.length() == 0) {
            Utility.fixDot();
            System.out.println("Zero value for " + Default.ucd.getCode(cp) + " on: " + hex.transliterate(line));
        } else {
-            addCheck(cp, definition, line);
+            addCheck(UTF16.valueOf(cp), definition, rawDefinition);
        }
        /*
        String key = (String) unihanMap.get(definition);
@ -855,7 +1068,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
        */
    }
    
-    static String fixDefinition(String definition, String line) {
+    static String fixDefinition(String definition, String rawDefinition) {
        definition = definition.trim();
        definition = Utility.replace(definition, "  ", " ");
        definition = Utility.replace(definition, " ", "-");
@ -894,12 +1107,37 @@ public final class GenerateHanTransliterator implements UCD_Types {
    
    static StringBuffer handlePinyinTemp = new StringBuffer();
    
-    static Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex");
-    static Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters", 
-        "([[\\u0021-\\u007E]-[:L:]-[\\']-[0-9]]) > \\u005C $1; \\' > \\'\\';", Transliterator.FORWARD);
-    
+    static final Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex"); // "hex" transform filtered to characters outside U+0020..U+007F; applied to lines before they are printed to the console
+    static final Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters", // quotes ASCII characters that are neither letters nor digits
+          "([[\\u0020-\\u007E]-[:L:]-[\\'\\{\\}]-[0-9]]) > \\u005C $1; " // U+0020..U+007E minus letters, digits, apostrophe and braces: emit a backslash (U+005C) followed by the character
+        + "\\' > \\'\\';", // an apostrophe is doubled rather than backslash-prefixed
+        Transliterator.FORWARD);
+    static final Transliterator toSub = Transliterator.createFromRules("any-subscript", // maps each ASCII digit to the matching subscript digit U+2080..U+2089
+            " 0 > \u2080; "
+          + " 1 > \u2081; "
+          + " 2 > \u2082; "
+          + " 3 > \u2083; " // fixed: previously U+2084, which made 3 and 4 indistinguishable after conversion
+          + " 4 > \u2084; "
+          + " 5 > \u2085; "
+          + " 6 > \u2086; "
+          + " 7 > \u2087; "
+          + " 8 > \u2088; "
+          + " 9 > \u2089; ",
+        Transliterator.FORWARD);
    
+    static final Transliterator kanaToLatin = Transliterator.createFromRules("kana-latin", // ID fixed: was "any-subscript", copy-pasted from toSub
+            " $kata = [[:katakana:]\u30FC]; " // katakana plus U+30FC
+          + "[:hiragana:] {} [:^hiragana:] > ' '; " // insert a space where a hiragana character is followed by a non-hiragana character
+          + "$kata {} [^[:hiragana:]$kata] > ' '; " // insert a space where a $kata character is followed by a character that is neither hiragana nor $kata
+          + "::Katakana-Latin; " // then romanize via the Katakana-Latin and Hiragana-Latin transforms
+          + "::Hiragana-Latin;",
+        Transliterator.FORWARD);
+        
+    static final Transliterator katakanatoHiragana = Transliterator.getInstance("katakana-hiragana"); // instance looked up by the ID "katakana-hiragana"; presumably folds katakana to hiragana
    
+    static final UnicodeSet kana = new UnicodeSet("[[:hiragana:][:katakana:]\u30FC]"); // all hiragana and katakana characters plus U+30FC
+    // since we are working in NFC, we don't worry about the combining marks.
+            
    // ADD Factory since otherwise getInverse blows out
    static class DummyFactory implements Transliterator.Factory {
        static DummyFactory singleton = new DummyFactory();
@ -936,6 +1174,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
                    + "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
                    + "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
                    + "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
+                    + "($digit) > &digit-tone($1);\n"
                    + "::NFC;\n";
 
    	Transliterator at = Transliterator.createFromRules("digit-tone", dt, Transliterator.FORWARD);
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2002/07/03 02:15:47 $
-* $Revision: 1.21 $
+* $Date: 2002/07/21 08:43:39 $
+* $Revision: 1.22 $
 *
 *******************************************************************************
 */
@ -806,7 +806,7 @@ public final class Utility {    // COMMON UTILITIES
            pos = source.indexOf(piece, pos);
            if (pos < 0) return source;
            source = source.substring(0,pos) + replacement + source.substring(pos + piece.length());
-            if (replacement.length() > 0) ++pos;
+            pos += replacement.length();
        }
    }