2001-10-25 20:37:09 +00:00
|
|
|
|
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
|
|
|
|
|
|
|
|
|
|
package com.ibm.text.UCD;
|
|
|
|
|
import java.io.*;
|
|
|
|
|
import com.ibm.text.utility.*;
|
|
|
|
|
import com.ibm.text.UTF16;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public final class GenerateHanTransliterator {
|
|
|
|
|
|
|
|
|
|
static final boolean TESTING = false;
|
|
|
|
|
static int type;
|
|
|
|
|
|
|
|
|
|
public static void main() {
|
|
|
|
|
try {
|
|
|
|
|
type = 0;
|
|
|
|
|
System.out.println("Starting");
|
|
|
|
|
generate();
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
System.out.println("Exception: " + e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static PrintWriter out;
|
|
|
|
|
static PrintWriter err;
|
|
|
|
|
|
|
|
|
|
static int count;
|
|
|
|
|
static int oldLine;
|
|
|
|
|
|
|
|
|
|
static void generate() throws java.io.IOException {
|
|
|
|
|
String name = "$Han$English";
|
|
|
|
|
String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
|
|
|
|
String filter = "kJis0";
|
|
|
|
|
switch (type) {
|
|
|
|
|
default: break;
|
|
|
|
|
case 1: name = "$Han$OnRomaji";
|
|
|
|
|
key = "kJapaneseOn";
|
|
|
|
|
filter = "kJis0";
|
|
|
|
|
break;
|
|
|
|
|
case 2: name = "$Han$Pinyin";
|
|
|
|
|
key = "kMandarin";
|
|
|
|
|
filter = null;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out = Utility.openPrintWriter("Transliterate_Han_English.txt");
|
|
|
|
|
err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");
|
|
|
|
|
|
2001-12-06 00:05:53 +00:00
|
|
|
|
BufferedReader in = Utility.openUnicodeFile("Unihan", "3.2.0", true);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
|
|
|
|
|
int count = 0;
|
|
|
|
|
String oldCode = "";
|
|
|
|
|
String oldLine = "";
|
|
|
|
|
int oldStart = 0;
|
|
|
|
|
boolean foundFilter = (filter == null);
|
|
|
|
|
boolean foundKey = false;
|
|
|
|
|
|
|
|
|
|
int lineCounter = 0;
|
|
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
|
Utility.dot(++lineCounter);
|
|
|
|
|
|
|
|
|
|
String line = in.readLine();
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() < 6) continue;
|
|
|
|
|
if (line.charAt(0) == '#') continue;
|
|
|
|
|
String code = line.substring(2,6);
|
|
|
|
|
/* if (code.compareTo("9FA0") >= 0) {
|
|
|
|
|
System.out.println("? " + line);
|
|
|
|
|
}*/
|
|
|
|
|
if (!code.equals(oldCode)) {
|
|
|
|
|
if (foundKey && foundFilter) {
|
|
|
|
|
count++;
|
|
|
|
|
/*if (true) { //*/
|
|
|
|
|
if (count == 1 || (count % 100) == 0) {
|
|
|
|
|
System.out.println(count + ": " + oldLine);
|
|
|
|
|
}
|
|
|
|
|
printDef(out, oldCode, oldLine, oldStart);
|
|
|
|
|
}
|
|
|
|
|
if (TESTING) if (count > 1000) break;
|
|
|
|
|
oldCode = code;
|
|
|
|
|
foundKey = false;
|
|
|
|
|
foundFilter = (filter == null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// detect key, filter. Must be on different lines
|
|
|
|
|
if (!foundFilter && line.indexOf(filter) >= 0) {
|
|
|
|
|
foundFilter = true;
|
|
|
|
|
} else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
|
|
|
|
|
foundKey = true;
|
|
|
|
|
oldLine = line;
|
|
|
|
|
oldStart += key.length();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (foundKey && foundFilter) printDef(out, oldCode, oldLine, oldStart);
|
|
|
|
|
|
|
|
|
|
in.close();
|
|
|
|
|
out.close();
|
|
|
|
|
err.close();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void printDef(PrintWriter out, String code, String line, int start) {
|
|
|
|
|
if (code.length() == 0) return;
|
|
|
|
|
|
|
|
|
|
// skip spaces & numbers at start
|
|
|
|
|
for (;start < line.length(); ++start) {
|
|
|
|
|
char ch = line.charAt(start);
|
|
|
|
|
if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// go up to comma or semicolon, whichever is earlier
|
|
|
|
|
int end = line.indexOf(";", start);
|
|
|
|
|
if (end < 0) end = line.length();
|
|
|
|
|
|
|
|
|
|
int end2 = line.indexOf(",", start);
|
|
|
|
|
if (end2 < 0) end2 = line.length();
|
|
|
|
|
if (end > end2) end = end2;
|
|
|
|
|
|
|
|
|
|
if (type != 0) {
|
|
|
|
|
end2 = line.indexOf(" ", start);
|
|
|
|
|
if (end2 < 0) end2 = line.length();
|
|
|
|
|
if (end > end2) end = end2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
String definition = line.substring(start,end);
|
|
|
|
|
if (type == 2) definition = handlePinyin(definition, line);
|
|
|
|
|
definition.trim();
|
|
|
|
|
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
|
|
|
|
|
String key = (String) definitionMap.get(definition);
|
|
|
|
|
if (key == null) {
|
|
|
|
|
definitionMap.put(definition, cp);
|
|
|
|
|
}
|
|
|
|
|
out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
|
|
|
|
|
if (TESTING) System.out.println("# " + code + " > " + definition);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static Map definitionMap = new HashMap();
|
|
|
|
|
|
|
|
|
|
static StringBuffer handlePinyinTemp = new StringBuffer();
|
|
|
|
|
|
|
|
|
|
static String handlePinyin(String source, String debugLine) {
|
|
|
|
|
try {
|
|
|
|
|
char ch = source.charAt(source.length()-1);
|
|
|
|
|
int num = (int)(ch-'1');
|
|
|
|
|
if (num < 0 || num > 5) throw new Exception("none");
|
|
|
|
|
handlePinyinTemp.setLength(0);
|
|
|
|
|
boolean gotIt = false;
|
|
|
|
|
boolean messageIfNoGotIt = true;
|
|
|
|
|
for (int i = source.length()-2; i >= 0; --i) {
|
|
|
|
|
ch = source.charAt(i);
|
|
|
|
|
if (!gotIt) switch (ch) {
|
|
|
|
|
case 'A': ch = "A<EFBFBD>\u0102<EFBFBD>\u0100".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'E': ch = "E<EFBFBD>\u0114<EFBFBD>\u0112".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'I': ch = "I<EFBFBD>\u012C<EFBFBD>\u012A".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'O': ch = "O<EFBFBD>\u014E<EFBFBD>\u014C".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'U': ch = "U<EFBFBD>\u016C<EFBFBD>\u016A".charAt(num); gotIt = true; break;
|
|
|
|
|
case '<27>': ch = "<EFBFBD>\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
|
|
|
|
|
}
|
|
|
|
|
handlePinyinTemp.insert(0,ch);
|
|
|
|
|
}
|
|
|
|
|
if (!gotIt && num > 0) {
|
|
|
|
|
handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num));
|
|
|
|
|
if (messageIfNoGotIt) {
|
|
|
|
|
err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp
|
|
|
|
|
.toString());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
source = handlePinyinTemp.toString().toLowerCase();
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
err.println("Bad line: " + debugLine);
|
|
|
|
|
}
|
|
|
|
|
return source;
|
|
|
|
|
}
|
|
|
|
|
}
|