2001-10-25 20:37:09 +00:00
|
|
|
|
/**
|
|
|
|
|
*******************************************************************************
|
|
|
|
|
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
|
|
|
|
* others. All Rights Reserved. *
|
|
|
|
|
*******************************************************************************
|
|
|
|
|
*
|
|
|
|
|
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
2003-02-25 23:38:23 +00:00
|
|
|
|
* $Date: 2003/02/25 23:38:22 $
|
|
|
|
|
* $Revision: 1.11 $
|
2001-10-25 20:37:09 +00:00
|
|
|
|
*
|
|
|
|
|
*******************************************************************************
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
package com.ibm.text.UCD;
|
|
|
|
|
import java.io.*;
|
|
|
|
|
import com.ibm.text.utility.*;
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
import com.ibm.icu.text.Transliterator;
|
|
|
|
|
import com.ibm.icu.text.UnicodeSet;
|
2002-03-15 01:57:01 +00:00
|
|
|
|
import com.ibm.icu.text.UTF16;
|
2002-07-21 08:43:39 +00:00
|
|
|
|
import com.ibm.icu.text.Replaceable;
|
|
|
|
|
import com.ibm.icu.text.ReplaceableString;
|
|
|
|
|
import com.ibm.icu.text.UnicodeMatcher;
|
|
|
|
|
|
|
|
|
|
|
2001-10-25 20:37:09 +00:00
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
public final class GenerateHanTransliterator implements UCD_Types {
|
2001-10-25 20:37:09 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
// When true, main() appends a counter (passed through toSub) to any definition
// that has already been used or that contains a space, to disambiguate it.
static final boolean DISAMBIG = false;
|
|
|
|
|
// Debug switch. NOTE(review): no use is visible near this declaration -- confirm it is still needed.
static final boolean DEBUG = false;
|
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
static class HanInfo {
|
|
|
|
|
int count = 0;
|
|
|
|
|
int minLen = Integer.MAX_VALUE;
|
|
|
|
|
int maxLen = Integer.MIN_VALUE;
|
|
|
|
|
int sampleLen = 0;
|
|
|
|
|
Set samples = new TreeSet();
|
|
|
|
|
Map map = new TreeMap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void readUnihan() throws java.io.IOException {
|
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log = Utility.openPrintWriter("Unihan_log.html", Utility.UTF8_WINDOWS);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
log.println("<body>");
|
|
|
|
|
|
2002-10-05 01:28:58 +00:00
|
|
|
|
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, Utility.UTF8);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
|
|
|
|
|
Map properties = new TreeMap();
|
|
|
|
|
|
|
|
|
|
Integer integerCode = new Integer(0);
|
|
|
|
|
int lineCounter = 0;
|
|
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
|
Utility.dot(++lineCounter);
|
|
|
|
|
|
|
|
|
|
String line = in.readLine();
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() < 6) continue;
|
|
|
|
|
if (line.charAt(0) == '#') continue;
|
|
|
|
|
line = line.trim();
|
|
|
|
|
|
|
|
|
|
int tabPos = line.indexOf('\t');
|
|
|
|
|
String scode = line.substring(2, tabPos).trim();
|
|
|
|
|
|
|
|
|
|
int code = Integer.parseInt(scode, 16);
|
|
|
|
|
if (code != integerCode.intValue()) {
|
|
|
|
|
integerCode = new Integer(code);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int tabPos2 = line.indexOf('\t', tabPos+1);
|
|
|
|
|
String property = line.substring(tabPos+1, tabPos2).trim();
|
|
|
|
|
|
|
|
|
|
String propertyValue = line.substring(tabPos2+1).trim();
|
2003-02-25 23:38:23 +00:00
|
|
|
|
if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
|
|
|
|
|
HanInfo values = (HanInfo) properties.get(property);
|
|
|
|
|
if (values == null) {
|
|
|
|
|
values = new HanInfo();
|
|
|
|
|
properties.put(property, values);
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("Property: " + property);
|
|
|
|
|
}
|
|
|
|
|
++values.count;
|
|
|
|
|
if (values.minLen > propertyValue.length()) values.minLen = propertyValue.length();
|
|
|
|
|
if (values.maxLen < propertyValue.length()) values.maxLen = propertyValue.length();
|
|
|
|
|
if (values.sampleLen < 150) {
|
|
|
|
|
String temp = scode + ":" + propertyValue;
|
|
|
|
|
values.sampleLen += temp.length() + 2;
|
|
|
|
|
values.samples.add(temp);
|
|
|
|
|
}
|
|
|
|
|
if (property.endsWith("Variant")
|
|
|
|
|
|| property.endsWith("Numeric")
|
|
|
|
|
|| property.startsWith("kRS")
|
|
|
|
|
|| property.equals("kTotalStrokes")) {
|
|
|
|
|
values.map.put(integerCode, propertyValue);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Set props = properties.keySet();
|
|
|
|
|
/*
|
|
|
|
|
log.println("Properties");
|
|
|
|
|
log.print(" ");
|
|
|
|
|
Utility.print(log, props, "\r\n ");
|
|
|
|
|
log.println();
|
|
|
|
|
log.println();
|
|
|
|
|
|
|
|
|
|
log.println("Sample Values");
|
|
|
|
|
*/
|
|
|
|
|
Iterator it = props.iterator();
|
|
|
|
|
log.println("<ol>");
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
String property = (String)it.next();
|
|
|
|
|
HanInfo values = (HanInfo) properties.get(property);
|
|
|
|
|
log.println("<li><b>" + property + "</b><ul><li>");
|
|
|
|
|
log.println("count: " + values.count
|
|
|
|
|
+ ", min length: " + values.minLen
|
|
|
|
|
+ ", max length: " + values.maxLen);
|
|
|
|
|
log.println("</li><li>samples:");
|
|
|
|
|
Utility.print(log, values.samples, "; ");
|
|
|
|
|
log.println("</li></ul></li>");
|
|
|
|
|
}
|
|
|
|
|
log.println("</ol>");
|
|
|
|
|
|
|
|
|
|
String[] list = {"kRSJapanese", "kRSKanWa", "kRSKangXi", "kRSKorean"};
|
|
|
|
|
Map kRSUnicodeMap = ((HanInfo) properties.get("kRSUnicode")).map;
|
|
|
|
|
Set redundants = new HashSet();
|
|
|
|
|
int unequalCount = 0;
|
|
|
|
|
for (int j = 0; j < list.length; ++j) {
|
|
|
|
|
unequalCount = 0;
|
|
|
|
|
log.println("<p><b>Checking Redundants for " + list[j] + "</b></p><blockquote>");
|
|
|
|
|
redundants.clear();
|
|
|
|
|
Map otherInfo = ((HanInfo) properties.get(list[j])).map;
|
|
|
|
|
it = otherInfo.keySet().iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
Integer key = (Integer) it.next();
|
|
|
|
|
Object ovalue = otherInfo.get(key);
|
|
|
|
|
Object uvalue = kRSUnicodeMap.get(key);
|
|
|
|
|
if (ovalue.equals(uvalue)) {
|
|
|
|
|
redundants.add(key);
|
|
|
|
|
} else if (++unequalCount < 5) {
|
|
|
|
|
log.println("<p>" + Integer.toString(key.intValue(),16)
|
|
|
|
|
+ ": <b>" + ovalue + "</b>, " + uvalue + "</p>");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
log.println("</p>Total Unique: " + (otherInfo.size() - redundants.size())
|
|
|
|
|
+ "(out of" + otherInfo.size() + ")</p></blockquote>");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
log.println("<p><b>Checking Redundants for kTotalStrokes</b></p><blockquote>");
|
|
|
|
|
|
|
|
|
|
// pass through first to get a count for the radicals
|
|
|
|
|
Map kTotalStrokesMap = ((HanInfo) properties.get("kTotalStrokes")).map;
|
|
|
|
|
int[] radCount = new int[512];
|
|
|
|
|
it = kRSUnicodeMap.keySet().iterator();
|
|
|
|
|
while(it.hasNext()) {
|
|
|
|
|
Integer key = (Integer) it.next();
|
|
|
|
|
String uvalue = (String) kRSUnicodeMap.get(key);
|
|
|
|
|
if (uvalue.endsWith(".0")) {
|
|
|
|
|
String tvalue = (String) kTotalStrokesMap.get(key);
|
|
|
|
|
if (tvalue == null) continue;
|
|
|
|
|
int rs = getRadicalStroke(uvalue);
|
|
|
|
|
radCount[rs>>8] = Integer.parseInt(tvalue);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// now compare the computed value against the real value
|
|
|
|
|
it = kTotalStrokesMap.keySet().iterator();
|
|
|
|
|
unequalCount = 0;
|
|
|
|
|
redundants.clear();
|
|
|
|
|
while(it.hasNext()) {
|
|
|
|
|
Integer key = (Integer) it.next();
|
|
|
|
|
String uvalue = (String) kRSUnicodeMap.get(key);
|
|
|
|
|
int rs = getRadicalStroke(uvalue);
|
|
|
|
|
String tvalue = (String) kTotalStrokesMap.get(key);
|
|
|
|
|
int t = Integer.parseInt(tvalue);
|
|
|
|
|
int projected = radCount[rs>>8] + (rs & 0xFF);
|
|
|
|
|
if (t == projected) {
|
|
|
|
|
redundants.add(key);
|
|
|
|
|
} else if (++unequalCount < 5) {
|
|
|
|
|
log.println("<p>" + Integer.toString(key.intValue(),16)
|
|
|
|
|
+ ": <b>" + t + "</b>, " + projected + "</p>");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
log.println("</p>Total Unique: " + (kTotalStrokesMap.size() - redundants.size())
|
|
|
|
|
+ "(out of" + kTotalStrokesMap.size() + ")</p></blockquote>");
|
|
|
|
|
|
|
|
|
|
log.println("</body>");
|
|
|
|
|
in.close();
|
|
|
|
|
log.close();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int getRadicalStroke(String s) {
|
|
|
|
|
int dotPos = s.indexOf('.');
|
|
|
|
|
int strokes = Integer.parseInt(s.substring(dotPos+1));
|
|
|
|
|
int radical = 0;
|
|
|
|
|
if (s.charAt(dotPos - 1) == '\'') {
|
|
|
|
|
radical = 256;
|
|
|
|
|
--dotPos;
|
|
|
|
|
}
|
|
|
|
|
radical += Integer.parseInt(s.substring(0,dotPos));
|
|
|
|
|
return (radical << 8) + strokes;
|
|
|
|
|
}
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
// Transliterator obtained for the ID "hex-any/unicode"; readUnihan() applies it
// to property values that contain "U+".
static Transliterator fromHexUnicode = Transliterator.getInstance("hex-any/unicode");
|
|
|
|
|
|
|
|
|
|
// Transliterator obtained for the ID "any-hex/unicode"; main() uses it to write
// the first column of Raw_mapping.txt.
static Transliterator toHexUnicode = Transliterator.getInstance("any-hex/unicode");
|
2002-07-14 22:04:49 +00:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
static String convertUPlus(String other) {
|
|
|
|
|
int pos1 = other.indexOf("U+");
|
|
|
|
|
if (pos1 < 0) return other;
|
2003-02-25 23:38:23 +00:00
|
|
|
|
return fromHexUnicode(
|
2002-07-14 22:04:49 +00:00
|
|
|
|
pos1 += 2;
|
|
|
|
|
|
|
|
|
|
StringBuffer result = new StringBuffer();
|
|
|
|
|
while (pos1 < other.length()) {
|
|
|
|
|
int end = getHexEnd(s, pos1);
|
|
|
|
|
result.append(UTF16.valueOf(Integer.parseInt(other.substring(pos1, end), 16)));
|
|
|
|
|
pos1 = other.indexOf("U+", pos1);
|
|
|
|
|
if (pos2 < 0) pos2 = other.length();
|
|
|
|
|
pos1 = pos2;
|
|
|
|
|
}
|
|
|
|
|
return result.toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int getHexEnd(String s, int start) {
|
|
|
|
|
int i= start;
|
|
|
|
|
for (; i < s.length; ++i) {
|
|
|
|
|
char c = s.charAt(i);
|
|
|
|
|
if ('0' <= c && c <= '9') continue;
|
|
|
|
|
if ('A' <= c && c <= 'F') continue;
|
|
|
|
|
if ('a' <= c && c <= 'f') continue;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
*/
|
|
|
|
|
|
2001-10-25 20:37:09 +00:00
|
|
|
|
// Trace switch; within main() it is only mentioned in a commented-out println.
static final boolean TESTING = false;
|
|
|
|
|
// Mode of the current run; assigned from the argument of main(int).
static int type;
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
// Modes accepted by main(int); each selects a Unihan key and an output file name there.
static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
|
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
// When true, main() skips the reverse-mapping pass over backSet and writes every
// rule with the one-way ">" operator.
static final boolean DO_SIMPLE = true;
|
|
|
|
|
|
2002-05-29 02:01:00 +00:00
|
|
|
|
public static void main(int typeIn) {
|
|
|
|
|
type = typeIn;
|
|
|
|
|
Default.setUCD();
|
2001-10-25 20:37:09 +00:00
|
|
|
|
try {
|
|
|
|
|
System.out.println("Starting");
|
2002-07-21 08:43:39 +00:00
|
|
|
|
System.out.println("Quoting: " + quoteNonLetters.toRules(true));
|
|
|
|
|
System.out.println("Quoting: " + quoteNonLetters.toRules(true));
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
|
|
|
|
String key; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
|
|
|
|
String filename;
|
|
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
|
case DEFINITION:
|
|
|
|
|
key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
2002-07-21 08:43:39 +00:00
|
|
|
|
filename = "Raw_Transliterator_Han_Latin_Definition.txt";
|
2002-06-13 21:14:05 +00:00
|
|
|
|
break;
|
|
|
|
|
case JAPANESE:
|
|
|
|
|
key = "kJapaneseOn";
|
2002-07-21 08:43:39 +00:00
|
|
|
|
filename = "Raw_Transliterator_ja_Latin.txt";
|
2002-06-13 21:14:05 +00:00
|
|
|
|
break;
|
|
|
|
|
case CHINESE:
|
|
|
|
|
key = "kMandarin";
|
2002-07-21 08:43:39 +00:00
|
|
|
|
filename = "Raw_Transliterator_Han_Latin.txt";
|
2002-06-13 21:14:05 +00:00
|
|
|
|
break;
|
|
|
|
|
default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
|
|
|
|
|
}
|
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
err = Utility.openPrintWriter("Transliterate_err.txt", Utility.UTF8_WINDOWS);
|
|
|
|
|
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
log.print('\uFEFF');
|
|
|
|
|
|
2002-08-04 21:38:45 +00:00
|
|
|
|
log.println();
|
|
|
|
|
log.println("@*Override Data");
|
|
|
|
|
log.println();
|
|
|
|
|
readOverrides(type);
|
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log.println();
|
|
|
|
|
log.println("@*DICT Data");
|
|
|
|
|
log.println();
|
2002-07-21 08:43:39 +00:00
|
|
|
|
readCDICTDefinitions(type);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log.println();
|
|
|
|
|
log.println("@Unihan Data");
|
|
|
|
|
log.println();
|
|
|
|
|
readUnihanData(key);
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
if (false) {
|
|
|
|
|
readCDICT();
|
|
|
|
|
compareUnihanWithCEDICT();
|
|
|
|
|
}
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
readFrequencyData(type);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
Iterator it = fullPinyin.iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
String s = (String) it.next();
|
|
|
|
|
if (!isValidPinyin2(s)) {
|
|
|
|
|
err.println("?Valid Pinyin: " + s);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
it = unihanMap.keySet().iterator();
|
|
|
|
|
Map badPinyin = new TreeMap();
|
|
|
|
|
PrintWriter out2 = Utility.openPrintWriter("Raw_mapping.txt", Utility.UTF8_WINDOWS);
|
|
|
|
|
try {
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
String keyChar = (String) it.next();
|
|
|
|
|
String def = (String) unihanMap.get(keyChar);
|
|
|
|
|
if (!isValidPinyin(def)) {
|
|
|
|
|
String fixedDef = fixPinyin(def);
|
|
|
|
|
err.println(Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t" + fixedDef + "\t#" + def
|
|
|
|
|
+ (fixedDef.equals(def) ? " FAIL" : ""));
|
|
|
|
|
Utility.addToSet(badPinyin, def, keyChar);
|
|
|
|
|
}
|
|
|
|
|
// check both ways
|
|
|
|
|
String digitDef = accentPinyin_digitPinyin.transliterate(def);
|
|
|
|
|
String accentDef = digitPinyin_accentPinyin.transliterate(digitDef);
|
|
|
|
|
if (!accentDef.equals(def)) {
|
|
|
|
|
err.println("Failed Digit Pinyin: "
|
|
|
|
|
+ Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t"
|
|
|
|
|
+ def + " => " + digitDef + " => " + accentDef);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out2.println(toHexUnicode.transliterate(keyChar)
|
|
|
|
|
+ "\tkMandarin\t" + digitDef.toUpperCase() + "\t# " + keyChar + ";\t" + def);
|
|
|
|
|
}
|
|
|
|
|
err.println();
|
|
|
|
|
err.println("Summary of Bad syllables");
|
|
|
|
|
Utility.printMapOfCollection(err, badPinyin, "\r\n", ":\t", ", ");
|
|
|
|
|
} finally {
|
|
|
|
|
out2.close();
|
|
|
|
|
}
|
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
out.println("# Start RAW data for converting CJK characters");
|
|
|
|
|
/*
|
2002-06-13 21:14:05 +00:00
|
|
|
|
out.println("# Note: adds space between them and letters.");
|
|
|
|
|
out.println("{ ([:Han:]) } [:L:] > | $1 ' ';");
|
|
|
|
|
out.println("[\\.\\,\\?\\!\uFF0E\uFF0C\uFF1F\uFF01\u3001\u3002[:Pe:][:Pf:]] { } [:L:] > ' ';");
|
|
|
|
|
out.println("[:L:] { } [[:Han:][:Ps:][:Pi:]]> ' ';");
|
|
|
|
|
|
|
|
|
|
if (type == JAPANESE) {
|
|
|
|
|
out.println("$kata = [[\uFF9E\uFF9F\uFF70\u30FC][:katakana:]];");
|
|
|
|
|
out.println("$kata { } [[:L:]-$kata]> ' ';");
|
|
|
|
|
out.println("[[:L:]-$kata] { } $kata > ' ';");
|
|
|
|
|
out.println("[:hiragana:] { } [[:L:]-[:hiragana:]] > ' ';");
|
|
|
|
|
out.println("[[:L:]-[:hiragana:]] { } [:hiragana:]> ' ';");
|
|
|
|
|
}
|
2002-07-21 08:43:39 +00:00
|
|
|
|
*/
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
Set gotAlready = new HashSet();
|
|
|
|
|
Set lenSet = new TreeSet();
|
2002-07-14 22:04:49 +00:00
|
|
|
|
Set backSet = new TreeSet();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
int rank = 0;
|
2002-07-21 08:43:39 +00:00
|
|
|
|
Map definitionCount = new HashMap();
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
it = rankList.iterator();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
while (it.hasNext()) {
|
2002-07-21 08:43:39 +00:00
|
|
|
|
String keyChar = (String) it.next();
|
|
|
|
|
String def = (String) unihanMap.get(keyChar);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
if (def == null) continue; // skipping
|
|
|
|
|
// sort longer definitions first!
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
|
|
|
|
Integer countInteger = (Integer) definitionCount.get(def);
|
|
|
|
|
int defCount = (countInteger == null) ? 0 : countInteger.intValue();
|
|
|
|
|
String oldDef = def;
|
|
|
|
|
if (DISAMBIG && (defCount != 0 || def.indexOf(' ') >= 0)) {
|
|
|
|
|
def += " " + toSub.transliterate(String.valueOf(defCount));
|
|
|
|
|
}
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
lenSet.add(new Pair(
|
2002-07-21 08:43:39 +00:00
|
|
|
|
new Pair(new Integer(-UTF16.countCodePoint(keyChar)),
|
|
|
|
|
new Pair(new Integer(-def.length()), new Integer(rank++))),
|
2002-07-14 22:04:49 +00:00
|
|
|
|
new Pair(keyChar, def)));
|
|
|
|
|
backSet.add(new Pair(
|
2002-06-13 21:14:05 +00:00
|
|
|
|
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
|
|
|
|
|
new Pair(keyChar, def)));
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
|
|
|
|
definitionCount.put(oldDef, new Integer(defCount+1));
|
2002-06-13 21:14:05 +00:00
|
|
|
|
gotAlready.add(keyChar);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// add the ones that are not ranked!
|
|
|
|
|
it = unihanMap.keySet().iterator();
|
|
|
|
|
while (it.hasNext()) {
|
2002-07-21 08:43:39 +00:00
|
|
|
|
String keyChar = (String) it.next();
|
2002-07-14 22:04:49 +00:00
|
|
|
|
if (gotAlready.contains(keyChar)) continue;
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
String def = (String) unihanMap.get(keyChar);
|
|
|
|
|
|
|
|
|
|
Integer countInteger = (Integer) definitionCount.get(def);
|
|
|
|
|
int defCount = (countInteger == null) ? 0 : countInteger.intValue();
|
|
|
|
|
String oldDef = def;
|
|
|
|
|
if (DISAMBIG && (defCount != 0 || def.indexOf(' ') >= 0)) {
|
|
|
|
|
def += " " + toSub.transliterate(String.valueOf(defCount));
|
|
|
|
|
}
|
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
lenSet.add(new Pair(
|
2002-07-21 08:43:39 +00:00
|
|
|
|
new Pair(new Integer(-UTF16.countCodePoint(keyChar)),
|
2002-07-14 22:04:49 +00:00
|
|
|
|
new Pair(new Integer(-def.toString().length()), new Integer(rank++))),
|
|
|
|
|
new Pair(keyChar, def)));
|
|
|
|
|
backSet.add(new Pair(
|
|
|
|
|
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
|
|
|
|
|
new Pair(keyChar, def)));
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
|
|
|
|
definitionCount.put(oldDef, new Integer(defCount+1));
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
2002-07-14 22:04:49 +00:00
|
|
|
|
|
|
|
|
|
// First, find the ones that we want a definition for, based on the ranking
|
|
|
|
|
// We might have a situation where the definitions are masked.
|
|
|
|
|
// In that case, write forwards and backwards separately
|
|
|
|
|
|
|
|
|
|
Set doReverse = new HashSet();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
Set gotIt = new HashSet();
|
2002-07-14 22:04:49 +00:00
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
if (!DO_SIMPLE) {
|
|
|
|
|
it = backSet.iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
Pair p = (Pair) it.next();
|
|
|
|
|
p = (Pair) p.second;
|
|
|
|
|
|
|
|
|
|
String keyChar = (String) p.first;
|
|
|
|
|
String def = (String) p.second;
|
|
|
|
|
if (!gotIt.contains(def)) {
|
|
|
|
|
if (unihanNonSingular) {
|
|
|
|
|
out.println(quoteNonLetters.transliterate(keyChar)
|
|
|
|
|
+ " < " + quoteNonLetters.transliterate(def) + ";");
|
|
|
|
|
} else {
|
|
|
|
|
doReverse.add(keyChar);
|
|
|
|
|
}
|
2002-07-14 22:04:49 +00:00
|
|
|
|
}
|
2002-07-30 09:57:18 +00:00
|
|
|
|
gotIt.add(def);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
it = lenSet.iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
Pair p = (Pair) it.next();
|
|
|
|
|
p = (Pair) p.second;
|
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
String keyChar = (String) p.first;
|
|
|
|
|
String def = (String) p.second;
|
2002-07-30 09:57:18 +00:00
|
|
|
|
String rel = !DO_SIMPLE && doReverse.contains(keyChar) ? "<>" : ">";
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
|
|
|
|
out.println(quoteNonLetters.transliterate(keyChar) + rel
|
2002-07-30 09:57:18 +00:00
|
|
|
|
+ quoteNonLetters.transliterate(def) + "|\\ ;");
|
2002-07-21 08:43:39 +00:00
|
|
|
|
//if (TESTING) System.out.println("# " + code + " > " + definition);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out.println("\u3002 <> '.';");
|
2002-07-21 08:43:39 +00:00
|
|
|
|
out.println("# End RAW data for converting CJK characters");
|
|
|
|
|
|
|
|
|
|
/*
|
2002-06-13 21:14:05 +00:00
|
|
|
|
if (type == JAPANESE) {
|
|
|
|
|
out.println(":: katakana-latin;");
|
|
|
|
|
out.println(":: hiragana-latin;");
|
|
|
|
|
}
|
2002-07-14 22:04:49 +00:00
|
|
|
|
out.println(":: fullwidth-halfwidth ();");
|
2002-07-21 08:43:39 +00:00
|
|
|
|
*/
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
System.out.println("Total: " + totalCount);
|
|
|
|
|
System.out.println("Defined Count: " + count);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log.println();
|
2002-08-04 21:38:45 +00:00
|
|
|
|
log.println("@Duplicates (Frequency Order");
|
|
|
|
|
log.println();
|
|
|
|
|
it = rankList.iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
String word = (String) it.next();
|
|
|
|
|
Collection dups = (Collection) duplicates.get(word);
|
|
|
|
|
if (dups == null) continue;
|
|
|
|
|
log.print(hex.transliterate(word) + "\t" + word + "\t");
|
|
|
|
|
Iterator it2 = dups.iterator();
|
|
|
|
|
boolean gotFirst = false;
|
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
|
if (!gotFirst) gotFirst = true;
|
|
|
|
|
else log.print(", ");
|
|
|
|
|
log.print(it2.next());
|
|
|
|
|
}
|
|
|
|
|
if (overrideSet.contains(word)) log.print(" *override*");
|
|
|
|
|
log.println();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
log.println();
|
|
|
|
|
log.println("@Duplicates (Character Order)");
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log.println();
|
|
|
|
|
it = duplicates.keySet().iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
String word = (String) it.next();
|
|
|
|
|
log.print(hex.transliterate(word) + "\t" + word + "\t");
|
|
|
|
|
Collection dups = (Collection) duplicates.get(word);
|
|
|
|
|
Iterator it2 = dups.iterator();
|
|
|
|
|
boolean gotFirst = false;
|
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
|
if (!gotFirst) gotFirst = true;
|
|
|
|
|
else log.print(", ");
|
|
|
|
|
log.print(it2.next());
|
|
|
|
|
}
|
2002-08-04 21:38:45 +00:00
|
|
|
|
if (overrideSet.contains(word)) log.print(" *override*");
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log.println();
|
|
|
|
|
}
|
|
|
|
|
|
2001-10-25 20:37:09 +00:00
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
System.out.println("Exception: " + e);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
} finally {
|
|
|
|
|
if (log != null) log.close();
|
|
|
|
|
if (err != null) err.close();
|
2002-07-21 08:43:39 +00:00
|
|
|
|
if (out != null) out.close();
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
//http://fog.ccsf.cc.ca.us/~jliou/phonetic.htm
|
|
|
|
|
// Longer entries must come AFTER the shorter entries they start with (e.g. "zh" after "z").
|
|
|
|
|
static final String[] initialPinyin = {
|
|
|
|
|
"",
|
|
|
|
|
"b", "p", "m", "f",
|
|
|
|
|
"d", "t", "n", "l",
|
|
|
|
|
"z", "c", "s",
|
|
|
|
|
"zh", "ch", "sh", "r",
|
|
|
|
|
"j", "q", "x",
|
|
|
|
|
"g", "k", "h",
|
|
|
|
|
"y", "w"}; // added to make checking simpler
|
|
|
|
|
|
|
|
|
|
static final String[] finalPinyin = {
|
|
|
|
|
"a", "ai", "ao", "an", "ang",
|
|
|
|
|
"o", "ou", "ong",
|
|
|
|
|
"e", "ei", "er", "en", "eng",
|
|
|
|
|
"i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
|
|
|
|
|
"u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng",
|
|
|
|
|
"<EFBFBD>", "<EFBFBD>e", "<EFBFBD>an", "<EFBFBD>n"
|
|
|
|
|
};
|
|
|
|
|
// Don't bother with the following rules; just add w,y to initials
|
|
|
|
|
// When "i" stands alone, a "y" will be added before it as "yi".
// If "i" is the first letter of the syllable it will be changed to "y".
// When "u" stands alone, a "w" will be added before it as "wu".
// If "u" is the first letter of the syllable it will be changed to "w", e.g. "uang -> wang".
// When "ü" stands alone, a "y" will be added before it and "ü" will be changed to "u" as "yu".
// If "ü" is the first letter of the syllable, then the spelling will be changed to "yu", e.g. "üan -> yuan".
// Note: The nasal final "ueng" never occurs after an initial but always forms a syllable by itself.
// The "o" in "iou" is hidden, so it will be written as "iu". But don't forget to pronounce it.
// The "e" in "uei" is hidden, so it will be written as "ui". But don't forget to pronounce it.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static final String[] pinyin_bopomofo = {
|
|
|
|
|
"a", "\u311a",
|
|
|
|
|
"ai", "\u311e",
|
|
|
|
|
"an", "\u3122",
|
|
|
|
|
"ang", "\u3124",
|
|
|
|
|
"ao", "\u3120",
|
|
|
|
|
"ba", "\u3105\u311a",
|
|
|
|
|
"bai", "\u3105\u311e",
|
|
|
|
|
"ban", "\u3105\u3122",
|
|
|
|
|
"bang", "\u3105\u3124",
|
|
|
|
|
"bao", "\u3105\u3120",
|
|
|
|
|
"bei", "\u3105\u311f",
|
|
|
|
|
"ben", "\u3105\u3123",
|
|
|
|
|
"beng", "\u3105\u3125",
|
|
|
|
|
"bi", "\u3105\u3127",
|
|
|
|
|
"bian", "\u3105\u3127\u3122",
|
|
|
|
|
"biao", "\u3105\u3127\u3120",
|
|
|
|
|
"bie", "\u3105\u3127\u311d",
|
|
|
|
|
"bin", "\u3105\u3127\u3123",
|
|
|
|
|
"bing", "\u3105\u3127\u3125",
|
|
|
|
|
"bo", "\u3105\u311b",
|
|
|
|
|
"bu", "\u3105\u3128",
|
|
|
|
|
"ca", "\u3118\u311a",
|
|
|
|
|
"cai", "\u3118\u311e",
|
|
|
|
|
"can", "\u3118\u3122",
|
|
|
|
|
"cang", "\u3118\u3124",
|
|
|
|
|
"cao", "\u3118\u3120",
|
|
|
|
|
"ce", "\u3118",
|
|
|
|
|
"cen", "\u3118\u3123",
|
|
|
|
|
"ceng", "\u3118\u3125",
|
|
|
|
|
"cha", "\u3114\u311a",
|
|
|
|
|
"chai", "\u3114\u311e",
|
|
|
|
|
"chan", "\u3114\u3122",
|
|
|
|
|
"chang", "\u3114\u3124",
|
|
|
|
|
"chao", "\u3114\u3120",
|
|
|
|
|
"che", "\u3114\u311c",
|
|
|
|
|
"chen", "\u3114\u3123",
|
|
|
|
|
"cheng", "\u3114\u3125",
|
|
|
|
|
"chi", "\u3114",
|
|
|
|
|
"chong", "\u3114\u3121\u3125",
|
|
|
|
|
"chou", "\u3114\u3121",
|
|
|
|
|
"chu", "\u3114\u3128",
|
|
|
|
|
//"chua", "XXX",
|
|
|
|
|
"chuai", "\u3114\u3128\u311e",
|
|
|
|
|
"chuan", "\u3114\u3128\u3122",
|
|
|
|
|
"chuang", "\u3114\u3128\u3124",
|
|
|
|
|
"chui", "\u3114\u3128\u311f",
|
|
|
|
|
"chun", "\u3114\u3128\u3123",
|
|
|
|
|
"chuo", "\u3114\u3128\u311b",
|
|
|
|
|
"ci", "\u3118",
|
|
|
|
|
"cong", "\u3118\u3128\u3125",
|
|
|
|
|
"cou", "\u3118\u3121",
|
|
|
|
|
"cu", "\u3118\u3128",
|
|
|
|
|
"cuan", "\u3118\u3128\u3122",
|
|
|
|
|
"cui", "\u3118\u3128\u311f",
|
|
|
|
|
"cun", "\u3118\u3128\u3123",
|
|
|
|
|
"cuo", "\u3118\u3128\u311b",
|
|
|
|
|
"da", "\u3109\u311a",
|
|
|
|
|
"dai", "\u3109\u311e",
|
|
|
|
|
"dan", "\u3109\u3122",
|
|
|
|
|
"dang", "\u3109\u3124",
|
|
|
|
|
"dao", "\u3109\u3120",
|
|
|
|
|
"de", "\u3109\u311c",
|
|
|
|
|
"dei", "\u3109\u311f",
|
|
|
|
|
"den", "\u3109\u3123",
|
|
|
|
|
"deng", "\u3109\u3125",
|
|
|
|
|
"di", "\u3109\u3127",
|
|
|
|
|
"dia", "\u3109\u3127\u311a",
|
|
|
|
|
"dian", "\u3109\u3127\u3122",
|
|
|
|
|
"diao", "\u3109\u3127\u3120",
|
|
|
|
|
"die", "\u3109\u3127\u311d",
|
|
|
|
|
"ding", "\u3109\u3127\u3125",
|
|
|
|
|
"diu", "\u3109\u3127\u3121",
|
|
|
|
|
"dong", "\u3109\u3128\u3125",
|
|
|
|
|
"dou", "\u3109\u3121",
|
|
|
|
|
"du", "\u3109\u3128",
|
|
|
|
|
"duan", "\u3109\u3128\u3122",
|
|
|
|
|
"dui", "\u3109\u3128\u311f",
|
|
|
|
|
"dun", "\u3109\u3128\u3123",
|
|
|
|
|
"duo", "\u3109\u3128\u311b",
|
|
|
|
|
"e", "\u311c",
|
|
|
|
|
"ei", "\u311f",
|
|
|
|
|
"en", "\u3123",
|
|
|
|
|
"eng", "\u3125",
|
|
|
|
|
"er", "\u3126",
|
|
|
|
|
"fa", "\u3108\u311a",
|
|
|
|
|
"fan", "\u3108\u3122",
|
|
|
|
|
"fang", "\u3108\u3124",
|
|
|
|
|
"fei", "\u3108\u311f",
|
|
|
|
|
"fen", "\u3108\u3123",
|
|
|
|
|
"feng", "\u3108\u3125",
|
|
|
|
|
"fo", "\u3108\u311b",
|
|
|
|
|
"fou", "\u3108\u3121",
|
|
|
|
|
"fu", "\u3108\u3128",
|
|
|
|
|
"ga", "\u310d\u311a",
|
|
|
|
|
"gai", "\u310d\u311e",
|
|
|
|
|
"gan", "\u310d\u3122",
|
|
|
|
|
"gang", "\u310d\u3124",
|
|
|
|
|
"gao", "\u310d\u3120",
|
|
|
|
|
"ge", "\u310d\u311c",
|
|
|
|
|
"gei", "\u310d\u311f",
|
|
|
|
|
"gen", "\u310d\u3123",
|
|
|
|
|
"geng", "\u310d\u3125",
|
|
|
|
|
"gong", "\u310d\u3128\u3125",
|
|
|
|
|
"gou", "\u310d\u3121",
|
|
|
|
|
"gu", "\u310d\u3128",
|
|
|
|
|
"gua", "\u310d\u3128\u311a",
|
|
|
|
|
"guai", "\u310d\u3128\u311e",
|
|
|
|
|
"guan", "\u310d\u3128\u3122",
|
|
|
|
|
"guang", "\u310d\u3128\u3124",
|
|
|
|
|
"gui", "\u310d\u3128\u311f",
|
|
|
|
|
"gun", "\u310d\u3128\u3123",
|
|
|
|
|
"guo", "\u310d\u3128\u311b",
|
|
|
|
|
"ha", "\u310f\u311a",
|
|
|
|
|
"hai", "\u310f\u311e",
|
|
|
|
|
"han", "\u310f\u3122",
|
|
|
|
|
"hang", "\u310f\u3124",
|
|
|
|
|
"hao", "\u310f\u3120",
|
|
|
|
|
"he", "\u310f\u311c",
|
|
|
|
|
"hei", "\u310f\u311f",
|
|
|
|
|
"hen", "\u310f\u3123",
|
|
|
|
|
"heng", "\u310f\u3125",
|
|
|
|
|
"hm", "\u310f\u3107",
|
|
|
|
|
"hng", "\u310f\u312b", // 'dialect of n'
|
|
|
|
|
"hong", "\u310f\u3128\u3125",
|
|
|
|
|
"hou", "\u310f\u3121",
|
|
|
|
|
"hu", "\u310f\u3128",
|
|
|
|
|
"hua", "\u310f\u3128\u311a",
|
|
|
|
|
"huai", "\u310f\u3128\u311e",
|
|
|
|
|
"huan", "\u310f\u3128\u3122",
|
|
|
|
|
"huang", "\u310f\u3128\u3124",
|
|
|
|
|
"hui", "\u310f\u3128\u311f",
|
|
|
|
|
"hun", "\u310f\u3128\u3123",
|
|
|
|
|
"huo", "\u310f\u3128\u311b",
|
|
|
|
|
"ji", "\u3110\u3127",
|
|
|
|
|
"jia", "\u3110\u3127\u311a",
|
|
|
|
|
"jian", "\u3110\u3127\u3122",
|
|
|
|
|
"jiang", "\u3110\u3127\u3124",
|
|
|
|
|
"jiao", "\u3110\u3127\u3120",
|
|
|
|
|
"jie", "\u3110\u3127\u311d",
|
|
|
|
|
"jin", "\u3110\u3127\u3123",
|
|
|
|
|
"jing", "\u3110\u3127\u3125",
|
|
|
|
|
"jiong", "\u3110\u3129\u3125",
|
|
|
|
|
"jiu", "\u3110\u3127\u3121",
|
|
|
|
|
"ju", "\u3110\u3129",
|
|
|
|
|
"juan", "\u3110\u3129\u3122",
|
|
|
|
|
"jue", "\u3110\u3129\u311d",
|
|
|
|
|
"jun", "\u3110\u3129\u3123",
|
|
|
|
|
"ka", "\u310e\u311a",
|
|
|
|
|
"kai", "\u310e\u311e",
|
|
|
|
|
"kan", "\u310e\u3122",
|
|
|
|
|
"kang", "\u310e\u3124",
|
|
|
|
|
"kao", "\u310e\u3120",
|
|
|
|
|
"ke", "\u310e\u311c",
|
|
|
|
|
"kei", "\u310e\u311f",
|
|
|
|
|
"ken", "\u310e\u3123",
|
|
|
|
|
"keng", "\u310e\u3125",
|
|
|
|
|
"kong", "\u310e\u3128\u3125",
|
|
|
|
|
"kou", "\u310e\u3121",
|
|
|
|
|
"ku", "\u310e\u3128",
|
|
|
|
|
"kua", "\u310e\u3128\u311a",
|
|
|
|
|
"kuai", "\u310e\u3128\u311e",
|
|
|
|
|
"kuan", "\u310e\u3128\u3122",
|
|
|
|
|
"kuang", "\u310e\u3128\u3124",
|
|
|
|
|
"kui", "\u310e\u3128\u311f",
|
|
|
|
|
"kun", "\u310e\u3128\u3123",
|
|
|
|
|
"kuo", "\u310e\u3128\u311b",
|
|
|
|
|
"la", "\u310c\u311a",
|
|
|
|
|
"lai", "\u310c\u311e",
|
|
|
|
|
"lan", "\u310c\u3122",
|
|
|
|
|
"lang", "\u310c\u3124",
|
|
|
|
|
"lao", "\u310c\u3120",
|
|
|
|
|
"le", "\u310c\u311c",
|
|
|
|
|
"lei", "\u310c\u311f",
|
|
|
|
|
"leng", "\u310c\u3125",
|
|
|
|
|
"li", "\u310c\u3127",
|
|
|
|
|
"lia", "\u310c\u3127\u311a",
|
|
|
|
|
"lian", "\u310c\u3127\u3122",
|
|
|
|
|
"liang", "\u310c\u3127\u3124",
|
|
|
|
|
"liao", "\u310c\u3127\u3120",
|
|
|
|
|
"lie", "\u310c\u3127\u311d",
|
|
|
|
|
"lin", "\u310c\u3127\u3123",
|
|
|
|
|
"ling", "\u310c\u3127\u3125",
|
|
|
|
|
"liu", "\u310c\u3127\u3121",
|
|
|
|
|
"lo", "\u310c\u311b",
|
|
|
|
|
"long", "\u310c\u3128\u3125",
|
|
|
|
|
"lou", "\u310c\u3121",
|
|
|
|
|
"lu", "\u310c\u3128",
|
|
|
|
|
"l<EFBFBD>", "\u310c\u3129",
|
|
|
|
|
"luan", "\u310c\u3128\u3122",
|
|
|
|
|
"l<EFBFBD>e", "\u310c\u3129\u311d",
|
|
|
|
|
"lun", "\u310c\u3128\u3123",
|
|
|
|
|
"luo", "\u310c\u3128\u311b",
|
|
|
|
|
"m", "\u3107",
|
|
|
|
|
"ma", "\u3107\u311a",
|
|
|
|
|
"mai", "\u3107\u311e",
|
|
|
|
|
"man", "\u3107\u3122",
|
|
|
|
|
"mang", "\u3107\u3124",
|
|
|
|
|
"mao", "\u3107\u3120",
|
|
|
|
|
"me", "\u3107\u311c",
|
|
|
|
|
"mei", "\u3107\u311f",
|
|
|
|
|
"men", "\u3107\u3123",
|
|
|
|
|
"meng", "\u3107\u3125",
|
|
|
|
|
"mi", "\u3107\u3127",
|
|
|
|
|
"mian", "\u3107\u3127\u3122",
|
|
|
|
|
"miao", "\u3107\u3127\u3120",
|
|
|
|
|
"mie", "\u3107\u3127\u311d",
|
|
|
|
|
"min", "\u3107\u3127\u3123",
|
|
|
|
|
"ming", "\u3107\u3127\u3125",
|
|
|
|
|
"miu", "\u3107\u3127\u3121",
|
|
|
|
|
"mo", "\u3107\u311b",
|
|
|
|
|
"mou", "\u3107\u3121",
|
|
|
|
|
"mu", "\u3107\u3128",
|
|
|
|
|
"n", "\u310b",
|
|
|
|
|
"na", "\u310b\u311a",
|
|
|
|
|
"nai", "\u310b\u311e",
|
|
|
|
|
"nan", "\u310b\u3122",
|
|
|
|
|
"nang", "\u310b\u3124",
|
|
|
|
|
"nao", "\u310b\u3120",
|
|
|
|
|
"ne", "\u310b\u311c",
|
|
|
|
|
"nei", "\u310b\u311f",
|
|
|
|
|
"nen", "\u310b\u3123",
|
|
|
|
|
"neng", "\u310b\u3125",
|
|
|
|
|
"ng", "\u312b",
|
|
|
|
|
"ni", "\u310b\u3127",
|
|
|
|
|
"nian", "\u310b\u3127\u3122",
|
|
|
|
|
"niang", "\u310b\u3127\u3124",
|
|
|
|
|
"niao", "\u310b\u3127\u3120",
|
|
|
|
|
"nie", "\u310b\u3127\u311d",
|
|
|
|
|
"nin", "\u310b\u3127\u3123",
|
|
|
|
|
"ning", "\u310b\u3127\u3125",
|
|
|
|
|
"niu", "\u310b\u3127\u3121",
|
|
|
|
|
"nong", "\u310b\u3128\u3125",
|
|
|
|
|
"nou", "\u310b\u3121",
|
|
|
|
|
"nu", "\u310b\u3128",
|
|
|
|
|
"n<EFBFBD>", "\u310b\u3129",
|
|
|
|
|
"nuan", "\u310b\u3128\u3122",
|
|
|
|
|
"n<EFBFBD>e", "\u310b\u3129\u311d",
|
|
|
|
|
"nuo", "\u310b\u3128\u311b",
|
|
|
|
|
"o", "\u311b",
|
|
|
|
|
"ou", "\u3121",
|
|
|
|
|
"pa", "\u3106\u311a",
|
|
|
|
|
"pai", "\u3106\u311e",
|
|
|
|
|
"pan", "\u3106\u3122",
|
|
|
|
|
"pang", "\u3106\u3124",
|
|
|
|
|
"pao", "\u3106\u3120",
|
|
|
|
|
"pei", "\u3106\u311f",
|
|
|
|
|
"pen", "\u3106\u3123",
|
|
|
|
|
"peng", "\u3106\u3125",
|
|
|
|
|
"pi", "\u3106\u3127",
|
|
|
|
|
"pian", "\u3106\u3127\u3122",
|
|
|
|
|
"piao", "\u3106\u3127\u3120",
|
|
|
|
|
"pie", "\u3106\u3127\u311d",
|
|
|
|
|
"pin", "\u3106\u3127\u3123",
|
|
|
|
|
"ping", "\u3106\u3127\u3125",
|
|
|
|
|
"po", "\u3106\u311b",
|
|
|
|
|
"pou", "\u3106\u3121",
|
|
|
|
|
"pu", "\u3106\u3128",
|
|
|
|
|
"qi", "\u3111",
|
|
|
|
|
"qia", "\u3111\u3127\u311a",
|
|
|
|
|
"qian", "\u3111\u3127\u3122",
|
|
|
|
|
"qiang", "\u3111\u3127\u3124",
|
|
|
|
|
"qiao", "\u3111\u3127\u3120",
|
|
|
|
|
"qie", "\u3111\u3127\u311d",
|
|
|
|
|
"qin", "\u3111\u3127\u3123",
|
|
|
|
|
"qing", "\u3111\u3127\u3125",
|
|
|
|
|
"qiong", "\u3111\u3129\u3125",
|
|
|
|
|
"qiu", "\u3111\u3129\u3121",
|
|
|
|
|
"qu", "\u3111\u3129",
|
|
|
|
|
"quan", "\u3111\u3129\u3122",
|
|
|
|
|
"que", "\u3111\u3129\u311d",
|
|
|
|
|
"qun", "\u3111\u3129\u3123",
|
|
|
|
|
"ran", "\u3116\u3122",
|
|
|
|
|
"rang", "\u3116\u3124",
|
|
|
|
|
"rao", "\u3116\u3120",
|
|
|
|
|
"re", "\u3116\u311c",
|
|
|
|
|
"ren", "\u3116\u3123",
|
|
|
|
|
"reng", "\u3116\u3125",
|
|
|
|
|
"ri", "\u3116",
|
|
|
|
|
"rong", "\u3116\u3128\u3125",
|
|
|
|
|
"rou", "\u3116\u3121",
|
|
|
|
|
"ru", "\u3116\u3128",
|
|
|
|
|
"ruan", "\u3116\u3128\u3122",
|
|
|
|
|
"rui", "\u3116\u3128\u311f",
|
|
|
|
|
"run", "\u3116\u3128\u3123",
|
|
|
|
|
"ruo", "\u3116\u3128\u311b",
|
|
|
|
|
"sa", "\u3119\u311a",
|
|
|
|
|
"sai", "\u3119\u311e",
|
|
|
|
|
"san", "\u3119\u3122",
|
|
|
|
|
"sang", "\u3119\u3124",
|
|
|
|
|
"sao", "\u3119\u3120",
|
|
|
|
|
"se", "\u3119\u311c",
|
|
|
|
|
"sen", "\u3119\u3123",
|
|
|
|
|
"seng", "\u3119\u3125",
|
|
|
|
|
"sha", "\u3115\u311a",
|
|
|
|
|
"shai", "\u3115\u311e",
|
|
|
|
|
"shan", "\u3115\u3122",
|
|
|
|
|
"shang", "\u3115\u3124",
|
|
|
|
|
"shao", "\u3115\u3120",
|
|
|
|
|
"she", "\u3115\u311c",
|
|
|
|
|
"shei", "\u3115\u311f",
|
|
|
|
|
"shen", "\u3115\u3123",
|
|
|
|
|
"sheng", "\u3115\u3125",
|
|
|
|
|
"shi", "\u3115",
|
|
|
|
|
"shou", "\u3115\u3121",
|
|
|
|
|
"shu", "\u3115\u3128",
|
|
|
|
|
"shua", "\u3115\u3128\u311a",
|
|
|
|
|
"shuai", "\u3115\u3128\u311e",
|
|
|
|
|
"shuan", "\u3115\u3128\u3122",
|
|
|
|
|
"shuang", "\u3115\u3128\u3124",
|
|
|
|
|
"shui", "\u3115\u3128\u311f",
|
|
|
|
|
"shun", "\u3115\u3128\u3123",
|
|
|
|
|
"shuo", "\u3115\u3128\u311b",
|
|
|
|
|
"si", "\u3119",
|
|
|
|
|
"song", "\u3119\u3128\u3125",
|
|
|
|
|
"sou", "\u3119\u3121",
|
|
|
|
|
"su", "\u3119\u3128",
|
|
|
|
|
"suan", "\u3119\u3128\u3122",
|
|
|
|
|
"sui", "\u3119\u3128\u311f",
|
|
|
|
|
"sun", "\u3119\u3128\u3123",
|
|
|
|
|
"suo", "\u3119\u3128\u311b",
|
|
|
|
|
"ta", "\u310a\u311a",
|
|
|
|
|
"tai", "\u310a\u311e",
|
|
|
|
|
"tan", "\u310a\u3122",
|
|
|
|
|
"tang", "\u310a\u3124",
|
|
|
|
|
"tao", "\u310a\u3120",
|
|
|
|
|
"te", "\u310a\u311c",
|
|
|
|
|
"teng", "\u310a\u3125",
|
|
|
|
|
"ti", "\u310a\u3127",
|
|
|
|
|
"tian", "\u310a\u3127\u3122",
|
|
|
|
|
"tiao", "\u310a\u3127\u3120",
|
|
|
|
|
"tie", "\u310a\u3127\u311d",
|
|
|
|
|
"ting", "\u310a\u3127\u3125",
|
|
|
|
|
"tong", "\u310a\u3128\u3125",
|
|
|
|
|
"tou", "\u310a\u3121",
|
|
|
|
|
"tu", "\u310a\u3128",
|
|
|
|
|
"tuan", "\u310a\u3128\u3122",
|
|
|
|
|
"tui", "\u310a\u3128\u311f",
|
|
|
|
|
"tun", "\u310a\u3128\u3123",
|
|
|
|
|
"tuo", "\u310a\u3128\u311b",
|
|
|
|
|
"wa", "\u3128\u311a",
|
|
|
|
|
"wai", "\u3128\u311e",
|
|
|
|
|
"wan", "\u3128\u3122",
|
|
|
|
|
"wang", "\u3128\u3124",
|
|
|
|
|
"wei", "\u3128\u311f",
|
|
|
|
|
"wen", "\u3128\u3123",
|
|
|
|
|
"weng", "\u3128\u3125",
|
|
|
|
|
"wo", "\u3128\u311b",
|
|
|
|
|
"wu", "\u3128",
|
|
|
|
|
"xi", "\u3112\u3127",
|
|
|
|
|
"xia", "\u3112\u3127\u311a",
|
|
|
|
|
"xian", "\u3112\u3127\u3122",
|
|
|
|
|
"xiang", "\u3112\u3127\u3124",
|
|
|
|
|
"xiao", "\u3112\u3127\u3120",
|
|
|
|
|
"xie", "\u3112\u3127\u311d",
|
|
|
|
|
"xin", "\u3112\u3127\u3123",
|
|
|
|
|
"xing", "\u3112\u3127\u3125",
|
|
|
|
|
"xiong", "\u3112\u3129\u3125",
|
|
|
|
|
"xiu", "\u3112\u3127\u3121",
|
|
|
|
|
"xu", "\u3112\u3129",
|
|
|
|
|
"xuan", "\u3112\u3129\u3122",
|
|
|
|
|
"xue", "\u3112\u3129\u311d",
|
|
|
|
|
"xun", "\u3112\u3129\u3123",
|
|
|
|
|
"ya", "\u3127\u311a",
|
|
|
|
|
"yai", "\u3127\u311e", // not in xinhua zidian index, but listed as alternate pronunciation
|
|
|
|
|
"yan", "\u3127\u3122",
|
|
|
|
|
"yang", "\u3127\u3124",
|
|
|
|
|
"yao", "\u3127\u3120",
|
|
|
|
|
"ye", "\u3127\u311d",
|
|
|
|
|
"yi", "\u3127",
|
|
|
|
|
"yin", "\u3127\u3123",
|
|
|
|
|
"ying", "\u3127\u3125",
|
|
|
|
|
"yo", "\u3127\u311b",
|
|
|
|
|
"yong", "\u3129\u3125",
|
|
|
|
|
"you", "\u3127\u3121",
|
|
|
|
|
"yu", "\u3129",
|
|
|
|
|
"yuan", "\u3129\u3122",
|
|
|
|
|
"yue", "\u3129\u311d",
|
|
|
|
|
"yun", "\u3129\u3123",
|
|
|
|
|
"za", "\u3117\u311a",
|
|
|
|
|
"zai", "\u3117\u311e",
|
|
|
|
|
"zan", "\u3117\u3122",
|
|
|
|
|
"zang", "\u3117\u3124",
|
|
|
|
|
"zao", "\u3117\u3120",
|
|
|
|
|
"ze", "\u3117",
|
|
|
|
|
"zei", "\u3117\u311f",
|
|
|
|
|
"zen", "\u3117\u3123",
|
|
|
|
|
"zeng", "\u3117\u3125",
|
|
|
|
|
"zha", "\u3113\u311a",
|
|
|
|
|
"zhai", "\u3113\u311e",
|
|
|
|
|
"zhan", "\u3113\u3122",
|
|
|
|
|
"zhang", "\u3113\u3124",
|
|
|
|
|
"zhao", "\u3113\u3120",
|
|
|
|
|
"zhe", "\u3113\u311d",
|
|
|
|
|
"zhei", "\u3113\u311f",
|
|
|
|
|
"zhen", "\u3113\u3123",
|
|
|
|
|
"zheng", "\u3113\u3125",
|
|
|
|
|
"zhi", "\u3113",
|
|
|
|
|
"zhong", "\u3113\u3128\u3125",
|
|
|
|
|
"zhou", "\u3113\u3121",
|
|
|
|
|
"zhu", "\u3113\u3128",
|
|
|
|
|
"zhua", "\u3113\u3128\u311a",
|
|
|
|
|
"zhuai", "\u3113\u3128\u311e",
|
|
|
|
|
"zhuan", "\u3113\u3128\u3122",
|
|
|
|
|
"zhuang", "\u3113\u3128\u3124",
|
|
|
|
|
"zhui", "\u3113\u3128\u311f",
|
|
|
|
|
"zhun", "\u3113\u3128\u3123",
|
|
|
|
|
"zhuo", "\u3113\u3128\u311b",
|
|
|
|
|
"zi", "\u3117",
|
|
|
|
|
"zong", "\u3117\u3128\u3125",
|
|
|
|
|
"zou", "\u3117\u3121",
|
|
|
|
|
"zu", "\u3117\u3128",
|
|
|
|
|
"zuan", "\u3117\u3128\u3122",
|
|
|
|
|
"zui", "\u3117\u3128\u311f",
|
|
|
|
|
"zun", "\u3117\u3128\u3123",
|
|
|
|
|
"zuo", "\u3117\u3128\u311b",
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
 * The pinyin half of the {@code pinyin_bopomofo} table: that array alternates
 * pinyin syllable, bopomofo spelling, pinyin syllable, ... and only the
 * even-indexed (pinyin) entries are collected here.
 */
static final Set fullPinyin = new TreeSet();

static {
    int index = 0;
    while (index < pinyin_bopomofo.length) {
        fullPinyin.add(pinyin_bopomofo[index]);
        index += 2; // skip over the bopomofo entry that follows each syllable
    }
}
|
|
|
|
|
|
|
|
|
|
static boolean isValidPinyin(String s) {
|
|
|
|
|
s = dropTones.transliterate(s);
|
|
|
|
|
if (fullPinyin.contains(s)) return true;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static boolean isValidPinyin2(String s) {
|
|
|
|
|
s = dropTones.transliterate(s);
|
|
|
|
|
for (int i = initialPinyin.length-1; i >= 0; --i) {
|
|
|
|
|
if (s.startsWith(initialPinyin[i])) {
|
|
|
|
|
String end = s.substring(initialPinyin[i].length());
|
|
|
|
|
for (int j = finalPinyin.length-1; j >= 0; --j) {
|
|
|
|
|
if (end.equals(finalPinyin[j])) return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
U+347C <EFBFBD> li<EFBFBD> #lyu<EFBFBD>
|
|
|
|
|
U+3500 <EFBFBD> l<EFBFBD><EFBFBD> #lv<EFBFBD>
|
|
|
|
|
U+3527 <EFBFBD> li<EFBFBD> #ly<EFBFBD>
|
|
|
|
|
U+3729 <EFBFBD> <EFBFBD>o #<EFBFBD>u
|
|
|
|
|
U+380E <EFBFBD> j<EFBFBD> #jj<EFBFBD>
|
|
|
|
|
U+3825 <EFBFBD> l<EFBFBD> #lv<EFBFBD>
|
|
|
|
|
U+3A3C <EFBFBD> l<EFBFBD><EFBFBD> #lu<EFBFBD>
|
|
|
|
|
U+3B5A <EFBFBD> li<EFBFBD> #ly<EFBFBD> *** l<EFBFBD>?
|
|
|
|
|
U+3CB6 <EFBFBD> l<EFBFBD> #lv<EFBFBD>
|
|
|
|
|
U+3D56 <EFBFBD> ni<EFBFBD> #ny<EFBFBD> *** n<EFBFBD>?
|
|
|
|
|
U+3D88 <EFBFBD> li<EFBFBD>ng #li<EFBFBD>ng
|
|
|
|
|
U+3EF2 <EFBFBD> li<EFBFBD> #ly<EFBFBD>*** l<EFBFBD>?
|
|
|
|
|
U+3F94 <EFBFBD> li<EFBFBD> #ly<EFBFBD>*** l<EFBFBD>?
|
|
|
|
|
U+4071 <EFBFBD> <EFBFBD>o #<EFBFBD>u
|
|
|
|
|
U+40AE <EFBFBD> li<EFBFBD> #lyu<EFBFBD> *** l<EFBFBD>e?
|
|
|
|
|
U+430E <EFBFBD> li<EFBFBD> #lyu<EFBFBD> *** l<EFBFBD>e?
|
|
|
|
|
U+451E <EFBFBD> li<EFBFBD> #ly<EFBFBD> *** l<EFBFBD>?
|
|
|
|
|
U+4588 <EFBFBD> n<EFBFBD><EFBFBD> #nu<EFBFBD>
|
|
|
|
|
U+458B <EFBFBD> n<EFBFBD><EFBFBD> #nu<EFBFBD>
|
|
|
|
|
U+45A1 <EFBFBD> ni<EFBFBD> #ny<EFBFBD> *** n<EFBFBD>?
|
|
|
|
|
U+4610 <EFBFBD> ni<EFBFBD> #ny<EFBFBD> *** n<EFBFBD>?
|
|
|
|
|
U+46BC <EFBFBD> ni<EFBFBD> #ny<EFBFBD> *** n<EFBFBD>?
|
|
|
|
|
U+46DA <EFBFBD> li<EFBFBD> #lyu<EFBFBD> *** l<EFBFBD>e?
|
|
|
|
|
U+4896 <EFBFBD> li<EFBFBD> #ly<EFBFBD> *** l<EFBFBD>?
|
|
|
|
|
U+4923 <EFBFBD> li<EFBFBD> #lyu<EFBFBD> *** l<EFBFBD>e?
|
|
|
|
|
U+4968 <EFBFBD> li<EFBFBD> #ly<EFBFBD> *** l<EFBFBD>?
|
|
|
|
|
U+4A0B <EFBFBD> ni<EFBFBD> #nyu<EFBFBD> *** n<EFBFBD>e?
|
|
|
|
|
U+4AC4 <EFBFBD> chu<EFBFBD> #chu<EFBFBD>
|
|
|
|
|
U+4D08 <EFBFBD> <EFBFBD>o #<EFBFBD>u
|
|
|
|
|
U+4D8A <EFBFBD> ni<EFBFBD> #ny<EFBFBD> *** n<EFBFBD>?
|
|
|
|
|
U+51CA <EFBFBD> q<EFBFBD>ng #q<EFBFBD>ng
|
|
|
|
|
U+51D6 <EFBFBD> zhu<EFBFBD>n #zhu<EFBFBD>n *** this is probably zh<EFBFBD>n
|
|
|
|
|
U+5481 <EFBFBD> g<EFBFBD>n #g<EFBFBD>m
|
|
|
|
|
U+5838 <EFBFBD> f<EFBFBD>ng #f<EFBFBD>ng
|
|
|
|
|
U+639F <EFBFBD> l<EFBFBD><EFBFBD> #lu<EFBFBD> *** this pronunciation surprises me, but I don't know...
|
|
|
|
|
U+66D5 <EFBFBD> y<EFBFBD>n #yi<EFBFBD>n
|
|
|
|
|
U+6B3B <EFBFBD> chu<EFBFBD> #chu<EFBFBD> *** chua _is_ ok after all, my table missed an entry
|
|
|
|
|
U+6B56 <EFBFBD> chu<EFBFBD> #chu<EFBFBD> *** chua
|
|
|
|
|
U+6C7C <EFBFBD> ni<EFBFBD> #ni<EFBFBD>u
|
|
|
|
|
U+6E6D <EFBFBD> qi<EFBFBD> #qi<EFBFBD>u
|
|
|
|
|
U+6F71 <EFBFBD> y<EFBFBD> #yi<EFBFBD>
|
|
|
|
|
U+7493 <EFBFBD> xi<EFBFBD> #xi<EFBFBD>u
|
|
|
|
|
U+7607 <EFBFBD> zh<EFBFBD>ng #zh<EFBFBD>ng *** I suspect zh<EFBFBD>ng
|
|
|
|
|
U+7674 <EFBFBD> lu<EFBFBD>n #l<EFBFBD><EFBFBD>n
|
|
|
|
|
U+7867 <EFBFBD> y<EFBFBD>ng #i<EFBFBD>ng
|
|
|
|
|
U+7878 <EFBFBD> n<EFBFBD><EFBFBD> #nu<EFBFBD>
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
static Transliterator fixTypos = Transliterator.createFromRules("fix_typos",
|
|
|
|
|
"$cons=[bcdfghjklmnpqrstvwxyz];"
|
|
|
|
|
+"$nlet=[^[:Letter:][:Mark:]];"
|
|
|
|
|
+"$cons{iou}$nlet > iu;"
|
|
|
|
|
+"$cons{em}$nlet > an;"
|
|
|
|
|
+"$cons{uen}$nlet > ueng;"
|
|
|
|
|
+"$cons{ve}$nlet > <20>e;"
|
|
|
|
|
+"$cons{v}$nlet > <20>;"
|
|
|
|
|
+"$cons{yue}$nlet > iu;"
|
|
|
|
|
+"$cons{yng}$nlet > ing;"
|
|
|
|
|
+"$cons{yu}$nlet > iu;"
|
|
|
|
|
//+"$cons{ue} > <20>e;"
|
|
|
|
|
+"jj > j;"
|
|
|
|
|
//+"$nlet{ng}$nlet > eng;"
|
|
|
|
|
//+"$nlet{n}$nlet > en;"
|
|
|
|
|
//+"$nlet{m}$nlet > en;"
|
|
|
|
|
+"$nlet{au}$nlet > ao;"
|
|
|
|
|
|
|
|
|
|
// new fixes
|
|
|
|
|
+"zhueng}$nlet > zhong;"
|
|
|
|
|
+"zhuen}$nlet > zhuan;"
|
|
|
|
|
+"lue > l<>e;"
|
|
|
|
|
+"liong > liang;"
|
|
|
|
|
+"nue > n<>e;"
|
|
|
|
|
+"chua > chuo;"
|
|
|
|
|
+"yian > yan;"
|
|
|
|
|
+"yie > ye;"
|
|
|
|
|
+"l<EFBFBD>an > luan;"
|
|
|
|
|
+"iong > yong;"
|
|
|
|
|
, Transliterator.FORWARD);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static String fixPinyin(String s) {
|
|
|
|
|
String original = s;
|
|
|
|
|
//err.println("Source: " + s);
|
|
|
|
|
s = accentPinyin_digitPinyin.transliterate(s);
|
|
|
|
|
//err.println("Digit: " + s);
|
|
|
|
|
s = fixTypos.transliterate(s);
|
|
|
|
|
//err.println("fixed: " + s);
|
|
|
|
|
s = digitPinyin_accentPinyin.transliterate(s);
|
|
|
|
|
//err.println("Result: " + s);
|
|
|
|
|
if (isValidPinyin(s)) return s;
|
|
|
|
|
return original;
|
|
|
|
|
}
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
/**
 * Shared log writer. fixChineseOverrides opens it as "Transliterate_log.txt";
 * compareUnihanWithCEDICT and fixChineseOverrides write their reports to it.
 */
static PrintWriter log;
|
2001-10-25 20:37:09 +00:00
|
|
|
|
/**
 * Shared output writer. fixChineseOverrides opens it as
 * "new_Chinese_override.txt" and writes one override line per input line.
 */
static PrintWriter out;
|
|
|
|
|
/**
 * Shared error writer. readOverrides reports "Overriding Failure" lines here.
 * NOTE(review): where it is opened is not visible in this part of the file.
 */
static PrintWriter err;
|
|
|
|
|
|
|
|
|
|
// NOTE(review): no use of this counter is visible in this part of the file;
// confirm its meaning against the code that updates it.
static int count;
|
2002-06-13 21:14:05 +00:00
|
|
|
|
// NOTE(review): no use of this counter is visible in this part of the file;
// confirm its meaning against the code that updates it.
static int totalCount;
|
2001-10-25 20:37:09 +00:00
|
|
|
|
// NOTE(review): no use of this field is visible in this part of the file;
// confirm its meaning against the code that updates it.
static int oldLine;
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static void readFrequencyData(int type) throws java.io.IOException {
|
2002-06-13 21:14:05 +00:00
|
|
|
|
String line = "";
|
|
|
|
|
try {
|
|
|
|
|
|
|
|
|
|
// chinese_frequency.txt
|
|
|
|
|
// 1 的 1588561 1588561 3.5008%
|
|
|
|
|
// japanese_frequency.txt
|
|
|
|
|
// 1 ? 17176
|
|
|
|
|
|
|
|
|
|
Set combinedRank = new TreeSet();
|
2002-07-21 08:43:39 +00:00
|
|
|
|
BufferedReader br;
|
2002-06-13 21:14:05 +00:00
|
|
|
|
int counter = 0;
|
2002-07-21 08:43:39 +00:00
|
|
|
|
Iterator it;
|
|
|
|
|
|
|
|
|
|
if (type == CHINESE) {
|
|
|
|
|
System.out.println("Reading chinese_frequency.txt");
|
2002-10-05 01:28:58 +00:00
|
|
|
|
br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", Utility.UTF8);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
counter = 0;
|
|
|
|
|
while (true) {
|
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
Utility.dot(counter++);
|
|
|
|
|
int tabPos = line.indexOf('\t');
|
|
|
|
|
int rank = Integer.parseInt(line.substring(0,tabPos));
|
|
|
|
|
int cp = line.charAt(tabPos+1);
|
|
|
|
|
//if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
|
|
|
|
|
combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp)));
|
|
|
|
|
}
|
|
|
|
|
br.close();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
if (type == JAPANESE) {
|
|
|
|
|
System.out.println("Reading japanese_frequency.txt");
|
|
|
|
|
|
2002-10-05 01:28:58 +00:00
|
|
|
|
br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", Utility.UTF8);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
Map japaneseMap = new HashMap();
|
|
|
|
|
while (true) {
|
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
Utility.dot(counter++);
|
|
|
|
|
int tabPos = line.indexOf(' ');
|
|
|
|
|
|
|
|
|
|
int tabPos2 = line.indexOf(' ', tabPos+1);
|
|
|
|
|
int freq = Integer.parseInt(line.substring(tabPos2+1));
|
|
|
|
|
|
|
|
|
|
for (int i = tabPos+1; i < tabPos2; ++i) {
|
|
|
|
|
int cp = line.charAt(i);
|
|
|
|
|
int script = Default.ucd.getScript(cp);
|
|
|
|
|
if (script != HAN_SCRIPT) {
|
|
|
|
|
if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT) {
|
|
|
|
|
System.out.println("Huh: " + Default.ucd.getCodeAndName(cp));
|
|
|
|
|
}
|
|
|
|
|
continue;
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
2002-07-21 08:43:39 +00:00
|
|
|
|
// if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
|
|
|
|
|
Utility.addCount(japaneseMap, UTF16.valueOf(cp), -freq);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
2002-07-21 08:43:39 +00:00
|
|
|
|
br.close();
|
|
|
|
|
// get rank order japanese
|
|
|
|
|
it = japaneseMap.keySet().iterator();
|
|
|
|
|
int countJapanese = 0;
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
Comparable key = (Comparable) it.next();
|
|
|
|
|
Comparable val = (Comparable) japaneseMap.get(key);
|
|
|
|
|
combinedRank.add(new Pair(new Integer(++countJapanese), key));
|
|
|
|
|
}
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int overallRank = 0;
|
|
|
|
|
it = combinedRank.iterator();
|
|
|
|
|
|
2002-08-04 21:38:45 +00:00
|
|
|
|
boolean showFrequency = false;
|
|
|
|
|
|
|
|
|
|
if (showFrequency) {
|
|
|
|
|
log.println();
|
|
|
|
|
log.println("@Frequency data: Rank of Character");
|
|
|
|
|
log.println();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// make up rankMap, rankList
|
2002-07-30 09:57:18 +00:00
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
while(it.hasNext()) {
|
|
|
|
|
Pair p = (Pair) it.next();
|
2002-08-04 21:38:45 +00:00
|
|
|
|
if (showFrequency) log.println(p.first + ", " + p.second);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
Object rank = rankMap.get(p.second);
|
|
|
|
|
if (rank == null) {
|
|
|
|
|
rankMap.put(p.second, new Integer(++overallRank));
|
|
|
|
|
rankList.add(p.second);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2002-08-04 21:38:45 +00:00
|
|
|
|
if (showFrequency) {
|
|
|
|
|
log.println();
|
|
|
|
|
log.println("@Frequency data: Character to Rank");
|
|
|
|
|
log.println();
|
|
|
|
|
|
|
|
|
|
// get full order
|
|
|
|
|
it = rankList.iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
Comparable key = (Comparable) it.next();
|
|
|
|
|
Comparable val = (Comparable) rankMap.get(key);
|
|
|
|
|
log.println(key + ", " + val);
|
|
|
|
|
}
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
throw new ChainException("Line \"{0}\"", new String[] {line}, e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void compareUnihanWithCEDICT() {
|
|
|
|
|
System.out.println("@Comparing CEDICT to Unihan");
|
|
|
|
|
log.println("@Comparing CEDICT to Unihan");
|
|
|
|
|
Iterator it = unihanMap.keySet().iterator();
|
|
|
|
|
List inCEDICT = new ArrayList();
|
|
|
|
|
List inUnihan = new ArrayList();
|
|
|
|
|
List inBoth = new ArrayList();
|
|
|
|
|
UnicodeSet noPinyin = new UnicodeSet();
|
|
|
|
|
UnicodeSet kPinyin = new UnicodeSet();
|
|
|
|
|
UnicodeSet tPinyin = new UnicodeSet();
|
|
|
|
|
UnicodeSet sPinyin = new UnicodeSet();
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < 0x10FFFF; ++i) {
|
|
|
|
|
if (!Default.ucd.isAllocated(i)) continue;
|
|
|
|
|
if (Default.ucd.getScript(i) != HAN_SCRIPT) continue;
|
|
|
|
|
Utility.dot(i);
|
|
|
|
|
|
|
|
|
|
String ch = UTF16.valueOf(i);
|
|
|
|
|
|
|
|
|
|
String pinyin = (String) unihanMap.get(ch);
|
|
|
|
|
if (pinyin == null) {
|
|
|
|
|
String ch2 = Default.nfkd.normalize(ch);
|
|
|
|
|
pinyin = (String) unihanMap.get(ch2);
|
|
|
|
|
if (pinyin != null) {
|
2002-07-14 22:04:49 +00:00
|
|
|
|
addCheck(ch, pinyin, "n/a");
|
2002-06-13 21:14:05 +00:00
|
|
|
|
kPinyin.add(i);
|
|
|
|
|
} else {
|
|
|
|
|
String trial = (String) simplifiedToTraditional.get(ch2);
|
|
|
|
|
if (trial != null) {
|
|
|
|
|
pinyin = (String) unihanMap.get(trial);
|
|
|
|
|
if (pinyin != null) {
|
2002-07-14 22:04:49 +00:00
|
|
|
|
addCheck(ch, pinyin, "n/a");
|
2002-06-13 21:14:05 +00:00
|
|
|
|
tPinyin.add(i);
|
|
|
|
|
} else {
|
|
|
|
|
trial = (String) traditionalToSimplified.get(ch2);
|
|
|
|
|
if (trial != null) {
|
|
|
|
|
pinyin = (String) unihanMap.get(trial);
|
|
|
|
|
if (pinyin != null) {
|
2002-07-14 22:04:49 +00:00
|
|
|
|
addCheck(ch, pinyin, "n/a");
|
2002-06-13 21:14:05 +00:00
|
|
|
|
sPinyin.add(i);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Map pinyinSet = (Map) cdict.get(ch);
|
|
|
|
|
if (pinyin == null) {
|
|
|
|
|
if (pinyinSet != null) inCEDICT.add(ch + " => " + pinyinSet);
|
|
|
|
|
noPinyin.add(i);
|
|
|
|
|
} else if (pinyinSet == null) {
|
|
|
|
|
inUnihan.add(ch + " => " + pinyin);
|
|
|
|
|
} else {
|
|
|
|
|
Object temp = pinyinSet.get(pinyin);
|
|
|
|
|
if (temp == null) {
|
|
|
|
|
inBoth.add(ch + " => " + pinyin + "; " + pinyinSet);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
log.println("@In CEDICT but not Unihan: ");
|
|
|
|
|
printCollection(log, inCEDICT);
|
|
|
|
|
|
|
|
|
|
log.println("@In Unihan but not CEDICT: ");
|
|
|
|
|
printCollection(log, inUnihan);
|
|
|
|
|
|
|
|
|
|
log.println("@In Unihan and CEDICT, but different: ");
|
|
|
|
|
printCollection(log, inBoth);
|
|
|
|
|
|
|
|
|
|
log.println("@Missing from Unihan: ");
|
|
|
|
|
log.println(noPinyin.toPattern(true));
|
|
|
|
|
|
|
|
|
|
log.println("@Has mapping if we NFKD it: ");
|
|
|
|
|
log.println(kPinyin.toPattern(true));
|
|
|
|
|
|
|
|
|
|
log.println("@Has mapping if we NFKC & simp-trad it: ");
|
|
|
|
|
log.println(tPinyin.toPattern(true));
|
|
|
|
|
|
|
|
|
|
log.println("@Has mapping if we NFKC & trad-simp it: ");
|
|
|
|
|
log.println(sPinyin.toPattern(true));
|
|
|
|
|
|
|
|
|
|
log.println("@Done comparison");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void printCollection(PrintWriter p, Collection c) {
|
|
|
|
|
Iterator it = c.iterator();
|
|
|
|
|
int count = 0;
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
p.println((++count) + "\t" + it.next());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Overall frequency rank of each character. readFrequencyData fills it,
 * giving each character that is not yet present the next rank, starting at 1.
 */
static Map rankMap = new TreeMap(); // maps from single char strings to overall rank
|
|
|
|
|
/**
 * The keys of rankMap in the order they were ranked. readFrequencyData
 * appends to this list whenever it adds an entry to rankMap.
 */
static List rankList = new ArrayList(10000);
|
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
// form: ???? [ai4 wu1 ji2 wu1] /love me/love my dog/
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static void readCDICTDefinitions(int type) throws IOException {
|
|
|
|
|
String fname = "cdict.txt";
|
|
|
|
|
if (type == JAPANESE) fname = "edict.txt";
|
|
|
|
|
|
|
|
|
|
System.out.println("Reading " + fname);
|
2002-10-05 01:28:58 +00:00
|
|
|
|
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
int counter = 0;
|
|
|
|
|
String[] pieces = new String[50];
|
|
|
|
|
String line = "";
|
2002-07-21 08:43:39 +00:00
|
|
|
|
String definition;
|
2002-07-14 22:04:49 +00:00
|
|
|
|
try {
|
|
|
|
|
while (true) {
|
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
Utility.dot(counter++);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int pinyinStart = line.indexOf('[');
|
|
|
|
|
int pinyinEnd = line.indexOf(']', pinyinStart+1);
|
|
|
|
|
int defStart = line.indexOf('/', pinyinEnd+1);
|
|
|
|
|
int defEnd = line.indexOf('/', defStart+1);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
|
|
|
|
int firstData = pinyinStart >= 0 ? pinyinStart : defStart;
|
|
|
|
|
|
|
|
|
|
String word = line.substring(0,firstData).trim();
|
|
|
|
|
|
|
|
|
|
if (type == DEFINITION) {
|
|
|
|
|
definition = fixDefinition(line.substring(defStart+1, defEnd), line);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
addCheck(word, definition, line);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
} else if (pinyinStart >= 0) {
|
|
|
|
|
definition = line.substring(pinyinStart+1, pinyinEnd).trim();
|
|
|
|
|
if (type == JAPANESE) {
|
|
|
|
|
processEdict(word, definition, line);
|
|
|
|
|
} else {
|
2003-02-25 23:38:23 +00:00
|
|
|
|
definition = digitToPinyin(definition, line);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
//definition = Utility.replace(definition, " ", "\\ ");
|
|
|
|
|
addCheck(word, definition, line);
|
|
|
|
|
}
|
2002-07-14 22:04:49 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
br.close();
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2002-08-04 21:38:45 +00:00
|
|
|
|
static void readOverrides(int type) throws IOException {
|
|
|
|
|
if (type != CHINESE) return;
|
|
|
|
|
String fname = "Chinese_override.txt";
|
|
|
|
|
|
|
|
|
|
System.out.println("Reading " + fname);
|
2002-10-05 01:28:58 +00:00
|
|
|
|
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
|
2002-08-04 21:38:45 +00:00
|
|
|
|
int counter = 0;
|
|
|
|
|
String[] pieces = new String[50];
|
|
|
|
|
String line = "";
|
2003-02-25 23:38:23 +00:00
|
|
|
|
boolean noOverrideFailure = true;
|
2002-08-04 21:38:45 +00:00
|
|
|
|
try {
|
|
|
|
|
while (true) {
|
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
Utility.dot(counter++);
|
2003-02-25 23:38:23 +00:00
|
|
|
|
//System.out.println(line);
|
2002-08-04 21:38:45 +00:00
|
|
|
|
|
|
|
|
|
// skip code
|
2003-02-25 23:38:23 +00:00
|
|
|
|
line=line.toLowerCase();
|
|
|
|
|
|
2002-08-04 21:38:45 +00:00
|
|
|
|
int wordStart = line.indexOf('\t') + 1;
|
|
|
|
|
int wordEnd = line.indexOf('\t', wordStart);
|
|
|
|
|
String word = line.substring(wordStart, wordEnd);
|
2003-02-25 23:38:23 +00:00
|
|
|
|
String definition = fixPinyin(line.substring(wordEnd+1));
|
|
|
|
|
String old = (String) unihanMap.get(word);
|
|
|
|
|
if (old != null) {
|
|
|
|
|
if (!old.equals(definition)) {
|
|
|
|
|
if (noOverrideFailure) {
|
|
|
|
|
System.out.println("Overriding Failure");
|
|
|
|
|
noOverrideFailure = false;
|
|
|
|
|
}
|
|
|
|
|
err.println("Overriding Failure: " + word
|
|
|
|
|
+ "\t" + old + " " + toHexUnicode.transliterate(old)
|
|
|
|
|
+ "\t" + definition + " " + toHexUnicode.transliterate(definition));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
addCheck(word, definition, line);
|
|
|
|
|
overrideSet.add(word);
|
|
|
|
|
}
|
2002-08-04 21:38:45 +00:00
|
|
|
|
}
|
|
|
|
|
br.close();
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
@Unihan Data
|
|
|
|
|
|
|
|
|
|
Bad pinyin data: \u4E7F ? LE
|
|
|
|
|
\u7684 ? de, de, d<EFBFBD>, d<EFBFBD>
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
static void fixChineseOverrides() throws IOException {
|
|
|
|
|
|
|
|
|
|
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
|
|
|
|
|
out = Utility.openPrintWriter("new_Chinese_override.txt", Utility.UTF8_WINDOWS);
|
|
|
|
|
try {
|
|
|
|
|
|
|
|
|
|
String fname = "fixed_Chinese_transliterate_log.txt";
|
|
|
|
|
|
|
|
|
|
int counter = 0;
|
|
|
|
|
String line = "";
|
|
|
|
|
String pinyinPrefix = "Bad pinyin data: ";
|
|
|
|
|
|
|
|
|
|
System.out.println("Reading " + fname);
|
|
|
|
|
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
|
|
|
|
|
try {
|
|
|
|
|
while (true) {
|
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
if (line.charAt(0) == 0xFEFF) {
|
|
|
|
|
line = line.substring(1); // remove BOM
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
}
|
|
|
|
|
Utility.dot(counter++);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (line.charAt(0) == '@') continue;
|
|
|
|
|
if (line.startsWith(pinyinPrefix)) {
|
|
|
|
|
line = line.substring(pinyinPrefix.length());
|
|
|
|
|
}
|
|
|
|
|
line = line.toLowerCase();
|
|
|
|
|
|
|
|
|
|
//System.out.println(Default.ucd.getCode(line));
|
|
|
|
|
// skip code
|
|
|
|
|
int wordStart = line.indexOf('\t') + 1;
|
|
|
|
|
int wordEnd = line.indexOf('\t', wordStart);
|
|
|
|
|
String word = line.substring(wordStart, wordEnd).trim();
|
|
|
|
|
|
|
|
|
|
int defStart = wordEnd+1;
|
|
|
|
|
int defEnd = line.indexOf(',', defStart);
|
|
|
|
|
if (defEnd < 0) defEnd = line.length();
|
|
|
|
|
|
|
|
|
|
String definition = fixCircumflex.transliterate(line.substring(defStart, defEnd).trim());
|
|
|
|
|
|
|
|
|
|
String notones = dropTones.transliterate(definition);
|
|
|
|
|
if (definition.equals(notones)) {
|
|
|
|
|
definition = digitPinyin_accentPinyin.transliterate(definition + "1");
|
|
|
|
|
if (definition == null) {
|
|
|
|
|
System.out.println("Huh? " + notones);
|
|
|
|
|
}
|
|
|
|
|
log.println("Fixing: " + notones + " => " + definition + "; " + line);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out.println(hex.transliterate(word) + "\t" + word + "\t" + definition);
|
|
|
|
|
}
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
|
|
|
|
} finally {
|
|
|
|
|
br.close();
|
|
|
|
|
}
|
|
|
|
|
} finally {
|
|
|
|
|
out.close();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2002-08-04 21:38:45 +00:00
|
|
|
|
/**
 * Words whose reading came from the override file: readOverrides adds a word
 * here when it registers a reading for a word that had no unihanMap entry.
 */
static Set overrideSet = new HashSet();
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static void processEdict(String word, String definition, String line) {
|
|
|
|
|
// We have a situation where we have words of the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH
|
|
|
|
|
// C = CJK, H = Hiragana, K = katakana
|
|
|
|
|
|
|
|
|
|
// We want to break those up into the following rules.
|
|
|
|
|
// { CCC } HHHKKKCCCHH => HHH
|
|
|
|
|
// CCCHHHKKK { CC } HHCCH => HH
|
|
|
|
|
// CCCHHHKKKCCHH { CC } H => HH
|
|
|
|
|
|
|
|
|
|
int[] offset = {0};
|
|
|
|
|
int[] offset2 = {0};
|
|
|
|
|
int[][] pairList = new int[50][2];
|
|
|
|
|
int pairCount = 0;
|
|
|
|
|
|
|
|
|
|
// first gather the information as to where the CJK blocks are
|
|
|
|
|
// do this all at once, so we can refer to stuff ahead of us
|
|
|
|
|
while (true) {
|
|
|
|
|
// find next CJK block
|
|
|
|
|
// where CJK really means anything but kana
|
|
|
|
|
int type = find(word, kana, offset, offset2, word.length(), false, false);
|
|
|
|
|
if (type == UnicodeMatcher.U_MISMATCH) break; // we are done.
|
|
|
|
|
pairList[pairCount][0] = offset[0];
|
|
|
|
|
pairList[pairCount++][1] = offset2[0];
|
|
|
|
|
offset[0] = offset2[0]; // get ready for the next one
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// IF we only got one CJK block, and it goes from the start to the end, then just do it.
|
|
|
|
|
|
|
|
|
|
if (pairCount == 1 && pairList[0][0] == 0 && pairList[0][1] == word.length()) {
|
|
|
|
|
addCheck(word, kanaToLatin.transliterate(definition), line);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// IF we didn't find any Kanji, bail.
|
|
|
|
|
|
|
|
|
|
if (pairCount < 1) {
|
|
|
|
|
System.out.println("No Kanji on line, skipping");
|
|
|
|
|
System.out.println(hex.transliterate(word) + " > " + hex.transliterate(definition)
|
|
|
|
|
+ ", " + kanaToLatin.transliterate(definition));
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Now generate the rules
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (DEBUG && pairCount > 1) {
|
|
|
|
|
System.out.println("Paircount: " + pairCount);
|
|
|
|
|
System.out.println("\t" + hex.transliterate(word) + " > " + hex.transliterate(definition) + ", " + kanaToLatin.transliterate(definition));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pairList[pairCount][0] = word.length(); // to make the algorithm easier, we add a termination
|
|
|
|
|
int delta = 0; // the current difference in positions between the definition and the word
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < pairCount; ++i) {
|
|
|
|
|
int start = pairList[i][0];
|
|
|
|
|
int limit = pairList[i][1];
|
|
|
|
|
if (DEBUG && pairCount > 1) System.out.println(start + ", " + limit + ", " + delta);
|
|
|
|
|
|
|
|
|
|
// that part was easy. the hard part is figuring out where this corresponds to in the definition.
|
|
|
|
|
// For now, we use a simple mechanism.
|
|
|
|
|
|
|
|
|
|
// The word and the definition should match to this point, so we just use the start (offset by delta)
|
|
|
|
|
// We'll check just to be sure.
|
|
|
|
|
|
|
|
|
|
int lastLimit = i == 0 ? 0 : pairList[i-1][1];
|
|
|
|
|
|
|
|
|
|
int defStart = start + delta;
|
|
|
|
|
|
|
|
|
|
String defPrefix = definition.substring(0, defStart);
|
|
|
|
|
String wordInfix = word.substring(lastLimit, start);
|
|
|
|
|
|
|
|
|
|
boolean firstGood = defPrefix.endsWith(wordInfix);
|
|
|
|
|
if (!firstGood) {
|
|
|
|
|
String wordInfix2 = katakanatoHiragana.transliterate(wordInfix);
|
|
|
|
|
firstGood = defPrefix.endsWith(wordInfix2);
|
|
|
|
|
}
|
|
|
|
|
if (!firstGood) {
|
|
|
|
|
// Houston, we have a problem.
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("Suspect line: " + hex.transliterate(word) + " > " + hex.transliterate(definition)
|
|
|
|
|
+ ", " + kanaToLatin.transliterate(definition));
|
|
|
|
|
System.out.println("\tNo match for " + hex.transliterate(word.substring(lastLimit, start))
|
|
|
|
|
+ " at end of " + hex.transliterate(definition.substring(0, defStart)));
|
|
|
|
|
break; // BAIL
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// For the limit of the defintion, we get the intermediate portion of the word
|
|
|
|
|
// then search for it in the definition.
|
|
|
|
|
// We could get tripped up if the end of the transliteration of the Kanji matched the start.
|
|
|
|
|
// If so, we should find out on the next pass.
|
|
|
|
|
|
|
|
|
|
int defLimit;
|
|
|
|
|
if (limit == word.length()) {
|
|
|
|
|
defLimit = definition.length();
|
|
|
|
|
} else {
|
|
|
|
|
String afterPart = word.substring(limit, pairList[i+1][0]);
|
|
|
|
|
defLimit = definition.indexOf(afterPart, defStart+1); // we assume the CJK is at least one!
|
|
|
|
|
if (defLimit < 0) {
|
|
|
|
|
String afterPart2 = katakanatoHiragana.transliterate(afterPart);
|
|
|
|
|
defLimit = definition.indexOf(afterPart2, defStart+1); // we assume the CJK is at least one!
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (defLimit < 0) {
|
|
|
|
|
// Houston, we have a problem.
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("Suspect line: " + hex.transliterate(word) + " > " + hex.transliterate(definition)
|
|
|
|
|
+ ", " + kanaToLatin.transliterate(definition));
|
|
|
|
|
System.out.println("\tNo match for " + hex.transliterate(afterPart)
|
|
|
|
|
+ " in " + hex.transliterate(definition.substring(0, defStart+1)));
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
String defPart = definition.substring(defStart, defLimit);
|
|
|
|
|
defPart = kanaToLatin.transliterate(defPart);
|
|
|
|
|
|
|
|
|
|
// FOR NOW, JUNK the context before!!
|
|
|
|
|
// String contextWord = word.substring(0, start) + "{" + word.substring(start, limit) + "}" + word.substring(limit);
|
|
|
|
|
String contextWord = word.substring(start, limit);
|
|
|
|
|
if (limit != word.length()) contextWord += "}" + word.substring(limit);
|
|
|
|
|
|
|
|
|
|
addCheck(contextWord, defPart, line);
|
|
|
|
|
if (DEBUG && pairCount > 1) System.out.println("\t" + hex.transliterate(contextWord) + " > " + hex.transliterate(defPart));
|
|
|
|
|
|
|
|
|
|
delta = defLimit - limit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Useful Utilities?
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns the start of the first substring that matches m.
|
|
|
|
|
* Most arguments are the same as UnicodeMatcher.matches, except for offset[]
|
|
|
|
|
* @positive Use true if you want the first point that matches, and false if you want the first point that doesn't match.
|
|
|
|
|
* @offset On input, the starting position. On output, the start of the match position (not the end!!)
|
|
|
|
|
*/
|
|
|
|
|
static int find(Replaceable s, UnicodeMatcher m, int[] offset, int limit, boolean incremental, boolean positive) {
|
|
|
|
|
int direction = offset[0] <= limit ? 1 : -1;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while (offset[0] != limit) {
|
|
|
|
|
int original = offset[0];
|
|
|
|
|
int type = m.matches(s, offset, limit, incremental); // if successful, changes offset.
|
|
|
|
|
if (type == UnicodeMatcher.U_MISMATCH) {
|
|
|
|
|
if (!positive) {
|
|
|
|
|
return UnicodeMatcher.U_MATCH;
|
|
|
|
|
}
|
|
|
|
|
offset[0] += direction; // used to skip to next code unit, in the positive case
|
|
|
|
|
// !! This should be safe, and saves checking the length of the code point
|
|
|
|
|
} else if (positive) {
|
|
|
|
|
offset[0] = original; // reset to the start position!!!
|
|
|
|
|
return type;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return UnicodeMatcher.U_MISMATCH;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns the start/limit of the first substring that matches m. Most arguments are the same as find().<br>
|
|
|
|
|
* <b>Warning:</b> if the search is backwards, then substringEnd will contain the <i>start</i> of the substring
|
|
|
|
|
* and offset will contain the </i>limit</i> of the substring.
|
|
|
|
|
*/
|
|
|
|
|
static int find(Replaceable s, UnicodeMatcher m, int[] offset, int[] offset2, int limit, boolean incremental, boolean positive) {
|
|
|
|
|
int type = find(s, m, offset, limit, incremental, positive);
|
|
|
|
|
if (type == UnicodeMatcher.U_MISMATCH) return type;
|
|
|
|
|
offset2[0] = offset[0];
|
|
|
|
|
int type2 = find(s, m, offset2, limit, incremental, !positive);
|
|
|
|
|
return type;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int find(String ss, UnicodeMatcher m, int[] offset, int limit, boolean incremental, boolean positive) {
|
|
|
|
|
// UGLY that we have to create a wrapper!
|
|
|
|
|
return find(new ReplaceableString(ss), m, offset, limit, incremental, positive);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int find(String ss, UnicodeMatcher m, int[] offset, int[] offset2, int limit, boolean incremental, boolean positive) {
|
|
|
|
|
// UGLY that we have to create a wrapper!
|
|
|
|
|
return find(new ReplaceableString(ss), m, offset, offset2, limit, incremental, positive);
|
|
|
|
|
}
|
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
// Set built from the pattern "[:private use:]"; addCheck2 refuses to store words that contain any of these.
static UnicodeSet pua = new UnicodeSet("[:private use:]");
|
|
|
|
|
// ASCII digits 0-9; addCheck2 reports (and does not store) definitions made up only of these.
static UnicodeSet numbers = new UnicodeSet("[0-9]");
|
|
|
|
|
|
|
|
|
|
static void addCheck(String word, String definition, String line) {
|
2002-07-21 08:43:39 +00:00
|
|
|
|
int lastSlash = 0;
|
|
|
|
|
while (lastSlash < word.length()) {
|
|
|
|
|
int wordSlash = word.indexOf('/', lastSlash);
|
|
|
|
|
if (wordSlash < 0) wordSlash = word.length();
|
|
|
|
|
addCheck2(word.substring(lastSlash, wordSlash), definition, line);
|
|
|
|
|
lastSlash = wordSlash + 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void addCheck2(String word, String definition, String line) {
|
2002-07-30 09:57:18 +00:00
|
|
|
|
definition = Default.nfc.normalize(definition);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
word = Default.nfc.normalize(word);
|
2002-07-30 09:57:18 +00:00
|
|
|
|
if (DO_SIMPLE && UTF16.countCodePoint(word) > 1) return;
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
if (pua.containsSome(word) ) {
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("PUA on: " + line);
|
|
|
|
|
} else if (numbers.containsAll(definition) ) {
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("Only numbers on: " + line);
|
|
|
|
|
} else {
|
2002-07-30 09:57:18 +00:00
|
|
|
|
Object alreadyThere = unihanMap.get(word);
|
|
|
|
|
if (alreadyThere == null) {
|
|
|
|
|
unihanMap.put(word, definition);
|
|
|
|
|
} else if (!definition.equals(alreadyThere)) {
|
|
|
|
|
Utility.addToList(duplicates, word, alreadyThere, true);
|
|
|
|
|
Utility.addToList(duplicates, word, definition, true);
|
|
|
|
|
}
|
2002-07-14 22:04:49 +00:00
|
|
|
|
}
|
|
|
|
|
if (UTF16.countCodePoint(word) > 1) unihanNonSingular = true;
|
|
|
|
|
}
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
static void readCDICT() throws IOException {
|
|
|
|
|
System.out.println("Reading cdict.txt");
|
2003-02-25 23:38:23 +00:00
|
|
|
|
String fname = "cdict.txt";
|
|
|
|
|
|
|
|
|
|
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
int counter = 0;
|
|
|
|
|
String[] pieces = new String[50];
|
|
|
|
|
String line = "";
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
while (true) {
|
|
|
|
|
line = Utility.readDataLine(br);
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
Utility.dot(counter++);
|
|
|
|
|
int tabPos = line.indexOf('[');
|
|
|
|
|
String word = line.substring(0,tabPos).trim();
|
|
|
|
|
word = Utility.replace(word, "\uFE4D", "");
|
|
|
|
|
word = Utility.replace(word, ".", "");
|
|
|
|
|
word = Utility.replace(word, "/", "");
|
|
|
|
|
word = Utility.replace(word, "(", "");
|
|
|
|
|
word = Utility.replace(word, ")", "");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int tab2Pos = line.indexOf(']', tabPos+1);
|
|
|
|
|
String pinyins = line.substring(tabPos+1, tab2Pos);
|
|
|
|
|
int len = Utility.split(pinyins, ' ', pieces);
|
|
|
|
|
if (word.length() != len) {
|
|
|
|
|
log.println("Len mismatch: " + line);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
|
|
|
String chr = word.substring(i, i+1);
|
2003-02-25 23:38:23 +00:00
|
|
|
|
|
|
|
|
|
String piece = digitToPinyin(pieces[i], line);
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
Map oldMap = (Map) cdict.get(chr);
|
|
|
|
|
if (oldMap == null) {
|
|
|
|
|
oldMap = new TreeMap();
|
|
|
|
|
cdict.put(chr, oldMap);
|
|
|
|
|
}
|
|
|
|
|
/*&& !oldMap.equals(piece)) {
|
|
|
|
|
log.println("Variant for '" + chr + "', new: '" + piece + "', old: '" + oldMap + "'");
|
|
|
|
|
}
|
|
|
|
|
*/
|
|
|
|
|
Utility.addCount(oldMap, piece, 1);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
br.close();
|
|
|
|
|
|
|
|
|
|
Iterator it = cdict.keySet().iterator();
|
|
|
|
|
Set tempSet = new TreeSet();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
Object key = it.next();
|
|
|
|
|
Map val = (Map) cdict.get(key);
|
|
|
|
|
log.print(key + ": ");
|
|
|
|
|
Iterator it2 = val.keySet().iterator();
|
|
|
|
|
tempSet.clear();
|
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
|
Comparable key2 = (Comparable) it2.next();
|
|
|
|
|
Comparable count = (Comparable) val.get(key2);
|
|
|
|
|
Pair p = new Pair(count, key2);
|
|
|
|
|
tempSet.add(p); // reverse the order
|
|
|
|
|
}
|
|
|
|
|
it2 = tempSet.iterator();
|
|
|
|
|
int counter2 = 0;
|
|
|
|
|
while (it2.hasNext()) {
|
|
|
|
|
if (counter2++ != 0) log.print("/");
|
|
|
|
|
log.print(it2.next());
|
|
|
|
|
}
|
|
|
|
|
log.println();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
static String digitToPinyin(String source, String line) {
|
|
|
|
|
if (source.indexOf('5') >= 0) log.println("Pinyin Tone5 at: " + line);
|
|
|
|
|
return digitPinyin_accentPinyin.transliterate(source);
|
|
|
|
|
}
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
// Filled by readCDICT: one-char string -> Map of pinyin reading -> tally (via Utility.addCount).
static Map cdict = new TreeMap();
|
|
|
|
|
// Filled by readUnihanData: character (as a String) -> value of its kTraditionalVariant property.
static Map simplifiedToTraditional = new HashMap();
|
|
|
|
|
// Filled by readUnihanData: character (as a String) -> value of its kSimplifiedVariant property.
static Map traditionalToSimplified = new HashMap();
|
2001-10-25 20:37:09 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static void readUnihanData(String key) throws java.io.IOException {
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-10-05 01:28:58 +00:00
|
|
|
|
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, Utility.UTF8);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
|
|
|
|
|
int count = 0;
|
|
|
|
|
int lineCounter = 0;
|
|
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
|
Utility.dot(++lineCounter);
|
|
|
|
|
|
|
|
|
|
String line = in.readLine();
|
|
|
|
|
if (line == null) break;
|
|
|
|
|
if (line.length() < 6) continue;
|
|
|
|
|
if (line.charAt(0) == '#') continue;
|
2002-06-13 21:14:05 +00:00
|
|
|
|
line = line.trim();
|
|
|
|
|
|
|
|
|
|
int tabPos = line.indexOf('\t');
|
2002-07-21 08:43:39 +00:00
|
|
|
|
int tabPos2 = line.indexOf('\t', tabPos+1);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
String scode = line.substring(2, tabPos).trim();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
int code = Integer.parseInt(scode, 16);
|
|
|
|
|
String property = line.substring(tabPos+1, tabPos2).trim();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
String propertyValue = line.substring(tabPos2+1).trim();
|
2003-02-25 23:38:23 +00:00
|
|
|
|
if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
// gather traditional mapping
|
|
|
|
|
if (property.equals("kTraditionalVariant")) {
|
|
|
|
|
simplifiedToTraditional.put(UTF16.valueOf(code), propertyValue);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
if (property.equals("kSimplifiedVariant")) {
|
|
|
|
|
traditionalToSimplified.put(UTF16.valueOf(code), propertyValue);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
|
|
|
|
if (property.equals(key) || key.equals("kJapaneseOn") && property.equals("kJapaneseKun")) {
|
|
|
|
|
storeDef(out, code, propertyValue, line);
|
|
|
|
|
}
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
in.close();
|
|
|
|
|
}
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static void storeDef(PrintWriter out, int cp, String rawDefinition, String line) {
|
2001-10-25 20:37:09 +00:00
|
|
|
|
// skip spaces & numbers at start
|
2002-07-21 08:43:39 +00:00
|
|
|
|
int start;
|
|
|
|
|
for (start = 0;start < rawDefinition.length(); ++start) {
|
|
|
|
|
char ch = rawDefinition.charAt(start);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// go up to comma or semicolon, whichever is earlier
|
2002-07-21 08:43:39 +00:00
|
|
|
|
int end = rawDefinition.indexOf(";", start);
|
|
|
|
|
if (end < 0) end = rawDefinition.length();
|
2001-10-25 20:37:09 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
int end2 = rawDefinition.indexOf(",", start);
|
|
|
|
|
if (end2 < 0) end2 = rawDefinition.length();
|
2001-10-25 20:37:09 +00:00
|
|
|
|
if (end > end2) end = end2;
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
// IF CHINESE or JAPANESE, stop at first space!!!
|
2002-07-30 09:57:18 +00:00
|
|
|
|
rawDefinition = rawDefinition.substring(start,end);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
2002-07-30 09:57:18 +00:00
|
|
|
|
if (type == DEFINITION) {
|
|
|
|
|
storeDef2(out, cp, rawDefinition, line);
|
|
|
|
|
} else {
|
|
|
|
|
if (rawDefinition.indexOf(' ') < 0) storeDef2(out, cp, rawDefinition, line);
|
|
|
|
|
else {
|
|
|
|
|
String [] pieces = Utility.split(rawDefinition, ' ');
|
|
|
|
|
for (int i = 0; i < pieces.length; ++i) {
|
|
|
|
|
storeDef2(out, cp, pieces[i], line);
|
|
|
|
|
}
|
|
|
|
|
}
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
2002-07-30 09:57:18 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void storeDef2(PrintWriter out, int cp, String definition, String line) {
|
2002-06-13 21:14:05 +00:00
|
|
|
|
if (type == CHINESE) {
|
|
|
|
|
// since data are messed up, terminate after first digit
|
|
|
|
|
int end3 = findInString(definition, "12345")+1;
|
|
|
|
|
if (end3 == 0) {
|
2002-07-30 09:57:18 +00:00
|
|
|
|
log.println("Bad pinyin data: " + hex.transliterate(UTF16.valueOf(cp))
|
|
|
|
|
+ "\t" + UTF16.valueOf(cp) + "\t" + definition);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
end3 = definition.length();
|
|
|
|
|
}
|
|
|
|
|
definition = definition.substring(0, end3);
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
definition = digitToPinyin(definition, line);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
|
|
|
|
if (type == DEFINITION) {
|
2002-07-30 09:57:18 +00:00
|
|
|
|
definition = removeMatched(definition,'(', ')', line);
|
|
|
|
|
definition = removeMatched(definition,'[', ']', line);
|
|
|
|
|
definition = fixDefinition(definition, line);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
}
|
2002-07-14 22:04:49 +00:00
|
|
|
|
definition = definition.trim();
|
2002-06-13 21:14:05 +00:00
|
|
|
|
definition = Default.ucd.getCase(definition, FULL, LOWER);
|
2002-07-21 08:43:39 +00:00
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
if (definition.length() == 0) {
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("Zero value for " + Default.ucd.getCode(cp) + " on: " + hex.transliterate(line));
|
|
|
|
|
} else {
|
2002-07-30 09:57:18 +00:00
|
|
|
|
addCheck(UTF16.valueOf(cp), definition, line);
|
2002-07-14 22:04:49 +00:00
|
|
|
|
}
|
2002-06-13 21:14:05 +00:00
|
|
|
|
/*
|
|
|
|
|
String key = (String) unihanMap.get(definition);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
if (key == null) {
|
2002-06-13 21:14:05 +00:00
|
|
|
|
unihanMap.put(definition, cp);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
2002-06-13 21:14:05 +00:00
|
|
|
|
out.println(cp + (key == null ? " <> " : " > ") + Default.ucd.getCase(definition, FULL, TITLE) + ";");
|
2001-10-25 20:37:09 +00:00
|
|
|
|
if (TESTING) System.out.println("# " + code + " > " + definition);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
*/
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static String fixDefinition(String definition, String rawDefinition) {
|
2002-07-14 22:04:49 +00:00
|
|
|
|
definition = definition.trim();
|
|
|
|
|
definition = Utility.replace(definition, " ", " ");
|
|
|
|
|
definition = Utility.replace(definition, " ", "-");
|
|
|
|
|
definition = Default.ucd.getCase(definition, FULL, LOWER);
|
|
|
|
|
return definition;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
// WARNING not supplemenatary-safe!
|
|
|
|
|
|
|
|
|
|
static int findInString(String source, String chars) {
|
|
|
|
|
for (int i = 0; i < source.length(); ++i) {
|
|
|
|
|
if (chars.indexOf(source.charAt(i)) >= 0) return i;
|
|
|
|
|
}
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// WARNING not supplemenatary-safe!
|
|
|
|
|
|
|
|
|
|
static String removeMatched(String source, char start, char end, String originalLine) {
|
|
|
|
|
while (true) {
|
|
|
|
|
int pos = source.indexOf(start);
|
|
|
|
|
if (pos < 0) break;
|
|
|
|
|
int epos = source.indexOf(end, pos+1);
|
|
|
|
|
if (epos < 0) {
|
|
|
|
|
epos = source.length()-1;
|
|
|
|
|
log.println("Mismatches with " + start + ", " + end + ": " + originalLine);
|
|
|
|
|
}
|
|
|
|
|
source = source.substring(0,pos) + source.substring(epos+1);
|
|
|
|
|
}
|
|
|
|
|
return source;
|
|
|
|
|
}
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
// Filled by addCheck2: NFC-normalized word -> NFC-normalized definition (first definition seen wins).
static Map unihanMap = new TreeMap(); // could be hashmap
|
2002-07-30 09:57:18 +00:00
|
|
|
|
// Filled by addCheck2 through Utility.addToList when a word shows up with differing definitions.
static Map duplicates = new TreeMap();
|
|
|
|
|
|
2002-07-14 22:04:49 +00:00
|
|
|
|
// Set to true by addCheck2 once it has processed a word of more than one code point.
static boolean unihanNonSingular = false;
|
2001-10-25 20:37:09 +00:00
|
|
|
|
|
|
|
|
|
// Scratch buffer. NOTE(review): in this part of the file it is referenced only from
// commented-out code (convertTones) - check whether it is still needed.
static StringBuffer handlePinyinTemp = new StringBuffer();
|
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
// Used throughout the diagnostics to make text printable; the ID restricts "hex" to
// characters outside U+0020..U+007F.
static final Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex");
|
|
|
|
|
static final Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters",
|
|
|
|
|
"([[\\u0020-\\u007E]-[:L:]-[\\'\\{\\}]-[0-9]]) > \\u005C $1; "
|
|
|
|
|
+ "\\' > \\'\\';",
|
|
|
|
|
Transliterator.FORWARD);
|
|
|
|
|
static final Transliterator toSub = Transliterator.createFromRules("any-subscript",
|
|
|
|
|
" 0 > \u2080; "
|
|
|
|
|
+ " 1 > \u2081; "
|
|
|
|
|
+ " 2 > \u2082; "
|
|
|
|
|
+ " 3 > \u2084; "
|
|
|
|
|
+ " 4 > \u2084; "
|
|
|
|
|
+ " 5 > \u2085; "
|
|
|
|
|
+ " 6 > \u2086; "
|
|
|
|
|
+ " 7 > \u2087; "
|
|
|
|
|
+ " 8 > \u2088; "
|
|
|
|
|
+ " 9 > \u2089; ",
|
|
|
|
|
Transliterator.FORWARD);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
static final Transliterator kanaToLatin = Transliterator.createFromRules("any-subscript",
|
|
|
|
|
" $kata = [[:katakana:]\u30FC]; "
|
|
|
|
|
+ "[:hiragana:] {} [:^hiragana:] > ' '; "
|
|
|
|
|
+ "$kata {} [^[:hiragana:]$kata] > ' '; "
|
|
|
|
|
+ "::Katakana-Latin; "
|
|
|
|
|
+ "::Hiragana-Latin;",
|
|
|
|
|
Transliterator.FORWARD);
|
|
|
|
|
|
|
|
|
|
// Transliterator obtained for the ID "katakana-hiragana"; used as a fallback when
// comparing a word's kana with its definition.
static final Transliterator katakanatoHiragana = Transliterator.getInstance("katakana-hiragana");
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2002-07-21 08:43:39 +00:00
|
|
|
|
// All hiragana and katakana characters plus U+30FC.
static final UnicodeSet kana = new UnicodeSet("[[:hiragana:][:katakana:]\u30FC]");
|
|
|
|
|
// since we are working in NFC, we don't worry about the combining marks.
|
|
|
|
|
|
2002-06-13 21:14:05 +00:00
|
|
|
|
// ADD Factory since otherwise getInverse blows out
|
|
|
|
|
static class DummyFactory implements Transliterator.Factory {
|
|
|
|
|
static DummyFactory singleton = new DummyFactory();
|
|
|
|
|
static HashMap m = new HashMap();
|
|
|
|
|
|
|
|
|
|
// Since Transliterators are immutable, we don't have to clone on set & get
|
|
|
|
|
static void add(String ID, Transliterator t) {
|
|
|
|
|
m.put(ID, t);
|
|
|
|
|
System.out.println("Registering: " + ID + ", " + t.toRules(true));
|
|
|
|
|
Transliterator.registerFactory(ID, singleton);
|
|
|
|
|
}
|
|
|
|
|
public Transliterator getInstance(String ID) {
|
|
|
|
|
return (Transliterator) m.get(ID);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
// Created in the static initializer from the "digit-pinyin" rules; digitToPinyin runs its input through it.
static Transliterator digitPinyin_accentPinyin;
|
|
|
|
|
|
|
|
|
|
static Transliterator accentPinyin_digitPinyin = Transliterator.createFromRules("accentPinyin_digitPinyin",
|
|
|
|
|
"::NFD; "
|
|
|
|
|
+ " ([\u0304\u0301\u030C\u0300\u0306]) ([[:Mark:][:Letter:]]+) > $2 | $1;"
|
|
|
|
|
+ "\u0304 > '1'; \u0301 > '2'; \u030C > '3'; \u0300 > '4'; \u0306 > '3';"
|
|
|
|
|
+ " ::NFC;", Transliterator.FORWARD);
|
|
|
|
|
|
|
|
|
|
static Transliterator fixCircumflex = Transliterator.createFromRules("fix_circumflex",
|
|
|
|
|
"::NFD; \u0306 > \u030C; ::NFC;", Transliterator.FORWARD);
|
|
|
|
|
|
|
|
|
|
static Transliterator dropTones = Transliterator.createFromRules("drop_tones",
|
|
|
|
|
"::NFD; \u0304 > ; \u0301 > ; \u030C > ; \u0300 > ; \u0306 > ; ::NFC;", Transliterator.FORWARD);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
|
|
|
|
static {
|
2003-02-25 23:38:23 +00:00
|
|
|
|
String dt = "1 > \u0304;\n"
|
2002-06-13 21:14:05 +00:00
|
|
|
|
+ "2 <> \u0301;\n"
|
2003-02-25 23:38:23 +00:00
|
|
|
|
+ "3 <> \u030C;\n"
|
2002-06-13 21:14:05 +00:00
|
|
|
|
+ "4 <> \u0300;\n"
|
2003-02-25 23:38:23 +00:00
|
|
|
|
+ "5 <> ;";
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
|
|
|
|
String dp = "# syllable is ...vowel+ consonant* number\n"
|
|
|
|
|
+ "# 'a', 'e' are the preferred bases\n"
|
|
|
|
|
+ "# otherwise 'o'\n"
|
|
|
|
|
+ "# otherwise last vowel\n"
|
|
|
|
|
+ "::NFC;\n"
|
|
|
|
|
+ "$vowel = [aAeEiIoOuU<75><55>];\n"
|
|
|
|
|
+ "$consonant = [[a-z A-Z] - [$vowel]];\n"
|
|
|
|
|
+ "$digit = [1-5];\n"
|
|
|
|
|
+ "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
|
|
|
|
+ "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
|
|
|
|
+ "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
2002-07-21 08:43:39 +00:00
|
|
|
|
+ "($digit) > &digit-tone($1);\n"
|
2002-06-13 21:14:05 +00:00
|
|
|
|
+ "::NFC;\n";
|
|
|
|
|
|
|
|
|
|
Transliterator at = Transliterator.createFromRules("digit-tone", dt, Transliterator.FORWARD);
|
|
|
|
|
System.out.println(at.transliterate("a1a2a3a4a5"));
|
|
|
|
|
DummyFactory.add(at.getID(), at);
|
|
|
|
|
|
2003-02-25 23:38:23 +00:00
|
|
|
|
digitPinyin_accentPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
|
|
|
|
|
System.out.println(digitPinyin_accentPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
/*
|
|
|
|
|
|
|
|
|
|
static String convertTones(String source, String debugLine) {
|
2001-10-25 20:37:09 +00:00
|
|
|
|
try {
|
2002-06-13 21:14:05 +00:00
|
|
|
|
result = new StringBuffer();
|
|
|
|
|
main:
|
|
|
|
|
for (int i = 0; i < source.length(); ++i) {
|
|
|
|
|
ch = source.charAt(i);
|
|
|
|
|
switch (ch) {
|
|
|
|
|
case ':':
|
|
|
|
|
if (i > 0) {
|
|
|
|
|
char last = result.charAt(result.length()-1);
|
|
|
|
|
if (last == 'u') {
|
|
|
|
|
result.setCharAt(result.length()-1, '<27>');
|
|
|
|
|
continue main;
|
|
|
|
|
} else if (last == 'U') {
|
|
|
|
|
result.setCharAt(result.length()-1, '<27>');
|
|
|
|
|
continue main;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case '1': break; // skip character
|
|
|
|
|
case '2': case '3': case '4': case '5':
|
|
|
|
|
applyToPrecedingBase(result, ch-'0');
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
result.append(ch);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
source = source.trim();
|
2001-10-25 20:37:09 +00:00
|
|
|
|
char ch = source.charAt(source.length()-1);
|
|
|
|
|
int num = (int)(ch-'1');
|
|
|
|
|
if (num < 0 || num > 5) throw new Exception("none");
|
|
|
|
|
handlePinyinTemp.setLength(0);
|
|
|
|
|
boolean gotIt = false;
|
|
|
|
|
boolean messageIfNoGotIt = true;
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
2001-10-25 20:37:09 +00:00
|
|
|
|
for (int i = source.length()-2; i >= 0; --i) {
|
|
|
|
|
ch = source.charAt(i);
|
2002-06-13 21:14:05 +00:00
|
|
|
|
if (ch == ':') {
|
|
|
|
|
ch = '<27>';
|
|
|
|
|
--i;
|
|
|
|
|
}
|
|
|
|
|
if ('0' <= ch && ch <= '9') break;
|
|
|
|
|
if (ch != '<27>' && (ch < 'A' || ch > 'Z')) {
|
|
|
|
|
Utility.fixDot();
|
|
|
|
|
System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")");
|
|
|
|
|
break;
|
|
|
|
|
}
|
2001-10-25 20:37:09 +00:00
|
|
|
|
if (!gotIt) switch (ch) {
|
|
|
|
|
case 'A': ch = "A<EFBFBD>\u0102<EFBFBD>\u0100".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'E': ch = "E<EFBFBD>\u0114<EFBFBD>\u0112".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'I': ch = "I<EFBFBD>\u012C<EFBFBD>\u012A".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'O': ch = "O<EFBFBD>\u014E<EFBFBD>\u014C".charAt(num); gotIt = true; break;
|
|
|
|
|
case 'U': ch = "U<EFBFBD>\u016C<EFBFBD>\u016A".charAt(num); gotIt = true; break;
|
|
|
|
|
case '<27>': ch = "<EFBFBD>\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
|
|
|
|
|
}
|
|
|
|
|
handlePinyinTemp.insert(0,ch);
|
|
|
|
|
}
|
|
|
|
|
if (!gotIt && num > 0) {
|
|
|
|
|
handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num));
|
|
|
|
|
if (messageIfNoGotIt) {
|
|
|
|
|
err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp
|
|
|
|
|
.toString());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
source = handlePinyinTemp.toString().toLowerCase();
|
|
|
|
|
} catch (Exception e) {
|
2002-06-13 21:14:05 +00:00
|
|
|
|
log.println("Bad line: " + debugLine);
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|
|
|
|
|
return source;
|
|
|
|
|
}
|
2002-06-13 21:14:05 +00:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
A and e trump all other vowels and always take the tone mark.
|
|
|
|
|
There are no Mandarin syllables that contain both a and e.
|
|
|
|
|
In the combination ou, o takes the mark.
|
|
|
|
|
In all other cases, the final vowel takes the mark.
|
|
|
|
|
*/
|
|
|
|
|
/*
|
|
|
|
|
static String applyToPrecedingBase(StringBuffer result, int tone) {
|
|
|
|
|
for (int i = result.length()-1; i >= 0; --i) {
|
|
|
|
|
char ch = result.charAt(i);
|
|
|
|
|
switch (ch) {
|
|
|
|
|
case 'a': case 'e': case 'A': case 'E':
|
|
|
|
|
result.setCharAt(i, mapTone(ch, tone));
|
|
|
|
|
return;
|
|
|
|
|
case 'o': case 'O': bestSoFar = i; break;
|
|
|
|
|
case 'i': case 'I': case 'u': case 'U': case '
|
|
|
|
|
if (tone == 1) return String.valueOf(ch);
|
|
|
|
|
return Default.nfc.normalize(ch + mapTone[tone]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static final char[] MAP_TONE = {"\u0301", "\u0306", "\u0300", "\u0304"};
|
|
|
|
|
*/
|
2001-10-25 20:37:09 +00:00
|
|
|
|
}
|