25561ba9b8
X-SVN-Rev: 8886
744 lines
29 KiB
Java
744 lines
29 KiB
Java
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
|
||
|
||
package com.ibm.text.UCD;
|
||
import java.io.*;
|
||
import com.ibm.text.utility.*;
|
||
import com.ibm.icu.text.Transliterator;
|
||
import com.ibm.icu.text.UnicodeSet;
|
||
import com.ibm.icu.text.UTF16;
|
||
import java.util.*;
|
||
|
||
|
||
public final class GenerateHanTransliterator implements UCD_Types {
|
||
|
||
static final boolean TESTING = false;
|
||
static int type;
|
||
|
||
static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
|
||
|
||
public static void main(int typeIn) {
|
||
type = typeIn;
|
||
Default.setUCD();
|
||
try {
|
||
System.out.println("Starting");
|
||
log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
|
||
err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
|
||
log.print('\uFEFF');
|
||
|
||
String key; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
||
String filter; // "kJis0";
|
||
String filename;
|
||
|
||
switch (type) {
|
||
case DEFINITION:
|
||
key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
||
filter = null; // "kJis0";
|
||
filename = "Transliterator_Han_Latin_Definition.txt";
|
||
break;
|
||
case JAPANESE:
|
||
key = "kJapaneseOn";
|
||
filter = null; // "kJis0";
|
||
filename = "Transliterator_ja_Latin.txt";
|
||
break;
|
||
case CHINESE:
|
||
key = "kMandarin";
|
||
filename = "Transliterator_Han_Latin.txt";
|
||
filter = null;
|
||
break;
|
||
default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
|
||
}
|
||
|
||
readUnihanData(key, filter);
|
||
|
||
if (false) {
|
||
readCDICT();
|
||
compareUnihanWithCEDICT();
|
||
}
|
||
|
||
readFrequencyData();
|
||
|
||
out = Utility.openPrintWriter(filename, false, false);
|
||
out.println("# Convert CJK characters");
|
||
out.println("# Note: adds space between them and letters.");
|
||
out.println("{ ([:Han:]) } [:L:] > | $1 ' ';");
|
||
out.println("[\\.\\,\\?\\!\uFF0E\uFF0C\uFF1F\uFF01\u3001\u3002[:Pe:][:Pf:]] { } [:L:] > ' ';");
|
||
out.println("[:L:] { } [[:Han:][:Ps:][:Pi:]]> ' ';");
|
||
|
||
if (type == JAPANESE) {
|
||
out.println("$kata = [[\uFF9E\uFF9F\uFF70\u30FC][:katakana:]];");
|
||
out.println("$kata { } [[:L:]-$kata]> ' ';");
|
||
out.println("[[:L:]-$kata] { } $kata > ' ';");
|
||
out.println("[:hiragana:] { } [[:L:]-[:hiragana:]] > ' ';");
|
||
out.println("[[:L:]-[:hiragana:]] { } [:hiragana:]> ' ';");
|
||
}
|
||
|
||
Set gotAlready = new HashSet();
|
||
Iterator it = rankList.iterator();
|
||
Set lenSet = new TreeSet();
|
||
int rank = 0;
|
||
while (it.hasNext()) {
|
||
Comparable keyChar = (Comparable) it.next();
|
||
Comparable def = (Comparable) unihanMap.get(keyChar);
|
||
if (def == null) continue; // skipping
|
||
// sort longer definitions first!
|
||
lenSet.add(new Pair(
|
||
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
|
||
new Pair(keyChar, def)));
|
||
gotAlready.add(keyChar);
|
||
}
|
||
|
||
// add the ones that are not ranked!
|
||
it = unihanMap.keySet().iterator();
|
||
while (it.hasNext()) {
|
||
Comparable keyChar = (Comparable) it.next();
|
||
Comparable def = (Comparable) unihanMap.get(keyChar);
|
||
if (!gotAlready.contains(keyChar)) {
|
||
lenSet.add(new Pair(
|
||
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
|
||
new Pair(keyChar, def)));
|
||
}
|
||
}
|
||
|
||
Set gotIt = new HashSet();
|
||
it = lenSet.iterator();
|
||
while (it.hasNext()) {
|
||
Pair p = (Pair) it.next();
|
||
p = (Pair) p.second;
|
||
|
||
Comparable keyChar = p.first;
|
||
Comparable def = p.second;
|
||
String rel = gotIt.contains(def) ? " > " : " <> ";
|
||
out.println(keyChar + rel + def + ";");
|
||
//if (TESTING) System.out.println("# " + code + " > " + definition);
|
||
gotIt.add(def);
|
||
}
|
||
|
||
out.println("\u3002 <> '.';");
|
||
if (type == JAPANESE) {
|
||
out.println(":: katakana-latin;");
|
||
out.println(":: hiragana-latin;");
|
||
}
|
||
out.println(":: fullwidth-halfwidth;");
|
||
|
||
|
||
|
||
System.out.println("Total: " + totalCount);
|
||
System.out.println("Defined Count: " + count);
|
||
} catch (Exception e) {
|
||
System.out.println("Exception: " + e);
|
||
} finally {
|
||
if (log != null) log.close();
|
||
if (out != null) out.close();
|
||
if (err != null) err.close();
|
||
}
|
||
}
|
||
|
||
static PrintWriter log;
|
||
static PrintWriter out;
|
||
static PrintWriter err;
|
||
|
||
static int count;
|
||
static int totalCount;
|
||
static int oldLine;
|
||
|
||
static void readFrequencyData() throws java.io.IOException {
|
||
String line = "";
|
||
try {
|
||
|
||
// chinese_frequency.txt
|
||
// 1 的 1588561 1588561 3.5008%
|
||
// japanese_frequency.txt
|
||
// 1 ? 17176
|
||
|
||
Set combinedRank = new TreeSet();
|
||
|
||
System.out.println("Reading chinese_frequency.txt");
|
||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", true);
|
||
int counter = 0;
|
||
while (true) {
|
||
line = Utility.readDataLine(br);
|
||
if (line == null) break;
|
||
if (line.length() == 0) continue;
|
||
Utility.dot(counter++);
|
||
int tabPos = line.indexOf('\t');
|
||
int rank = Integer.parseInt(line.substring(0,tabPos));
|
||
int cp = line.charAt(tabPos+1);
|
||
//if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
|
||
combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp)));
|
||
}
|
||
br.close();
|
||
|
||
System.out.println("Reading japanese_frequency.txt");
|
||
|
||
br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", true);
|
||
Map japaneseMap = new HashMap();
|
||
while (true) {
|
||
line = Utility.readDataLine(br);
|
||
if (line == null) break;
|
||
if (line.length() == 0) continue;
|
||
Utility.dot(counter++);
|
||
int tabPos = line.indexOf(' ');
|
||
|
||
int tabPos2 = line.indexOf(' ', tabPos+1);
|
||
int freq = Integer.parseInt(line.substring(tabPos2+1));
|
||
|
||
for (int i = tabPos+1; i < tabPos2; ++i) {
|
||
int cp = line.charAt(i);
|
||
int script = Default.ucd.getScript(cp);
|
||
if (script != HAN_SCRIPT) {
|
||
if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT) {
|
||
System.out.println("Huh: " + Default.ucd.getCodeAndName(cp));
|
||
}
|
||
continue;
|
||
}
|
||
// if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
|
||
Utility.addCount(japaneseMap, UTF16.valueOf(cp), -freq);
|
||
}
|
||
}
|
||
br.close();
|
||
|
||
// get rank order japanese
|
||
Iterator it = japaneseMap.keySet().iterator();
|
||
int countJapanese = 0;
|
||
while (it.hasNext()) {
|
||
Comparable key = (Comparable) it.next();
|
||
Comparable val = (Comparable) japaneseMap.get(key);
|
||
combinedRank.add(new Pair(new Integer(++countJapanese), key));
|
||
}
|
||
|
||
|
||
int overallRank = 0;
|
||
it = combinedRank.iterator();
|
||
|
||
while(it.hasNext()) {
|
||
Pair p = (Pair) it.next();
|
||
log.println(p.first + ", " + p.second);
|
||
Object rank = rankMap.get(p.second);
|
||
if (rank == null) {
|
||
rankMap.put(p.second, new Integer(++overallRank));
|
||
rankList.add(p.second);
|
||
}
|
||
}
|
||
|
||
log.println("@character to rank");
|
||
|
||
// get full order
|
||
it = rankList.iterator();
|
||
while (it.hasNext()) {
|
||
Comparable key = (Comparable) it.next();
|
||
Comparable val = (Comparable) rankMap.get(key);
|
||
log.println(key + ", " + val);
|
||
}
|
||
|
||
} catch (Exception e) {
|
||
throw new ChainException("Line \"{0}\"", new String[] {line}, e);
|
||
}
|
||
}
|
||
|
||
static void compareUnihanWithCEDICT() {
|
||
System.out.println("@Comparing CEDICT to Unihan");
|
||
log.println("@Comparing CEDICT to Unihan");
|
||
Iterator it = unihanMap.keySet().iterator();
|
||
List inCEDICT = new ArrayList();
|
||
List inUnihan = new ArrayList();
|
||
List inBoth = new ArrayList();
|
||
UnicodeSet noPinyin = new UnicodeSet();
|
||
UnicodeSet kPinyin = new UnicodeSet();
|
||
UnicodeSet tPinyin = new UnicodeSet();
|
||
UnicodeSet sPinyin = new UnicodeSet();
|
||
|
||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||
if (!Default.ucd.isAllocated(i)) continue;
|
||
if (Default.ucd.getScript(i) != HAN_SCRIPT) continue;
|
||
Utility.dot(i);
|
||
|
||
String ch = UTF16.valueOf(i);
|
||
|
||
String pinyin = (String) unihanMap.get(ch);
|
||
if (pinyin == null) {
|
||
String ch2 = Default.nfkd.normalize(ch);
|
||
pinyin = (String) unihanMap.get(ch2);
|
||
if (pinyin != null) {
|
||
unihanMap.put(ch, pinyin);
|
||
kPinyin.add(i);
|
||
} else {
|
||
String trial = (String) simplifiedToTraditional.get(ch2);
|
||
if (trial != null) {
|
||
pinyin = (String) unihanMap.get(trial);
|
||
if (pinyin != null) {
|
||
unihanMap.put(ch, pinyin);
|
||
tPinyin.add(i);
|
||
} else {
|
||
trial = (String) traditionalToSimplified.get(ch2);
|
||
if (trial != null) {
|
||
pinyin = (String) unihanMap.get(trial);
|
||
if (pinyin != null) {
|
||
unihanMap.put(ch, pinyin);
|
||
sPinyin.add(i);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
Map pinyinSet = (Map) cdict.get(ch);
|
||
if (pinyin == null) {
|
||
if (pinyinSet != null) inCEDICT.add(ch + " => " + pinyinSet);
|
||
noPinyin.add(i);
|
||
} else if (pinyinSet == null) {
|
||
inUnihan.add(ch + " => " + pinyin);
|
||
} else {
|
||
Object temp = pinyinSet.get(pinyin);
|
||
if (temp == null) {
|
||
inBoth.add(ch + " => " + pinyin + "; " + pinyinSet);
|
||
}
|
||
}
|
||
}
|
||
|
||
log.println("@In CEDICT but not Unihan: ");
|
||
printCollection(log, inCEDICT);
|
||
|
||
log.println("@In Unihan but not CEDICT: ");
|
||
printCollection(log, inUnihan);
|
||
|
||
log.println("@In Unihan and CEDICT, but different: ");
|
||
printCollection(log, inBoth);
|
||
|
||
log.println("@Missing from Unihan: ");
|
||
log.println(noPinyin.toPattern(true));
|
||
|
||
log.println("@Has mapping if we NFKD it: ");
|
||
log.println(kPinyin.toPattern(true));
|
||
|
||
log.println("@Has mapping if we NFKC & simp-trad it: ");
|
||
log.println(tPinyin.toPattern(true));
|
||
|
||
log.println("@Has mapping if we NFKC & trad-simp it: ");
|
||
log.println(sPinyin.toPattern(true));
|
||
|
||
log.println("@Done comparison");
|
||
}
|
||
|
||
static void printCollection(PrintWriter p, Collection c) {
|
||
Iterator it = c.iterator();
|
||
int count = 0;
|
||
while (it.hasNext()) {
|
||
p.println((++count) + "\t" + it.next());
|
||
}
|
||
}
|
||
|
||
|
||
static Map rankMap = new TreeMap(); // maps from single char strings to overall rank
|
||
static List rankList = new ArrayList(10000);
|
||
|
||
static void readCDICT() throws IOException {
|
||
System.out.println("Reading cdict.txt");
|
||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true);
|
||
int counter = 0;
|
||
String[] pieces = new String[50];
|
||
String line = "";
|
||
|
||
try {
|
||
while (true) {
|
||
line = Utility.readDataLine(br);
|
||
if (line == null) break;
|
||
if (line.length() == 0) continue;
|
||
Utility.dot(counter++);
|
||
int tabPos = line.indexOf('[');
|
||
String word = line.substring(0,tabPos).trim();
|
||
word = Utility.replace(word, "\uFE4D", "");
|
||
word = Utility.replace(word, ".", "");
|
||
word = Utility.replace(word, "/", "");
|
||
word = Utility.replace(word, "(", "");
|
||
word = Utility.replace(word, ")", "");
|
||
|
||
|
||
int tab2Pos = line.indexOf(']', tabPos+1);
|
||
String pinyins = line.substring(tabPos+1, tab2Pos);
|
||
int len = Utility.split(pinyins, ' ', pieces);
|
||
if (word.length() != len) {
|
||
log.println("Len mismatch: " + line);
|
||
continue;
|
||
}
|
||
for (int i = 0; i < len; ++i) {
|
||
String chr = word.substring(i, i+1);
|
||
String piece = convertPinyin.transliterate(pieces[i]);
|
||
Map oldMap = (Map) cdict.get(chr);
|
||
if (oldMap == null) {
|
||
oldMap = new TreeMap();
|
||
cdict.put(chr, oldMap);
|
||
}
|
||
/*&& !oldMap.equals(piece)) {
|
||
log.println("Variant for '" + chr + "', new: '" + piece + "', old: '" + oldMap + "'");
|
||
}
|
||
*/
|
||
Utility.addCount(oldMap, piece, 1);
|
||
}
|
||
}
|
||
br.close();
|
||
|
||
Iterator it = cdict.keySet().iterator();
|
||
Set tempSet = new TreeSet();
|
||
while (it.hasNext()) {
|
||
Object key = it.next();
|
||
Map val = (Map) cdict.get(key);
|
||
log.print(key + ": ");
|
||
Iterator it2 = val.keySet().iterator();
|
||
tempSet.clear();
|
||
while (it2.hasNext()) {
|
||
Comparable key2 = (Comparable) it2.next();
|
||
Comparable count = (Comparable) val.get(key2);
|
||
Pair p = new Pair(count, key2);
|
||
tempSet.add(p); // reverse the order
|
||
}
|
||
it2 = tempSet.iterator();
|
||
int counter2 = 0;
|
||
while (it2.hasNext()) {
|
||
if (counter2++ != 0) log.print("/");
|
||
log.print(it2.next());
|
||
}
|
||
log.println();
|
||
}
|
||
|
||
} catch (Exception e) {
|
||
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
||
}
|
||
}
|
||
|
||
static Map cdict = new TreeMap();
|
||
static Map simplifiedToTraditional = new HashMap();
|
||
static Map traditionalToSimplified = new HashMap();
|
||
|
||
static void readUnihanData(String key, String filter) throws java.io.IOException {
|
||
|
||
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true);
|
||
|
||
int count = 0;
|
||
String oldCode = "";
|
||
String oldLine = "";
|
||
int oldStart = 0;
|
||
boolean foundFilter = (filter == null);
|
||
boolean foundKey = false;
|
||
|
||
int lineCounter = 0;
|
||
|
||
while (true) {
|
||
Utility.dot(++lineCounter);
|
||
|
||
String line = in.readLine();
|
||
if (line == null) break;
|
||
if (line.length() < 6) continue;
|
||
if (line.charAt(0) == '#') continue;
|
||
line = line.trim();
|
||
|
||
int tabPos = line.indexOf('\t');
|
||
String code = line.substring(2, tabPos);
|
||
|
||
// gather traditional mapping
|
||
if (line.indexOf("kTraditionalVariant") >= 0) {
|
||
int tabPos2 = line.indexOf('\t', tabPos+1);
|
||
int tabPos3 = line.indexOf(' ', tabPos2+1);
|
||
if (tabPos3 < 0) tabPos3 = line.length();
|
||
|
||
String code2 = line.substring(tabPos2+3, tabPos3);
|
||
simplifiedToTraditional.put(UTF16.valueOf(Integer.parseInt(code, 16)),
|
||
UTF16.valueOf(Integer.parseInt(code2, 16)));
|
||
}
|
||
|
||
if (line.indexOf("kSimplifiedVariant") >= 0) {
|
||
int tabPos2 = line.indexOf('\t', tabPos+1);
|
||
int tabPos3 = line.indexOf(' ', tabPos2+1);
|
||
if (tabPos3 < 0) tabPos3 = line.length();
|
||
|
||
String code2 = line.substring(tabPos2+3, tabPos3);
|
||
traditionalToSimplified.put(UTF16.valueOf(Integer.parseInt(code, 16)),
|
||
UTF16.valueOf(Integer.parseInt(code2, 16)));
|
||
}
|
||
|
||
|
||
|
||
/* if (code.compareTo("9FA0") >= 0) {
|
||
System.out.println("? " + line);
|
||
}*/
|
||
if (!code.equals(oldCode)) {
|
||
totalCount++;
|
||
|
||
if (foundKey && foundFilter) {
|
||
count++;
|
||
/*if (true) { //*/
|
||
if (TESTING && (count == 1 || (count % 100) == 0)) {
|
||
System.out.println(count + ": " + oldLine);
|
||
}
|
||
storeDef(out, oldCode, oldLine, oldStart);
|
||
}
|
||
if (TESTING) if (count > 1000) {
|
||
System.out.println("ABORTING at 1000 for testing");
|
||
break;
|
||
}
|
||
oldCode = code;
|
||
foundKey = false;
|
||
foundFilter = (filter == null);
|
||
}
|
||
|
||
// detect key, filter. Must be on different lines
|
||
if (!foundFilter && line.indexOf(filter) >= 0) {
|
||
foundFilter = true;
|
||
} else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
|
||
foundKey = true;
|
||
oldLine = line;
|
||
oldStart += key.length();
|
||
}
|
||
}
|
||
if (foundKey && foundFilter) storeDef(out, oldCode, oldLine, oldStart);
|
||
|
||
in.close();
|
||
}
|
||
|
||
static void storeDef(PrintWriter out, String code, String line, int start) {
|
||
if (code.length() == 0) return;
|
||
|
||
// skip spaces & numbers at start
|
||
for (;start < line.length(); ++start) {
|
||
char ch = line.charAt(start);
|
||
if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
|
||
}
|
||
|
||
// go up to comma or semicolon, whichever is earlier
|
||
int end = line.indexOf(";", start);
|
||
if (end < 0) end = line.length();
|
||
|
||
int end2 = line.indexOf(",", start);
|
||
if (end2 < 0) end2 = line.length();
|
||
if (end > end2) end = end2;
|
||
|
||
if (type != DEFINITION) {
|
||
end2 = line.indexOf(" ", start);
|
||
if (end2 < 0) end2 = line.length();
|
||
if (end > end2) end = end2;
|
||
}
|
||
|
||
String definition = line.substring(start,end);
|
||
if (type == CHINESE) {
|
||
// since data are messed up, terminate after first digit
|
||
int end3 = findInString(definition, "12345")+1;
|
||
if (end3 == 0) {
|
||
log.println("Bad pinyin data: " + line);
|
||
end3 = definition.length();
|
||
}
|
||
definition = definition.substring(0, end3);
|
||
|
||
definition = convertPinyin.transliterate(definition);
|
||
}
|
||
if (type == DEFINITION) {
|
||
definition = removeMatched(definition,'(', ')', line);
|
||
definition = removeMatched(definition,'[', ']', line);
|
||
definition = definition.trim();
|
||
definition = Utility.replace(definition, " ", " ");
|
||
definition = "'[" + quoteNonLetters.transliterate(definition) + "]'";
|
||
}
|
||
definition.trim();
|
||
definition = Default.ucd.getCase(definition, FULL, LOWER);
|
||
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
|
||
unihanMap.put(cp, definition);
|
||
/*
|
||
String key = (String) unihanMap.get(definition);
|
||
if (key == null) {
|
||
unihanMap.put(definition, cp);
|
||
}
|
||
out.println(cp + (key == null ? " <> " : " > ") + Default.ucd.getCase(definition, FULL, TITLE) + ";");
|
||
if (TESTING) System.out.println("# " + code + " > " + definition);
|
||
*/
|
||
}
|
||
|
||
// WARNING: not supplementary-safe!
|
||
|
||
static int findInString(String source, String chars) {
|
||
for (int i = 0; i < source.length(); ++i) {
|
||
if (chars.indexOf(source.charAt(i)) >= 0) return i;
|
||
}
|
||
return -1;
|
||
}
|
||
|
||
// WARNING: not supplementary-safe!
|
||
|
||
static String removeMatched(String source, char start, char end, String originalLine) {
|
||
while (true) {
|
||
int pos = source.indexOf(start);
|
||
if (pos < 0) break;
|
||
int epos = source.indexOf(end, pos+1);
|
||
if (epos < 0) {
|
||
epos = source.length()-1;
|
||
log.println("Mismatches with " + start + ", " + end + ": " + originalLine);
|
||
}
|
||
source = source.substring(0,pos) + source.substring(epos+1);
|
||
}
|
||
return source;
|
||
}
|
||
|
||
static Map unihanMap = new HashMap();
|
||
|
||
static StringBuffer handlePinyinTemp = new StringBuffer();
|
||
|
||
static Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex");
|
||
static Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters",
|
||
"([[\\u0021-\\u007E]-[:L:]-[\\']]) > \\u005C $1; \\' > \\'\\';", Transliterator.FORWARD);
|
||
|
||
|
||
|
||
// ADD Factory since otherwise getInverse blows out
|
||
static class DummyFactory implements Transliterator.Factory {
|
||
static DummyFactory singleton = new DummyFactory();
|
||
static HashMap m = new HashMap();
|
||
|
||
// Since Transliterators are immutable, we don't have to clone on set & get
|
||
static void add(String ID, Transliterator t) {
|
||
m.put(ID, t);
|
||
System.out.println("Registering: " + ID + ", " + t.toRules(true));
|
||
Transliterator.registerFactory(ID, singleton);
|
||
}
|
||
public Transliterator getInstance(String ID) {
|
||
return (Transliterator) m.get(ID);
|
||
}
|
||
}
|
||
|
||
/** Converts numbered pinyin to accented pinyin; built in the static initializer below. */
static Transliterator convertPinyin;

/*
 * Builds two rule-based transliterators: "digit-tone", which maps a tone digit to a
 * combining mark (and is registered through DummyFactory so the second rule set can call
 * it), and "digit-pinyin", which moves the tone of a syllable onto the right vowel.
 * Both are exercised once with sample input, printed to System.out, when the class loads.
 */
static {
    String dt = "1 > ;\n"
        + "2 <> \u0301;\n"
        + "3 <> \u0306;\n"
        + "4 <> \u0300;\n"
        + "5 <> \u0304;";

    // The vowel set ends in u-umlaut (U+00FC, U+00DC). It previously held the mis-encoded
    // placeholder "<75><55>", which added '<', '7', '5' and '>' to the vowels, so the
    // tone digit 5 itself counted as a vowel.
    String dp = "# syllable is ...vowel+ consonant* number\n"
        + "# 'a', 'e' are the preferred bases\n"
        + "# otherwise 'o'\n"
        + "# otherwise last vowel\n"
        + "::NFC;\n"
        + "$vowel = [aAeEiIoOuU\u00FC\u00DC];\n"
        + "$consonant = [[a-z A-Z] - [$vowel]];\n"
        + "$digit = [1-5];\n"
        + "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
        + "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
        + "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
        + "::NFC;\n";

    Transliterator at = Transliterator.createFromRules("digit-tone", dt, Transliterator.FORWARD);
    System.out.println(at.transliterate("a1a2a3a4a5"));
    DummyFactory.add(at.getID(), at);

    convertPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
    System.out.println(convertPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
}
|
||
/*
|
||
|
||
static String convertTones(String source, String debugLine) {
|
||
try {
|
||
result = new StringBuffer();
|
||
main:
|
||
for (int i = 0; i < source.length(); ++i) {
|
||
ch = source.charAt(i);
|
||
switch (ch) {
|
||
case ':':
|
||
if (i > 0) {
|
||
char last = result.charAt(result.length()-1);
|
||
if (last == 'u') {
|
||
result.setCharAt(result.length()-1, '<27>');
|
||
continue main;
|
||
} else if (last == 'U') {
|
||
result.setCharAt(result.length()-1, '<27>');
|
||
continue main;
|
||
}
|
||
}
|
||
break;
|
||
case '1': break; // skip character
|
||
case '2': case '3': case '4': case '5':
|
||
applyToPrecedingBase(result, ch-'0');
|
||
break;
|
||
default:
|
||
result.append(ch);
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
source = source.trim();
|
||
char ch = source.charAt(source.length()-1);
|
||
int num = (int)(ch-'1');
|
||
if (num < 0 || num > 5) throw new Exception("none");
|
||
handlePinyinTemp.setLength(0);
|
||
boolean gotIt = false;
|
||
boolean messageIfNoGotIt = true;
|
||
|
||
for (int i = source.length()-2; i >= 0; --i) {
|
||
ch = source.charAt(i);
|
||
if (ch == ':') {
|
||
ch = '<27>';
|
||
--i;
|
||
}
|
||
if ('0' <= ch && ch <= '9') break;
|
||
if (ch != '<27>' && (ch < 'A' || ch > 'Z')) {
|
||
Utility.fixDot();
|
||
System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")");
|
||
break;
|
||
}
|
||
if (!gotIt) switch (ch) {
|
||
case 'A': ch = "A<>\u0102<30>\u0100".charAt(num); gotIt = true; break;
|
||
case 'E': ch = "E<>\u0114<31>\u0112".charAt(num); gotIt = true; break;
|
||
case 'I': ch = "I<>\u012C<32>\u012A".charAt(num); gotIt = true; break;
|
||
case 'O': ch = "O<>\u014E<34>\u014C".charAt(num); gotIt = true; break;
|
||
case 'U': ch = "U<>\u016C<36>\u016A".charAt(num); gotIt = true; break;
|
||
case '<27>': ch = "<22>\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
|
||
}
|
||
handlePinyinTemp.insert(0,ch);
|
||
}
|
||
if (!gotIt && num > 0) {
|
||
handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num));
|
||
if (messageIfNoGotIt) {
|
||
err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp
|
||
.toString());
|
||
}
|
||
}
|
||
source = handlePinyinTemp.toString().toLowerCase();
|
||
} catch (Exception e) {
|
||
log.println("Bad line: " + debugLine);
|
||
}
|
||
return source;
|
||
}
|
||
|
||
/*
|
||
A and e trump all other vowels and always take the tone mark.
|
||
There are no Mandarin syllables that contain both a and e.
|
||
In the combination ou, o takes the mark.
|
||
In all other cases, the final vowel takes the mark.
|
||
*/
|
||
/*
|
||
static String applyToPrecedingBase(StringBuffer result, int tone) {
|
||
for (int i = result.length()-1; i >= 0; --i) {
|
||
char ch = result.charAt(i);
|
||
switch (ch) {
|
||
case 'a': case 'e': case 'A': case 'E':
|
||
result.setCharAt(i, mapTone(ch, tone));
|
||
return;
|
||
case 'o': case 'O': bestSoFar = i; break;
|
||
case 'i': case 'I': case 'u': case 'U': case '
|
||
if (tone == 1) return String.valueOf(ch);
|
||
return Default.nfc.normalize(ch + mapTone[tone]);
|
||
}
|
||
|
||
static final char[] MAP_TONE = {"\u0301", "\u0306", "\u0300", "\u0304"};
|
||
*/
|
||
} |