scuffed-code/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java

81 lines
3.0 KiB
Java
Raw Normal View History

package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.Pair;
import com.ibm.text.utility.Utility;
/**
 * Converts the per-code-point count data in {@code kHYPLCDPF.txt} into a
 * ranked frequency table, {@code kHYPLCDPF_frequency.txt}.
 *
 * <p>Each input line is read as {@code <hex code point> TAB <entries>}, where
 * {@code <entries>} is a comma-separated list and every entry carries a count
 * in parentheses. The counts of one line are summed to give that code point's
 * total; the output lists code points from highest to lowest total together
 * with their share of the grand total and the running cumulative share.
 */
public class ChineseFrequency {
    /** Directory holding the input data file; the generated report is written here too. */
    static final String DICT_DIR = "C:\\DATA\\dict\\";

    /** Formatter for the per-code-point share of the grand total. */
    static NumberFormat percent = new DecimalFormat("0.000000%");

    /** Formatter for the cumulative share; padded to three integer digits so columns line up. */
    static NumberFormat percent3 = new DecimalFormat("000.000000%");

    /** Formatter for raw counts (grouped integer). Not used by {@link #test()} itself. */
    static NumberFormat number = new DecimalFormat("#,##0");

    /**
     * Comparator that reverses the natural ordering of {@link Comparable} objects.
     */
    static class InverseCompareTo implements Comparator {
        /**
         * Compares two objects in reverse natural order.
         *
         * @param o1 first object; must be mutually comparable with {@code o2}
         * @param o2 second object; must implement {@link Comparable}
         * @return a negative, zero or positive value when {@code o1} sorts
         *         after, equal to, or before {@code o2} in natural order
         */
        public int compare(Object o1, Object o2) {
            // Swap the operands instead of negating the result: negating
            // Integer.MIN_VALUE overflows and yields Integer.MIN_VALUE again,
            // which would silently fail to reverse the order.
            return ((Comparable) o2).compareTo(o1);
        }
    }

    /**
     * Reads {@code kHYPLCDPF.txt} from {@link #DICT_DIR}, totals the counts
     * for every code point and writes {@code kHYPLCDPF_frequency.txt} to the
     * same directory, one row per code point in descending order of total.
     *
     * <p>Blank input lines are skipped. Both the reader and the writer are
     * closed even when parsing or I/O fails part-way through.
     *
     * @throws IOException if the input cannot be read or closed
     */
    public static void test() throws IOException {
        // Sorted set of (total, code point) pairs, largest first.
        // NOTE(review): relies on Pair being Comparable, presumably ordering
        // by first then second - confirm against Pair.
        Set freq_char = new TreeSet(new InverseCompareTo());
        double grandTotal = 0.0;

        BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt");
        try {
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                // A blank line (e.g. a trailing newline) carries no data.
                if (line.trim().length() == 0) continue;

                String[] pieces = Utility.split(line, '\t');
                int cp = Integer.parseInt(pieces[0], 16);
                long total = sumCounts(Utility.split(pieces[1], ','));

                grandTotal += total;
                freq_char.add(new Pair(new Long(total), new Integer(cp)));
            }
        } finally {
            br.close();
        }

        PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR, "kHYPLCDPF_frequency.txt");
        try {
            pw.write("\uFEFF"); // leading byte order mark
            pw.println("No.\tPercentage\tAccummulated\tHex\tChar");

            int counter = 0;
            double cumulativePercentage = 0;
            for (Iterator it = freq_char.iterator(); it.hasNext();) {
                Pair item = (Pair) it.next();
                Long total = (Long) item.first;
                Integer cp = (Integer) item.second;

                double percentage = total.longValue() / grandTotal;
                cumulativePercentage += percentage;

                pw.println(
                    ++counter
                    + "\t" + percent.format(percentage)
                    + "\t" + percent3.format(cumulativePercentage)
                    + "\t" + Integer.toHexString(cp.intValue()).toUpperCase()
                    + "\t" + UTF16.valueOf(cp.intValue()));
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Sums the parenthesised counts of a list of entries.
     *
     * @param entries entries that each contain a decimal count between the
     *                first {@code '('} and the first {@code ')'}
     * @return the sum of all counts
     */
    private static long sumCounts(String[] entries) {
        long total = 0;
        for (int i = 0; i < entries.length; ++i) {
            int start = entries[i].indexOf('(');
            int end = entries[i].indexOf(')');
            total += Long.parseLong(entries[i].substring(start + 1, end));
        }
        return total;
    }
}