81 lines
3.0 KiB
Java
81 lines
3.0 KiB
Java
|
package com.ibm.text.UCD;
|
||
|
|
||
|
import java.io.BufferedReader;
|
||
|
import java.io.IOException;
|
||
|
import java.io.PrintWriter;
|
||
|
import java.util.Comparator;
|
||
|
import java.util.Iterator;
|
||
|
import java.util.Map;
|
||
|
import java.util.Set;
|
||
|
import java.util.TreeMap;
|
||
|
import java.util.TreeSet;
|
||
|
|
||
|
import com.ibm.icu.dev.test.util.BagFormatter;
|
||
|
import com.ibm.icu.text.DecimalFormat;
|
||
|
import com.ibm.icu.text.NumberFormat;
|
||
|
import com.ibm.icu.text.UTF16;
|
||
|
import com.ibm.text.utility.Pair;
|
||
|
import com.ibm.text.utility.Utility;
|
||
|
|
||
|
public class ChineseFrequency {
|
||
|
static final String DICT_DIR = "C:\\DATA\\dict\\";
|
||
|
static NumberFormat percent = new DecimalFormat("0.000000%");
|
||
|
static NumberFormat percent3 = new DecimalFormat("000.000000%");
|
||
|
static NumberFormat number = new DecimalFormat("#,##0");
|
||
|
|
||
|
static class InverseCompareTo implements Comparator {
|
||
|
public int compare(Object o1, Object o2) {
|
||
|
return -((Comparable)o1).compareTo(o2);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public static void test() throws IOException{
|
||
|
Set freq_char = new TreeSet(new InverseCompareTo());
|
||
|
BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt");
|
||
|
double grandTotal = 0.0;
|
||
|
while (true) {
|
||
|
String line = br.readLine();
|
||
|
if (line == null) break;
|
||
|
String[] pieces = Utility.split(line,'\t');
|
||
|
int cp = Integer.parseInt(pieces[0],16);
|
||
|
String[] says = Utility.split(pieces[1],',');
|
||
|
long total = 0;
|
||
|
for (int i = 0; i < says.length; ++i) {
|
||
|
int start = says[i].indexOf('(');
|
||
|
int end = says[i].indexOf(')');
|
||
|
long count = Long.parseLong(says[i].substring(start+1, end));
|
||
|
total += count;
|
||
|
}
|
||
|
grandTotal += total;
|
||
|
freq_char.add(new Pair(new Long(total), new Integer(cp)));
|
||
|
}
|
||
|
br.close();
|
||
|
PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR,"kHYPLCDPF_frequency.txt");
|
||
|
pw.write("\uFEFF");
|
||
|
pw.println("No.\tPercentage\tAccummulated\tHex\tChar");
|
||
|
|
||
|
Iterator it = freq_char.iterator();
|
||
|
int counter = 0;
|
||
|
double cummulative = 0;
|
||
|
double cummulativePercentage = 0;
|
||
|
while (it.hasNext()) {
|
||
|
Pair item = (Pair)it.next();
|
||
|
Long total = (Long) item.first;
|
||
|
Integer cp = (Integer) item.second;
|
||
|
double current = total.longValue();
|
||
|
cummulative += current;
|
||
|
double percentage = current / grandTotal;
|
||
|
cummulativePercentage += percentage;
|
||
|
pw.println(
|
||
|
++counter
|
||
|
//+ "\t" + number.format(current)
|
||
|
//+ "\t" + number.format(cummulative)
|
||
|
+ "\t" + percent.format(percentage)
|
||
|
+ "\t" + percent3.format(cummulativePercentage)
|
||
|
+ "\t" + Integer.toHexString(cp.intValue()).toUpperCase()
|
||
|
+ "\t" + UTF16.valueOf(cp.intValue()));
|
||
|
}
|
||
|
//pw.println("Grand total: " + (long)grandTotal);
|
||
|
pw.close();
|
||
|
}
|
||
|
}
|