package com.ibm.text.UCD;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.Pair;
import com.ibm.text.utility.Utility;

/**
 * Converts the tab-separated file {@code kHYPLCDPF.txt} into a per-character
 * frequency table, {@code kHYPLCDPF_frequency.txt}, ordered from the highest
 * total count to the lowest.
 *
 * <p>Both files live in {@link #DICT_DIR}.
 */
public class ChineseFrequency {
    /** Directory holding both the input file and the generated report. */
    static final String DICT_DIR = "C:\\DATA\\dict\\";

    /** Formats a single character's share of the grand total. */
    static NumberFormat percent = new DecimalFormat("0.000000%");

    /** Formats the accumulated share; three integer digits so that 100% lines up. */
    static NumberFormat percent3 = new DecimalFormat("000.000000%");

    /** Grouped integer format; not used by the report as currently written. */
    static NumberFormat number = new DecimalFormat("#,##0");

    /**
     * Comparator that reverses the natural ordering of {@link Comparable}
     * elements.
     */
    static class InverseCompareTo implements Comparator {
        /**
         * Compares {@code o2} with {@code o1}, i.e. with the operands swapped.
         * Swapping is used rather than negating {@code o1.compareTo(o2)}
         * because negating {@code Integer.MIN_VALUE} yields
         * {@code Integer.MIN_VALUE} again, which would leave the order
         * un-reversed for that result.
         *
         * @throws ClassCastException if {@code o2} is not {@link Comparable}
         */
        public int compare(Object o1, Object o2) {
            return ((Comparable) o2).compareTo(o1);
        }
    }

    /**
     * Reads {@code kHYPLCDPF.txt}, sums the counts of every character and
     * writes the frequency report {@code kHYPLCDPF_frequency.txt}.
     *
     * <p>The input is read completely and closed before the output file is
     * opened.
     *
     * @throws IOException if the input cannot be read or the output cannot be
     *         opened
     * @throws NumberFormatException if a code point or a count is not a valid
     *         number
     * @throws IndexOutOfBoundsException if a line lacks the second field or an
     *         entry lacks its parenthesized count
     */
    public static void test() throws IOException {
        Set totals = new TreeSet(new InverseCompareTo());
        double grandTotal = readTotals(totals);
        writeReport(totals, grandTotal);
    }

    /**
     * Parses the input file and adds one {@code Pair(total, codePoint)} to
     * {@code totals} per line.
     *
     * <p>Each line holds a hexadecimal code point, a tab, and a
     * comma-separated list of entries; the decimal number between the first
     * {@code '('} and the first {@code ')'} of each entry is that entry's
     * count.
     *
     * @param totals receives the per-character totals
     * @return the sum of all counts in the file
     */
    private static double readTotals(Set totals) throws IOException {
        double grandTotal = 0.0;
        BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt");
        try {
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                String[] pieces = Utility.split(line, '\t');
                int cp = Integer.parseInt(pieces[0], 16);
                String[] entries = Utility.split(pieces[1], ',');
                long total = 0;
                for (int i = 0; i < entries.length; ++i) {
                    int start = entries[i].indexOf('(');
                    int end = entries[i].indexOf(')');
                    total += Long.parseLong(entries[i].substring(start + 1, end));
                }
                grandTotal += total;
                // NOTE(review): the resulting order depends on Pair's compareTo,
                // presumably first (total) then second (code point) -- confirm.
                totals.add(new Pair(new Long(total), new Integer(cp)));
            }
        } finally {
            // Release the file handle even when a line fails to parse.
            br.close();
        }
        return grandTotal;
    }

    /**
     * Writes the report: a byte order mark, a header row, then one row per
     * character with its rank, its share of {@code grandTotal}, the
     * accumulated share so far, the upper-case hexadecimal code point and the
     * character itself.
     *
     * @param totals {@code Pair(Long total, Integer codePoint)} elements in
     *        output order
     * @param grandTotal the sum of all totals, used as the denominator
     */
    private static void writeReport(Set totals, double grandTotal) throws IOException {
        PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR, "kHYPLCDPF_frequency.txt");
        try {
            pw.write("\uFEFF");
            pw.println("No.\tPercentage\tAccummulated\tHex\tChar");
            int counter = 0;
            // Accumulate exact integer counts and divide once per row, so the
            // accumulated column does not pick up floating-point drift.
            long runningCount = 0;
            for (Iterator it = totals.iterator(); it.hasNext();) {
                Pair item = (Pair) it.next();
                long count = ((Long) item.first).longValue();
                int cp = ((Integer) item.second).intValue();
                runningCount += count;
                pw.println(++counter
                        + "\t" + percent.format(count / grandTotal)
                        + "\t" + percent3.format(runningCount / grandTotal)
                        + "\t" + Integer.toHexString(cp).toUpperCase()
                        + "\t" + UTF16.valueOf(cp));
            }
        } finally {
            pw.close();
        }
    }
}