scuffed-code/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java
2002-08-09 23:56:24 +00:00

351 lines
12 KiB
Java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
// http://java.sun.com/j2se/1.3/docs/guide/intl/encoding.doc.html
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.NumberFormat;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
 * Quick-and-dirty benchmark that compares the JDK collator
 * ({@code java.text.Collator}, reported as "old") with the ICU collator
 * ({@code com.ibm.icu.text.Collator}, reported as "new").
 *
 * <p>For each locale/data-set pair it reports average sort key size, sort key
 * creation time, (optionally) raw compare time, binary search time and full
 * sort time. All times are printed in microseconds per iteration, after
 * subtracting a measured loop/baseline overhead. Timing is based on
 * {@code System.currentTimeMillis()}, so the figures are only rough indications.
 */
abstract public class CheckCollator {

    /** Path prefix of the data files; the data set name and ".txt" are appended. */
    static final String PREFIX = "C:\\ICUInternal\\icu4c\\collation-perf-data\\TestNames_";

    /** When true, also runs the quadratic (every string against every string) compare test. */
    static final boolean DO_RAW = false;

    /** Formats the timing and size figures (at most two fraction digits). */
    static final NumberFormat nf = NumberFormat.getInstance();

    /** Formats the new/old ratios. */
    static final NumberFormat percent = NumberFormat.getPercentInstance();

    /** Number of strings used per run; keeps the running time bounded. */
    private static final int LIMIT = 800;

    /**
     * Receives the accumulated compare/search results so that the work done in
     * the timed loops has an observable effect and cannot be discarded.
     */
    private static volatile int sink;

    static {
        nf.setMaximumFractionDigits(2);
    }

    /**
     * Runs the benchmark for a fixed set of locale/data-set pairs.
     *
     * @param args currently ignored
     * @throws IOException if a data file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        // later, drive off of args
        // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
        //test(Locale.KOREAN, "Korean");
        test(Locale.ENGLISH, "Latin");
        test(Locale.FRENCH, "Latin");
        test(Locale.JAPANESE, "Japanese");
    }

    /**
     * Benchmarks the old and new collators for one locale on one data set and
     * prints the results to {@code System.out}.
     *
     * @param loc  locale for which both collators are created
     * @param name data set name; the file read is {@code PREFIX + name + ".txt"}
     * @throws IOException if the data file cannot be opened or read
     */
    public static void test(Locale loc, String name) throws IOException {
        System.out.println();
        System.out.println("Testing " + loc.getDisplayName() + ", file: " + name);
        System.out.println();

        // get test data
        String fileName = PREFIX + name + ".txt";
        ArrayList list = readTestLines(fileName);
        System.out.println("Read " + list.size() + " lines in file");

        // An empty list can never be pumped up to LIMIT entries (doubling zero
        // stays zero), so report it and skip instead of looping forever.
        if (list.isEmpty()) {
            System.out.println("No test data in " + fileName + "; skipping");
            return;
        }

        // pump it up if there aren't very many
        while (list.size() < LIMIT) {
            list.addAll(new ArrayList(list));
        }
        int size = Math.min(list.size(), LIMIT);

        // later, adjust these so we always get a reasonable number of tries
        int extraIterations = 200;

        String[] tests = new String[size];
        for (int i = 0; i < size; ++i) {
            tests[i] = (String) list.get(i);
        }

        // get collators
        com.ibm.icu.text.Collator newCol = com.ibm.icu.text.Collator.getInstance(loc);
        java.text.Collator oldCol = java.text.Collator.getInstance(loc);

        long startTime, endTime;
        double delta, oldDelta;
        String probe;

        // load classes at least once before starting
        newCol.compare("a", "b");
        oldCol.compare("a", "b");

        // ================================================
        // check sort key size

        int stringSize = 0, newSize = 0, oldSize = 0;
        for (int i = 0; i < size; ++i) {
            stringSize += tests[i].length() * 2; // two bytes per UTF-16 code unit
            newSize += newCol.getCollationKey(tests[i]).toByteArray().length;
            oldSize += oldCol.getCollationKey(tests[i]).toByteArray().length;
        }

        delta = stringSize / (size + 0.0);
        System.out.println("string size: " + nf.format(delta) + " bytes per key");
        System.out.println();

        delta = oldDelta = (oldSize / (size + 0.0));
        System.out.println("old sortkey size: " + nf.format(delta) + " bytes per key ");

        delta = (newSize / (size + 0.0));
        System.out.println("new sortkey size: " + nf.format(delta) + " bytes per key " + percent.format(delta / oldDelta));
        System.out.println();

        // ================================================
        // Sort Key: old time

        // get overhead time (cost of the bare counting loop)
        int counter = 0;
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int j = 0; j < size; ++j) {
                counter++;
            }
        }
        endTime = System.currentTimeMillis();
        double overhead = microsPerIteration(startTime, endTime, counter);
        // print the value that is actually in microseconds, matching the label
        System.out.println("overhead: " + nf.format(overhead) + " micros");

        counter = 0;
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                oldCol.getCollationKey(probe);
                counter++;
            }
        }
        endTime = System.currentTimeMillis();
        oldDelta = delta = microsPerIteration(startTime, endTime, counter) - overhead;
        System.out.println("Old sort key time: " + nf.format(delta)
            + " micros (" + counter + " iterations)");

        // Sort Key: new time

        counter = 0;
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                newCol.getCollationKey(probe);
                counter++;
            }
        }
        endTime = System.currentTimeMillis();
        delta = microsPerIteration(startTime, endTime, counter) - overhead;
        System.out.println("New sort key time: " + nf.format(delta)
            + " micros (" + counter + " iterations) " + percent.format(delta / oldDelta));
        System.out.println();

        // ================================================
        // Raw Compare

        if (DO_RAW) {

            // get overhead time (plain String.compareTo over the same pairs)
            counter = 0;
            startTime = System.currentTimeMillis();
            int opt = 0; // to keep the compiler from optimizing out
            for (int i = 0; i < size; ++i) {
                probe = tests[i];
                for (int j = 0; j < size; ++j) {
                    opt ^= probe.compareTo(tests[j]);
                    counter++;
                }
            }
            endTime = System.currentTimeMillis();
            overhead = microsPerIteration(startTime, endTime, counter);
            System.out.println("overhead: " + nf.format(overhead) + " micros");

            // Raw Compare: old time

            counter = 0;
            startTime = System.currentTimeMillis();
            for (int i = 0; i < size; ++i) {
                probe = tests[i];
                for (int j = 0; j < size; ++j) {
                    opt ^= oldCol.compare(probe, tests[j]);
                    counter++;
                }
            }
            endTime = System.currentTimeMillis();
            oldDelta = delta = microsPerIteration(startTime, endTime, counter) - overhead;
            System.out.println("Old raw compare time: " + nf.format(delta)
                + " micros (" + counter + " iterations)");

            // Raw Compare: new time

            counter = 0;
            startTime = System.currentTimeMillis();
            for (int i = 0; i < size; ++i) {
                probe = tests[i];
                for (int j = 0; j < size; ++j) {
                    opt ^= newCol.compare(probe, tests[j]);
                    counter++;
                }
            }
            endTime = System.currentTimeMillis();
            delta = microsPerIteration(startTime, endTime, counter) - overhead;
            System.out.println("New raw compare time: " + nf.format(delta)
                + " micros (" + counter + " iterations) " + percent.format(delta / oldDelta));
            System.out.println();

            sink = opt;
        }

        // ================================================
        // Binary Search
        // note: I don't worry about getting the binary search precisely right, since I just want to
        // see which strings would get compared.
        //
        // The array is sorted with the matching ordering before each clock is
        // started, so that all three measurements cover the searches only.

        // overhead (natural String ordering)

        int iterations = (size * extraIterations);
        Arrays.sort(tests);
        int opt2 = 0; // keep from optimizing out
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                opt2 ^= Arrays.binarySearch(tests, probe);
            }
        }
        endTime = System.currentTimeMillis();
        overhead = delta = microsPerIteration(startTime, endTime, iterations);
        System.out.println("Overhead: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // old time

        Arrays.sort(tests, oldCol);
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                opt2 ^= Arrays.binarySearch(tests, probe, oldCol);
            }
        }
        endTime = System.currentTimeMillis();
        oldDelta = delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("Old binary search time: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // new time

        Arrays.sort(tests, newCol);
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                opt2 ^= Arrays.binarySearch(tests, probe, newCol);
            }
        }
        endTime = System.currentTimeMillis();
        delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("New binary search time: " + nf.format(delta)
            + " micros (" + iterations + " iterations) " + percent.format(delta / oldDelta));
        System.out.println();

        sink = opt2;

        // ================================================
        // Sort
        // Each iteration copies the test array and sorts the whole copy, so the
        // figures below are per full sort of `size` strings.

        String[] sortTests = (String[]) tests.clone();
        extraIterations = 5;
        iterations = (size * extraIterations);

        // overhead (array copy plus natural-order sort)

        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int k = 0; k < extraIterations; ++k) {
                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
                Arrays.sort(sortTests);
            }
        }
        endTime = System.currentTimeMillis();
        overhead = delta = microsPerIteration(startTime, endTime, iterations);
        System.out.println("overhead: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // old time

        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int k = 0; k < extraIterations; ++k) {
                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
                Arrays.sort(sortTests, oldCol);
            }
        }
        endTime = System.currentTimeMillis();
        oldDelta = delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("Old sort time: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // new time

        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int k = 0; k < extraIterations; ++k) {
                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
                Arrays.sort(sortTests, newCol);
            }
        }
        endTime = System.currentTimeMillis();
        delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("New sort time: " + nf.format(delta)
            + " micros (" + iterations + " iterations) " + percent.format(delta / oldDelta));
    }

    /**
     * Reads every non-empty data line of the given file, showing progress via
     * {@code Utility.dot}. The stream is always closed, even when reading fails.
     */
    private static ArrayList readTestLines(String fileName) throws IOException {
        ArrayList lines = new ArrayList();
        FileInputStream fis = new FileInputStream(fileName);
        try {
            InputStreamReader isr = new InputStreamReader(fis, "UnicodeLittle");
            BufferedReader br = new BufferedReader(isr, 32*1024);
            int counter = 0;
            while (true) {
                String line = Utility.readDataLine(br);
                if (line == null) break;
                if (line.length() == 0) continue;
                Utility.dot(counter++);
                lines.add(line);
            }
        } finally {
            // closing the underlying stream releases the file handle
            fis.close();
        }
        return lines;
    }

    /** Converts an elapsed wall-clock interval in milliseconds to microseconds per iteration. */
    private static double microsPerIteration(long startMillis, long endMillis, int count) {
        return 1000.0 * (endMillis - startMillis) / count;
    }
}