added utilities
X-SVN-Rev: 9932
This commit is contained in:
parent
68ae250e76
commit
ef407b98a6
329
tools/unicodetools/com/ibm/text/utility/FastBinarySearch.java
Normal file
329
tools/unicodetools/com/ibm/text/utility/FastBinarySearch.java
Normal file
@ -0,0 +1,329 @@
|
||||
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FastBinarySearch.java,v $
|
||||
* $Date: 2002/10/01 01:12:10 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.utility;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.Arrays;
|
||||
import com.ibm.icu.text.NumberFormat;
|
||||
|
||||
/**
|
||||
* Quick & Dirty test program for fast (unrolled) binary search
|
||||
* Should use new PerfTest once that is done, although since there is no object
|
||||
* creation the numbers should be fairly reliable.
|
||||
*/
|
||||
|
||||
final public class FastBinarySearch {
|
||||
|
||||
/**
|
||||
* Testing
|
||||
*/
|
||||
|
||||
static void test() {
|
||||
perfTest(100, 100); // warmup
|
||||
|
||||
// try different combinations of data size and iterations
|
||||
|
||||
perfTest(100, 200000);
|
||||
perfTest(1000, 2000);
|
||||
perfTest(100000, 200);
|
||||
|
||||
// skip the following
|
||||
if (true) return;
|
||||
validityTest();
|
||||
}
|
||||
|
||||
static void perfTest(int dataSize, int iterations) {
|
||||
NumberFormat percent = NumberFormat.getPercentInstance();
|
||||
percent.setMaximumFractionDigits(0);
|
||||
|
||||
Random random = new Random(123456789L);
|
||||
int[] myData = new int[dataSize];
|
||||
FastBinarySearch fbs = new FastBinarySearch();
|
||||
|
||||
// produce test case
|
||||
for (int i = 0; i < myData.length; ++i) {
|
||||
myData[i] = (int) (random.nextDouble() * myData.length * 3);
|
||||
}
|
||||
Arrays.sort(myData, 0, myData.length);
|
||||
fbs.setData(myData, myData.length);
|
||||
|
||||
// produce probe data
|
||||
int[] probe = new int[myData.length*2];
|
||||
for (int i = 0; i < probe.length; ++i) {
|
||||
probe[i] = (int) (random.nextDouble() * myData.length * 3);
|
||||
}
|
||||
|
||||
int sum = 0;
|
||||
double startTime, endTime, time, baseTime;
|
||||
|
||||
System.out.println();
|
||||
long totalIterations = iterations * probe.length;
|
||||
System.out.println("Iterations = " + totalIterations + ", Data size = " + dataSize);
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
for (int testCount = 0; testCount < iterations; ++testCount) {
|
||||
for (int i = 0; i < probe.length; ++i) {
|
||||
sum += fbs.findCodePoint(i);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
baseTime = time = (endTime - startTime)*1000/totalIterations;
|
||||
System.out.println("Basic; time=" + time + " microsecs/call");
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
for (int testCount = 0; testCount < iterations; ++testCount) {
|
||||
for (int i = 0; i < probe.length; ++i) {
|
||||
sum += fbs.highestIndexLEQ(i);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
time = (endTime - startTime)*1000/totalIterations;
|
||||
System.out.println("Fast; time=" + time + " microsecs/call\t" + percent.format(time/baseTime-1));
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
for (int testCount = 0; testCount < iterations; ++testCount) {
|
||||
for (int i = 0; i < probe.length; ++i) {
|
||||
sum += fbs.highestIndexLEQ2(i);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
time = (endTime - startTime)*1000/totalIterations;
|
||||
System.out.println("Compact; time=" + time + " microsecs/call\t" + percent.format(time/baseTime-1));
|
||||
}
|
||||
|
||||
|
||||
static void validityTest() {
|
||||
Random random = new Random(123456789L);
|
||||
int[] myData = new int[50];
|
||||
FastBinarySearch fbs = new FastBinarySearch();
|
||||
|
||||
for (int testCount = 0; testCount < 100; ++testCount) {
|
||||
|
||||
// produce test case
|
||||
double ran = random.nextDouble();
|
||||
//System.out.println(ran);
|
||||
int myCount = 2+ (int) (ran * (myData.length - 2));
|
||||
for (int i = 0; i < myCount; ++i) {
|
||||
ran = random.nextDouble();
|
||||
//System.out.println(ran);
|
||||
myData[i] = (int) (ran * myData.length * 3);
|
||||
}
|
||||
System.out.println("Trial " + testCount + ", len: " + myCount);
|
||||
Arrays.sort(myData, 0, myCount);
|
||||
fbs.setData(myData, myCount);
|
||||
|
||||
// compare brute force & fast methods
|
||||
boolean ok = true;
|
||||
for (int i = -1; i < myData.length * 3 + 1; ++i) {
|
||||
int brute = fbs.bruteForce(i);
|
||||
int fast = fbs.highestIndexLEQ(i);
|
||||
if (fast != brute) {
|
||||
if (ok) {
|
||||
System.out.println(fbs);
|
||||
}
|
||||
System.out.println("Error: probe=" + i + ", brute=" + brute + ", fast=" + fast);
|
||||
fast = fbs.highestIndexLEQ(i); // do again with debugger
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
if (!ok) return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the data to be scanned. It must be in sorted order.
|
||||
*/
|
||||
|
||||
public void setData(int data[], int count) {
|
||||
|
||||
this.data = (int[]) data.clone(); // clone for safety
|
||||
isValid = this.count == count; // isValid only depends on the count remaining the same
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic binary search
|
||||
*/
|
||||
|
||||
private final int findCodePoint(int c) {
|
||||
// Return the smallest i such that c < list[i]. Assume
|
||||
// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
|
||||
if (c < data[0]) return 0;
|
||||
int lo = 0;
|
||||
int hi = count - 1;
|
||||
// invariant: c >= list[lo]
|
||||
// invariant: c < list[hi]
|
||||
for (;;) {
|
||||
int i = (lo + hi) >>> 1;
|
||||
if (i == lo) return hi;
|
||||
if (c < data[i]) {
|
||||
hi = i;
|
||||
} else {
|
||||
lo = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return greatest index whose value is less than or equal to the searchValue.
|
||||
* If there is no such index, then -1 is returned
|
||||
*/
|
||||
|
||||
public int bruteForce(int searchValue) {
|
||||
int i = count;
|
||||
while (--i >= 0 && data[i] > searchValue) {}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return greatest index such that data[index] <= searchValue
|
||||
* If there is no such index (e.g. searchValue < data[0]), then -1 is returned
|
||||
*/
|
||||
|
||||
public int highestIndexLEQ(int searchValue) {
|
||||
|
||||
if (!isValid) validate();
|
||||
int temp;
|
||||
|
||||
// set up initial range to search. Each subrange is a power of two in length
|
||||
int high = searchValue < data[topOfLow] ? topOfLow : topOfHigh;
|
||||
|
||||
// Completely unrolled binary search, folhighing "Programming Pearls"
|
||||
// Each case deliberately falls through to the next
|
||||
// Logically, data[-1] < all_search_values && data[count] > all_search_values
|
||||
// although the values -1 and count are never actually touched.
|
||||
|
||||
// The bounds at each point are low & high,
|
||||
// where low == high - delta*2
|
||||
// so high - delta is the midpoint
|
||||
|
||||
// The invariant AFTER each line is that data[low] < searchValue <= data[high]
|
||||
|
||||
switch (power) {
|
||||
//case 31: if (searchValue < data[temp = high-0x40000000]) high = temp; // no unsigned int in Java
|
||||
case 30: if (searchValue < data[temp = high-0x20000000]) high = temp;
|
||||
case 29: if (searchValue < data[temp = high-0x10000000]) high = temp;
|
||||
|
||||
case 28: if (searchValue < data[temp = high- 0x8000000]) high = temp;
|
||||
case 27: if (searchValue < data[temp = high- 0x4000000]) high = temp;
|
||||
case 26: if (searchValue < data[temp = high- 0x2000000]) high = temp;
|
||||
case 25: if (searchValue < data[temp = high- 0x1000000]) high = temp;
|
||||
|
||||
case 24: if (searchValue < data[temp = high- 0x800000]) high = temp;
|
||||
case 23: if (searchValue < data[temp = high- 0x400000]) high = temp;
|
||||
case 22: if (searchValue < data[temp = high- 0x200000]) high = temp;
|
||||
case 21: if (searchValue < data[temp = high- 0x100000]) high = temp;
|
||||
|
||||
case 20: if (searchValue < data[temp = high- 0x80000]) high = temp;
|
||||
case 19: if (searchValue < data[temp = high- 0x40000]) high = temp;
|
||||
case 18: if (searchValue < data[temp = high- 0x20000]) high = temp;
|
||||
case 17: if (searchValue < data[temp = high- 0x10000]) high = temp;
|
||||
|
||||
case 16: if (searchValue < data[temp = high- 0x8000]) high = temp;
|
||||
case 15: if (searchValue < data[temp = high- 0x4000]) high = temp;
|
||||
case 14: if (searchValue < data[temp = high- 0x2000]) high = temp;
|
||||
case 13: if (searchValue < data[temp = high- 0x1000]) high = temp;
|
||||
|
||||
case 12: if (searchValue < data[temp = high- 0x800]) high = temp;
|
||||
case 11: if (searchValue < data[temp = high- 0x400]) high = temp;
|
||||
case 10: if (searchValue < data[temp = high- 0x200]) high = temp;
|
||||
case 9: if (searchValue < data[temp = high- 0x100]) high = temp;
|
||||
|
||||
case 8: if (searchValue < data[temp = high- 0x80]) high = temp;
|
||||
case 7: if (searchValue < data[temp = high- 0x40]) high = temp;
|
||||
case 6: if (searchValue < data[temp = high- 0x20]) high = temp;
|
||||
case 5: if (searchValue < data[temp = high- 0x10]) high = temp;
|
||||
|
||||
case 4: if (searchValue < data[temp = high- 0x8]) high = temp;
|
||||
case 3: if (searchValue < data[temp = high- 0x4]) high = temp;
|
||||
case 2: if (searchValue < data[temp = high- 0x2]) high = temp;
|
||||
case 1: if (searchValue < data[temp = high- 0x1]) high = temp;
|
||||
}
|
||||
if (high == topOfHigh && searchValue >= data[high]) return high;
|
||||
return high-1;
|
||||
}
|
||||
|
||||
|
||||
// NOTE: on some machines the above may not be optimal, if the size of the function
|
||||
// forces code out of the cache. For that case, it would be better for program in a loop, like the following
|
||||
|
||||
public int highestIndexLEQ2(int searchValue) {
|
||||
|
||||
if (!isValid) validate();
|
||||
int temp;
|
||||
int high = searchValue < data[topOfLow] ? topOfLow : topOfHigh;
|
||||
for (int delta = deltaStart; delta != 0; delta >>= 1) {
|
||||
if (searchValue < data[temp = high-delta]) high = temp;
|
||||
}
|
||||
if (high == topOfHigh && searchValue >= data[high]) return high;
|
||||
return high-1;
|
||||
}
|
||||
|
||||
/**
|
||||
* For debugging
|
||||
*/
|
||||
public String toString() {
|
||||
String result = "[";
|
||||
for (int j = 0; j < count; ++j) {
|
||||
if (j != 0) result += ", ";
|
||||
result += data[j];
|
||||
}
|
||||
result += "]";
|
||||
result += ", power: " + power;
|
||||
result += ", topOfLow: " + topOfLow;
|
||||
result += ", topOfHigh: " + topOfHigh;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ================ Privates ================
|
||||
|
||||
// data
|
||||
|
||||
int data[];
|
||||
int count;
|
||||
|
||||
// validate internal parameters
|
||||
|
||||
private void validate() {
|
||||
if (count <= 1) throw new IllegalArgumentException("Array must have at least 2 elements");
|
||||
|
||||
// find greatest power of 2 less than or equal to count
|
||||
for (power = exp2.length-1; power > 0 && exp2[power] > count; power--) {}
|
||||
|
||||
// determine the starting points
|
||||
topOfLow = exp2[power] - 1;
|
||||
topOfHigh = count - 1;
|
||||
deltaStart = exp2[power-1];
|
||||
isValid = true;
|
||||
}
|
||||
|
||||
private boolean isValid = false;
|
||||
private int topOfLow;
|
||||
private int topOfHigh;
|
||||
private int power;
|
||||
private int deltaStart;
|
||||
|
||||
private static final int exp2[] = {
|
||||
0x1, 0x2, 0x4, 0x8,
|
||||
0x10, 0x20, 0x40, 0x80,
|
||||
0x100, 0x200, 0x400, 0x800,
|
||||
0x1000, 0x2000, 0x4000, 0x8000,
|
||||
0x10000, 0x20000, 0x40000, 0x80000,
|
||||
0x100000, 0x200000, 0x400000, 0x800000,
|
||||
0x1000000, 0x2000000, 0x4000000, 0x8000000,
|
||||
0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
|
||||
};
|
||||
}
|
@ -0,0 +1,96 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FileLineIterator.java,v $
|
||||
* $Date: 2002/10/01 01:12:10 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.utility;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.*;
|
||||
import java.io.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.ReplaceableString;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
|
||||
import com.ibm.text.UCD.*;
|
||||
|
||||
/**
|
||||
* Opens a file, and iterates through the lines in the file.
|
||||
* Options allow trimming and comment handling, and splitting
|
||||
*/
|
||||
public class FileLineIterator {
|
||||
static public final char NOTCHAR = '\uFFFF';
|
||||
|
||||
// public writable
|
||||
public boolean doCounter = true;
|
||||
public int lineLimit = Integer.MAX_VALUE;
|
||||
public char commentChar = '#'; // NOTCHAR if no comments
|
||||
public boolean showFilename = true;
|
||||
|
||||
// public readable
|
||||
public String originalLine = "";
|
||||
public String cleanedLine = "";
|
||||
public int counter = 0;
|
||||
|
||||
private BufferedReader br = null;
|
||||
private boolean isUTF8 = false;
|
||||
|
||||
/**
|
||||
* Open the file for reading. If useGenDir is set, use the normal generation directory
|
||||
*/
|
||||
public void open(String filename, boolean isUTF8) throws IOException {
|
||||
if (showFilename) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Reading File: " + new File(filename).getCanonicalPath());
|
||||
}
|
||||
br = Utility.openReadFile(filename, isUTF8);
|
||||
this.isUTF8 = isUTF8;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a non-zero-length line from the file, stripping comments & using counter, according to settings.
|
||||
*/
|
||||
public String read() throws IOException {
|
||||
while (true) {
|
||||
if (counter >= lineLimit) return null;
|
||||
cleanedLine = originalLine = br.readLine();
|
||||
if (doCounter) Utility.dot(counter++);
|
||||
if (cleanedLine == null) return null;
|
||||
|
||||
// drop BOM
|
||||
if (isUTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
|
||||
cleanedLine = cleanedLine.substring(1);
|
||||
}
|
||||
|
||||
// drop comment
|
||||
if (commentChar != NOTCHAR) {
|
||||
int commentPos = cleanedLine.indexOf(commentChar);
|
||||
if (commentPos >= 0) cleanedLine = cleanedLine.substring(0, commentPos);
|
||||
}
|
||||
cleanedLine = cleanedLine.trim();
|
||||
if (cleanedLine.length() != 0) return cleanedLine;
|
||||
}
|
||||
}
|
||||
|
||||
public int readSplit(String[] results, char delimiter) throws IOException {
|
||||
String line = read();
|
||||
if (line == null) return 0;
|
||||
return Utility.split(line, delimiter, results);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
Utility.fixDot();
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user