diff --git a/tools/unicodetools/com/ibm/text/utility/FastBinarySearch.java b/tools/unicodetools/com/ibm/text/utility/FastBinarySearch.java
new file mode 100644
index 0000000000..d20da339d7
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/utility/FastBinarySearch.java
@@ -0,0 +1,358 @@
+
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FastBinarySearch.java,v $
+* $Date: 2002/10/01 01:12:10 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.utility;
+
+import java.util.Random;
+import java.util.Arrays;
+import com.ibm.icu.text.NumberFormat;
+
+/**
+ * Unrolled ("fast") binary search over a sorted int array, bundled with a
+ * quick & dirty test program that times it against a basic binary search.
+ * Should use new PerfTest once that is done, although since there is no object
+ * creation the numbers should be fairly reliable.
+ */
+
+final public class FastBinarySearch {
+
+    /**
+     * Testing: runs the timing comparison for several data sizes.
+     * The validity test is currently skipped by the early return.
+     */
+
+    static void test() {
+        perfTest(100, 100); // warmup
+
+        // try different combinations of data size and iterations
+
+        perfTest(100, 200000);
+        perfTest(1000, 2000);
+        perfTest(100000, 200);
+
+        // skip the following
+        if (true) return;
+        validityTest();
+    }
+
+    /**
+     * Times findCodePoint, highestIndexLEQ and highestIndexLEQ2 on the same sorted
+     * pseudo-random data and prints the cost per call of each to System.out.
+     *
+     * @param dataSize number of elements in the array that is searched
+     * @param iterations number of passes made over the probe values
+     */
+    static void perfTest(int dataSize, int iterations) {
+        NumberFormat percent = NumberFormat.getPercentInstance();
+        percent.setMaximumFractionDigits(0);
+
+        Random random = new Random(123456789L);
+        int[] myData = new int[dataSize];
+        FastBinarySearch fbs = new FastBinarySearch();
+
+        // produce test case
+        for (int i = 0; i < myData.length; ++i) {
+            myData[i] = (int) (random.nextDouble() * myData.length * 3);
+        }
+        Arrays.sort(myData, 0, myData.length);
+        fbs.setData(myData, myData.length);
+
+        // produce probe data
+        int[] probe = new int[myData.length*2];
+        for (int i = 0; i < probe.length; ++i) {
+            probe[i] = (int) (random.nextDouble() * myData.length * 3);
+        }
+
+        int sum = 0;
+        double startTime, endTime, time, baseTime;
+
+        System.out.println();
+        // widen before multiplying so that a large run cannot overflow int
+        long totalIterations = (long) iterations * probe.length;
+        System.out.println("Iterations = " + totalIterations + ", Data size = " + dataSize);
+
+        // each timing loop searches for the random probe values, not for the loop index
+        startTime = System.currentTimeMillis();
+        for (int testCount = 0; testCount < iterations; ++testCount) {
+            for (int i = 0; i < probe.length; ++i) {
+                sum += fbs.findCodePoint(probe[i]);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        baseTime = time = (endTime - startTime)*1000/totalIterations;
+        System.out.println("Basic; time=" + time + " microsecs/call");
+
+        startTime = System.currentTimeMillis();
+        for (int testCount = 0; testCount < iterations; ++testCount) {
+            for (int i = 0; i < probe.length; ++i) {
+                sum += fbs.highestIndexLEQ(probe[i]);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        time = (endTime - startTime)*1000/totalIterations;
+        System.out.println("Fast; time=" + time + " microsecs/call\t" + percent.format(time/baseTime-1));
+
+        startTime = System.currentTimeMillis();
+        for (int testCount = 0; testCount < iterations; ++testCount) {
+            for (int i = 0; i < probe.length; ++i) {
+                sum += fbs.highestIndexLEQ2(probe[i]);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        time = (endTime - startTime)*1000/totalIterations;
+        System.out.println("Compact; time=" + time + " microsecs/call\t" + percent.format(time/baseTime-1));
+    }
+
+    /**
+     * Checks highestIndexLEQ against bruteForce for up to 100 pseudo-random data sets,
+     * printing every mismatch and stopping after the first data set that has one.
+     */
+    static void validityTest() {
+        Random random = new Random(123456789L);
+        int[] myData = new int[50];
+        FastBinarySearch fbs = new FastBinarySearch();
+
+        for (int testCount = 0; testCount < 100; ++testCount) {
+
+            // produce test case
+            double ran = random.nextDouble();
+            //System.out.println(ran);
+            int myCount = 2+ (int) (ran * (myData.length - 2));
+            for (int i = 0; i < myCount; ++i) {
+                ran = random.nextDouble();
+                //System.out.println(ran);
+                myData[i] = (int) (ran * myData.length * 3);
+            }
+            System.out.println("Trial " + testCount + ", len: " + myCount);
+            Arrays.sort(myData, 0, myCount);
+            fbs.setData(myData, myCount);
+
+            // compare brute force & fast methods
+            boolean ok = true;
+            for (int i = -1; i < myData.length * 3 + 1; ++i) {
+                int brute = fbs.bruteForce(i);
+                int fast = fbs.highestIndexLEQ(i);
+                if (fast != brute) {
+                    if (ok) {
+                        System.out.println(fbs);
+                    }
+                    System.out.println("Error: probe=" + i + ", brute=" + brute + ", fast=" + fast);
+                    fast = fbs.highestIndexLEQ(i); // do again with debugger
+                    ok = false;
+                }
+            }
+            if (!ok) return;
+        }
+    }
+
+    /**
+     * Set the data to be scanned. It must be in sorted order.
+     *
+     * @param data sorted values; the array is cloned, so later changes by the caller are not seen
+     * @param count number of leading elements of data that take part in the search
+     * @throws IllegalArgumentException if count is negative or larger than data.length
+     */
+    public void setData(int data[], int count) {
+        if (count < 0 || count > data.length) {
+            throw new IllegalArgumentException("count out of range: " + count);
+        }
+        this.data = (int[]) data.clone(); // clone for safety
+        // The cached search parameters depend only on the count, so they stay usable when
+        // the count is unchanged -- but only if validate() has already computed them.
+        isValid = isValid && this.count == count;
+        this.count = count;
+    }
+
+    /**
+     * Basic binary search, used as the baseline in perfTest.
+     * Returns the smallest i such that c < data[i], assuming that data[count - 1]
+     * is a sentinel greater than every c that is searched for. If that does not
+     * hold and c >= data[count - 1], the result is count - 1.
+     */
+    private final int findCodePoint(int c) {
+        if (c < data[0]) return 0;
+        int lo = 0;
+        int hi = count - 1;
+        // invariant: c >= data[lo]
+        // invariant: c < data[hi]
+        for (;;) {
+            int i = (lo + hi) >>> 1;
+            if (i == lo) return hi;
+            if (c < data[i]) {
+                hi = i;
+            } else {
+                lo = i;
+            }
+        }
+    }
+
+    /**
+     * Reference implementation: linear scan down from the top of the data.
+     * @return greatest index whose value is less than or equal to the searchValue.
+     * If there is no such index, then -1 is returned
+     */
+    public int bruteForce(int searchValue) {
+        int i = count;
+        while (--i >= 0 && data[i] > searchValue) {}
+        return i;
+    }
+
+    /**
+     * Completely unrolled binary search.
+     * @return greatest index such that data[index] <= searchValue
+     * If there is no such index (e.g. searchValue < data[0]), then -1 is returned
+     * @throws IllegalArgumentException if fewer than 2 elements have been set
+     */
+    public int highestIndexLEQ(int searchValue) {
+
+        if (!isValid) validate();
+        int temp;
+
+        // set up initial range to search. Each subrange is a power of two in length
+        int high = searchValue < data[topOfLow] ? topOfLow : topOfHigh;
+
+        // Completely unrolled binary search, following "Programming Pearls"
+        // Each case deliberately falls through to the next
+        // Logically, data[-1] < all_search_values && data[count] > all_search_values
+        // although the values -1 and count are never actually touched.
+
+        // The bounds at each point are low & high,
+        // where low == high - delta*2
+        // so high - delta is the midpoint
+
+        // The invariant AFTER each line is that data[low] <= searchValue < data[high],
+        // except that data[topOfHigh] has not been compared yet; the check after the
+        // switch deals with that case.
+
+        switch (power) {
+            // no case 31: validate() never selects a power above 30, and case 30 halves by 2^29
+            case 30: if (searchValue < data[temp = high - 0x20000000]) high = temp;
+            case 29: if (searchValue < data[temp = high - 0x10000000]) high = temp;
+
+            case 28: if (searchValue < data[temp = high - 0x8000000]) high = temp;
+            case 27: if (searchValue < data[temp = high - 0x4000000]) high = temp;
+            case 26: if (searchValue < data[temp = high - 0x2000000]) high = temp;
+            case 25: if (searchValue < data[temp = high - 0x1000000]) high = temp;
+
+            case 24: if (searchValue < data[temp = high - 0x800000]) high = temp;
+            case 23: if (searchValue < data[temp = high - 0x400000]) high = temp;
+            case 22: if (searchValue < data[temp = high - 0x200000]) high = temp;
+            case 21: if (searchValue < data[temp = high - 0x100000]) high = temp;
+
+            case 20: if (searchValue < data[temp = high - 0x80000]) high = temp;
+            case 19: if (searchValue < data[temp = high - 0x40000]) high = temp;
+            case 18: if (searchValue < data[temp = high - 0x20000]) high = temp;
+            case 17: if (searchValue < data[temp = high - 0x10000]) high = temp;
+
+            case 16: if (searchValue < data[temp = high - 0x8000]) high = temp;
+            case 15: if (searchValue < data[temp = high - 0x4000]) high = temp;
+            case 14: if (searchValue < data[temp = high - 0x2000]) high = temp;
+            case 13: if (searchValue < data[temp = high - 0x1000]) high = temp;
+
+            case 12: if (searchValue < data[temp = high - 0x800]) high = temp;
+            case 11: if (searchValue < data[temp = high - 0x400]) high = temp;
+            case 10: if (searchValue < data[temp = high - 0x200]) high = temp;
+            case 9: if (searchValue < data[temp = high - 0x100]) high = temp;
+
+            case 8: if (searchValue < data[temp = high - 0x80]) high = temp;
+            case 7: if (searchValue < data[temp = high - 0x40]) high = temp;
+            case 6: if (searchValue < data[temp = high - 0x20]) high = temp;
+            case 5: if (searchValue < data[temp = high - 0x10]) high = temp;
+
+            case 4: if (searchValue < data[temp = high - 0x8]) high = temp;
+            case 3: if (searchValue < data[temp = high - 0x4]) high = temp;
+            case 2: if (searchValue < data[temp = high - 0x2]) high = temp;
+            case 1: if (searchValue < data[temp = high - 0x1]) high = temp;
+        }
+        if (high == topOfHigh && searchValue >= data[high]) return high;
+        return high-1;
+    }
+
+
+    // NOTE: on some machines the above may not be optimal, if the size of the function
+    // forces code out of the cache. For that case, it would be better to program it as a loop, like the following
+
+    /**
+     * Loop form of highestIndexLEQ, with the same contract.
+     * @return greatest index such that data[index] <= searchValue, or -1 if there is none
+     */
+    public int highestIndexLEQ2(int searchValue) {
+
+        if (!isValid) validate();
+        int temp;
+        int high = searchValue < data[topOfLow] ? topOfLow : topOfHigh;
+        for (int delta = deltaStart; delta != 0; delta >>= 1) {
+            if (searchValue < data[temp = high-delta]) high = temp;
+        }
+        if (high == topOfHigh && searchValue >= data[high]) return high;
+        return high-1;
+    }
+
+    /**
+     * For debugging
+     */
+    public String toString() {
+        String result = "[";
+        for (int j = 0; j < count; ++j) {
+            if (j != 0) result += ", ";
+            result += data[j];
+        }
+        result += "]";
+        result += ", power: " + power;
+        result += ", topOfLow: " + topOfLow;
+        result += ", topOfHigh: " + topOfHigh;
+        return result;
+    }
+
+
+    // ================ Privates ================
+
+    // data
+
+    int data[];
+    int count;
+
+    // validate internal parameters
+
+    private void validate() {
+        if (count <= 1) throw new IllegalArgumentException("Array must have at least 2 elements");
+
+        // find greatest power of 2 less than or equal to count
+        for (power = exp2.length-1; power > 0 && exp2[power] > count; power--) {}
+
+        // determine the starting points
+        topOfLow = exp2[power] - 1;
+        topOfHigh = count - 1;
+        deltaStart = exp2[power-1];
+        isValid = true;
+    }
+
+    private boolean isValid = false;
+    private int topOfLow;
+    private int topOfHigh;
+    private int power;
+    private int deltaStart;
+
+    // powers of two; 0x40000000 (2^30) is the largest one that fits in a signed int
+    private static final int exp2[] = {
+        0x1, 0x2, 0x4, 0x8,
+        0x10, 0x20, 0x40, 0x80,
+        0x100, 0x200, 0x400, 0x800,
+        0x1000, 0x2000, 0x4000, 0x8000,
+        0x10000, 0x20000, 0x40000, 0x80000,
+        0x100000, 0x200000, 0x400000, 0x800000,
+        0x1000000, 0x2000000, 0x4000000, 0x8000000,
+        0x10000000, 0x20000000, 0x40000000
+    };
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/utility/FileLineIterator.java b/tools/unicodetools/com/ibm/text/utility/FileLineIterator.java
new file mode 100644
index 0000000000..fd565682cf
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/utility/FileLineIterator.java
@@ -0,0 +1,118 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FileLineIterator.java,v $
+* $Date: 2002/10/01 01:12:10 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.utility;
+
+import java.util.*;
+import java.text.*;
+import java.io.*;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.Replaceable;
+import com.ibm.icu.text.ReplaceableString;
+import com.ibm.icu.text.UnicodeMatcher;
+
+import com.ibm.text.UCD.*;
+
+/**
+ * Opens a file, and iterates through the lines in the file.
+ * Options allow trimming and comment handling, and splitting
+ */
+public class FileLineIterator {
+    static public final char NOTCHAR = '\uFFFF';
+
+    // public writable
+    public boolean doCounter = true;
+    public int lineLimit = Integer.MAX_VALUE;
+    public char commentChar = '#'; // NOTCHAR if no comments
+    public boolean showFilename = true;
+
+    // public readable
+    public String originalLine = "";
+    public String cleanedLine = "";
+    public int counter = 0;
+
+    private BufferedReader br = null;
+    private boolean isUTF8 = false;
+    // true until the first line of the currently open file has been read,
+    // so that a leading BOM can be found without relying on counter
+    private boolean atFirstLine = true;
+
+    /**
+     * Open the file for reading.
+     *
+     * @param filename passed unchanged to Utility.openReadFile
+     * @param isUTF8 passed to Utility.openReadFile; when true, read() also drops a leading BOM
+     * @throws IOException from resolving the canonical path or from Utility.openReadFile
+     */
+    public void open(String filename, boolean isUTF8) throws IOException {
+        if (showFilename) {
+            Utility.fixDot();
+            System.out.println("Reading File: " + new File(filename).getCanonicalPath());
+        }
+        br = Utility.openReadFile(filename, isUTF8);
+        this.isUTF8 = isUTF8;
+        atFirstLine = true;
+    }
+
+    /**
+     * Fetch a non-zero-length line from the file, stripping comments & using counter, according to settings.
+     * The unmodified text of the line is left in originalLine.
+     *
+     * @return the trimmed line, or null at end of file or once counter has reached lineLimit
+     * @throws IOException if reading from the file fails
+     */
+    public String read() throws IOException {
+        while (true) {
+            if (counter >= lineLimit) return null;
+            cleanedLine = originalLine = br.readLine();
+            if (doCounter) Utility.dot(counter++);
+            if (cleanedLine == null) return null;
+
+            // drop BOM, which can only occur on the first line. Testing counter == 0 here would be
+            // wrong: counter has already been advanced when doCounter is set, and never moves otherwise.
+            boolean firstLine = atFirstLine;
+            atFirstLine = false;
+            if (isUTF8 && firstLine && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
+                cleanedLine = cleanedLine.substring(1);
+            }
+
+            // drop comment
+            if (commentChar != NOTCHAR) {
+                int commentPos = cleanedLine.indexOf(commentChar);
+                if (commentPos >= 0) cleanedLine = cleanedLine.substring(0, commentPos);
+            }
+            cleanedLine = cleanedLine.trim();
+            if (cleanedLine.length() != 0) return cleanedLine;
+        }
+    }
+
+    /**
+     * Reads the next line with read() and splits it with Utility.split.
+     *
+     * @return 0 at end of input, otherwise the value returned by Utility.split
+     */
+    public int readSplit(String[] results, char delimiter) throws IOException {
+        String line = read();
+        if (line == null) return 0;
+        return Utility.split(line, delimiter, results);
+    }
+
+    /**
+     * Closes the underlying reader; only calls Utility.fixDot() if no file was ever opened.
+     */
+    public void close() throws IOException {
+        Utility.fixDot();
+        if (br != null) br.close();
+    }
+}
\ No newline at end of file