392 lines
12 KiB
Java
392 lines
12 KiB
Java
|
package com.ibm.text;
|
||
|
import com.ibm.icu.text.UnicodeSet;
|
||
|
import com.ibm.icu.text.UnicodeSetIterator;
|
||
|
import com.ibm.icu.text.UTF16;
|
||
|
import com.ibm.icu.text.Normalizer;
|
||
|
import com.ibm.icu.lang.UCharacter;
|
||
|
import java.util.BitSet;
|
||
|
import java.util.Set;
|
||
|
import java.util.HashSet;
|
||
|
import java.util.TreeSet;
|
||
|
import java.util.Iterator;
|
||
|
import java.text.NumberFormat;
|
||
|
import com.ibm.text.utility.FastIntBinarySearch;
|
||
|
|
||
|
public class TestICU4J {
|
||
|
public static void main(String[] args) {
|
||
|
String a = UTF16.valueOf(0x10000);
|
||
|
String b = Normalizer.normalize("a\u0308", Normalizer.NFC);
|
||
|
System.out.println(b);
|
||
|
/*
|
||
|
System.out.println(UCharacter.getType(0x10FFFF));
|
||
|
System.out.println(UCharacter.getName(0x61));
|
||
|
*/
|
||
|
testUnicodeSetSpeed(Character.TITLECASE_LETTER, 100);
|
||
|
testUnicodeSetSpeed(Character.UNASSIGNED, 1);
|
||
|
}
|
||
|
|
||
|
static final boolean SHOW_ERRORS = false;
|
||
|
static boolean OPTIMIZATION = true;
|
||
|
|
||
|
static void testUnicodeSetSpeed(int prop, int ITERATIONS) {
|
||
|
NumberFormat numb = NumberFormat.getNumberInstance();
|
||
|
NumberFormat percent = NumberFormat.getPercentInstance();
|
||
|
double start, delta, oldDelta;
|
||
|
int temp = 0;
|
||
|
Set s;
|
||
|
UnicodeSet us;
|
||
|
Iterator it;
|
||
|
UnicodeSetIterator uit;
|
||
|
|
||
|
BitSet bs = new BitSet();
|
||
|
System.out.println();
|
||
|
System.out.println("Getting characters for property " + prop);
|
||
|
int total = 0;
|
||
|
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||
|
if (UCharacter.getType(cp) == prop) {
|
||
|
bs.set(cp);
|
||
|
++total;
|
||
|
}
|
||
|
}
|
||
|
System.out.println("Total characters: " + numb.format(total));
|
||
|
System.out.println("Loop Iterations: " + numb.format(ITERATIONS));
|
||
|
System.out.println();
|
||
|
|
||
|
System.out.println("Testing Add speed");
|
||
|
|
||
|
s = new TreeSet();
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
s.clear();
|
||
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||
|
if (bs.get(cp)) {
|
||
|
s.add(new Integer(cp));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("Set add time: " + numb.format(delta));
|
||
|
System.out.println("Total characters: " + numb.format(s.size()));
|
||
|
|
||
|
us = new UnicodeSet();
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
us.clear();
|
||
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||
|
if (bs.get(cp)) {
|
||
|
optimizedAdd(us,cp);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
optimizedDone(us);
|
||
|
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("UnicodeSet add time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||
|
System.out.println("Total characters: " + numb.format(us.size()) + ", ranges: " + us.getRangeCount());
|
||
|
|
||
|
System.out.println();
|
||
|
System.out.println("Testing Contains speed");
|
||
|
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||
|
if (s.contains(new Integer(cp)) != bs.get(cp)) {
|
||
|
if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("Set contains time: " + numb.format(delta));
|
||
|
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||
|
if (us.contains(cp) != bs.get(cp)) {
|
||
|
if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||
|
|
||
|
setupBinary(us);
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||
|
if (binaryContains(cp) != bs.get(cp)) {
|
||
|
if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("BINARY UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||
|
|
||
|
System.out.println("Testing Iteration speed");
|
||
|
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
it = s.iterator();
|
||
|
while (it.hasNext()) {
|
||
|
temp += ((Integer)it.next()).intValue();
|
||
|
}
|
||
|
}
|
||
|
oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("Set iteration time: " + numb.format(delta));
|
||
|
|
||
|
uit = new UnicodeSetIterator(us);
|
||
|
start = System.currentTimeMillis();
|
||
|
for (int i = 0; i < ITERATIONS; ++i) {
|
||
|
uit.reset();
|
||
|
while (uit.next()) {
|
||
|
temp += uit.codepoint;
|
||
|
}
|
||
|
}
|
||
|
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||
|
System.out.println("UnicodeSet iteration time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||
|
|
||
|
uit.reset();
|
||
|
start = System.currentTimeMillis();
|
||
|
while (uit.nextRange()) {
|
||
|
System.out.println(info(uit.codepoint, uit.codepointEnd));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static FastIntBinarySearch fibs;
|
||
|
|
||
|
static void setupBinary(UnicodeSet us) {
|
||
|
int[] dummySearch = new int[us.getRangeCount()*2];
|
||
|
int dummyLimit = 0;
|
||
|
UnicodeSetIterator uit = new UnicodeSetIterator(us);
|
||
|
while (uit.nextRange()) {
|
||
|
dummySearch[dummyLimit++] = uit.codepoint;
|
||
|
dummySearch[dummyLimit++] = uit.codepointEnd+1;
|
||
|
}
|
||
|
fibs = new FastIntBinarySearch(dummySearch);
|
||
|
}
|
||
|
|
||
|
static boolean binaryContains(int cp) {
|
||
|
return ((fibs.findIndex(cp) & 1) != 0); // return true if odd
|
||
|
}
|
||
|
|
||
|
|
||
|
static String info(int cp) {
|
||
|
return Integer.toString(cp, 16).toUpperCase() + " " + UCharacter.getName(cp);
|
||
|
}
|
||
|
|
||
|
static String info(int cpStart, int cpEnd) {
|
||
|
if (cpStart == cpEnd) {
|
||
|
return Integer.toString(cpStart, 16).toUpperCase()
|
||
|
+ " " + UCharacter.getName(cpStart);
|
||
|
}
|
||
|
return Integer.toString(cpStart, 16).toUpperCase() + ".." + Integer.toString(cpEnd, 16).toUpperCase()
|
||
|
+ " " + UCharacter.getName(cpStart) + ".." + UCharacter.getName(cpEnd);
|
||
|
}
|
||
|
|
||
|
static int first;
|
||
|
static int limit = -2;
|
||
|
|
||
|
static void optimizedAdd(UnicodeSet us, int cp) {
|
||
|
if (!OPTIMIZATION) {
|
||
|
us.add(cp);
|
||
|
return;
|
||
|
}
|
||
|
if (cp == limit) {
|
||
|
++limit;
|
||
|
} else {
|
||
|
if (limit > 0) {
|
||
|
us.add(first, limit - 1);
|
||
|
// System.out.println(info(first, limit-1));
|
||
|
}
|
||
|
first = cp;
|
||
|
limit = cp + 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void optimizedDone(UnicodeSet us) {
|
||
|
if (!OPTIMIZATION) return;
|
||
|
if (limit > 0) {
|
||
|
us.add(first, limit - 1);
|
||
|
//System.out.println(info(first, limit-1));
|
||
|
}
|
||
|
limit = -2; // reset to invalid
|
||
|
}
|
||
|
|
||
|
|
||
|
public static class UXCharacter {
|
||
|
/**
|
||
|
* Provides interface for properties in
|
||
|
* http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
|
||
|
* and their values in
|
||
|
* http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* Tests a particular code point to see if the cited property has the given value.
|
||
|
*
|
||
|
* Sample: the following are equivalent
|
||
|
* <pre>
|
||
|
* if (UCharacter.test("LB", "AL", cp)) ...
|
||
|
* if (UCharacter.test("line break", "alphabetic", cp)) ...
|
||
|
* </pre>
|
||
|
*
|
||
|
*/
|
||
|
public static boolean test(String propertyName, String propertyValue, int codePoint) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Produces a UnicodeSet of code points that have the given propertyvalue for the given property.
|
||
|
* @param set the resulting value. The set is cleared,
|
||
|
* then all the code points with the given <property, value> are added.
|
||
|
*
|
||
|
* Sample: the following are equivalent
|
||
|
* <pre>
|
||
|
* if (UCharacter.test("WSpace", cp)) ...
|
||
|
* if (UCharacter.test("White_Space", cp)) ...
|
||
|
* if (UCharacter.test("White_Space", "true", cp)) ...
|
||
|
* if (!UCharacter.test("White_Space", "false", cp)) ...
|
||
|
* </pre>
|
||
|
*
|
||
|
*/
|
||
|
public static void getSet(String propertyName, String propertyValue, UnicodeSet set) {
|
||
|
// logical implemenation. Real implementation would be way faster!
|
||
|
set.clear();
|
||
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||
|
if (test(propertyName, propertyValue, cp)) set.add(cp);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// ======================================================
|
||
|
// POSSIBLE ADDITIONAL UTILITIES FOR CONVENIENCE OR SPEED
|
||
|
// ======================================================
|
||
|
|
||
|
/**
|
||
|
* Tests a particular code point to see if the cited boolean property is true.
|
||
|
* @param propertyName the cited property
|
||
|
* @param codePoint the particular code point
|
||
|
* @return true if the cited property has the given value for the specified code point.
|
||
|
*
|
||
|
* Sample: the following are equivalent
|
||
|
* <pre>
|
||
|
* if (UCharacter.test("WSpace", cp)) ...
|
||
|
* if (UCharacter.test("White_Space", cp)) ...
|
||
|
* if (UCharacter.test("White_Space", "true", cp)) ...
|
||
|
* if (!UCharacter.test("White_Space", "false", cp)) ...
|
||
|
* </pre>
|
||
|
*
|
||
|
*/
|
||
|
public static boolean test(String booleanPropertyName, int codePoint) {
|
||
|
return test(booleanPropertyName, "true", codePoint);
|
||
|
}
|
||
|
|
||
|
// ===============================================
|
||
|
// The following allow access to properties by number, saving a string lookup
|
||
|
// on each call.
|
||
|
// ===============================================
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Gets an index for higher-speed access to properties.
|
||
|
*
|
||
|
* Sample:
|
||
|
* <pre>
|
||
|
* int prop = UCharacter.getPropertyIndexIndex("LB");
|
||
|
* int value = UCharacter.getValueIndex("LB", "AL");
|
||
|
* while (true) {
|
||
|
* ...
|
||
|
* if (test(prop, value, codePoint)) ...
|
||
|
* </pre>
|
||
|
*
|
||
|
*/
|
||
|
public static int getPropertyIndex(String propertyName) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Gets maximum property index, used for iterating through properties
|
||
|
*
|
||
|
*/
|
||
|
public static int getMaxPropertyIndex() {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
|
||
|
static final byte // NAME_STYLE
|
||
|
SHORT = 0,
|
||
|
DEFAULT = 1,
|
||
|
LONG = 2;
|
||
|
|
||
|
/**
|
||
|
* Gets property name
|
||
|
*
|
||
|
*/
|
||
|
public static String getPropertyName(int propertyIndex, byte namestyle) {
|
||
|
return "";
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Tests a particular code point to see if the cited property has the given value.
|
||
|
*/
|
||
|
public static boolean test(int propertyIndex, String propertyValue, int codePoint) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Produces a UnicodeSet of code points that have the given propertyvalue for the given property.
|
||
|
*/
|
||
|
public static void getSet(int propertyIndex, String propertyValue, UnicodeSet set) {
|
||
|
}
|
||
|
|
||
|
// ===============================================
|
||
|
// The following allow access to enumerated property values by number,
|
||
|
// saving a string lookup on each call.
|
||
|
// They are only valid for enumerated properties
|
||
|
// including the combining character class (0..255).
|
||
|
// ===============================================
|
||
|
|
||
|
/**
|
||
|
* Gets an index for higher-speed access to property values.
|
||
|
* Only valid for enumerated properties.
|
||
|
*/
|
||
|
public static int getValueIndex(String propertyName, String propertyValue) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Gets maximum value index for a given property, used for iterating through property values.
|
||
|
* Only valid for enumerated properties.
|
||
|
*
|
||
|
*/
|
||
|
public static int getMaxValueIndex(int propertyIndex) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Gets property value, corresponding to one of the values passed in
|
||
|
*
|
||
|
*/
|
||
|
public static String getValueName(int propertyIndex, int valueIndex, byte namestyle) {
|
||
|
return "";
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Tests a particular code point to see if the cited property has the given value.
|
||
|
*/
|
||
|
public static boolean test(int propertyIndex, int valueIndex, int codePoint) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Produces a UnicodeSet of code points that have the given propertyvalue for the given property.
|
||
|
*/
|
||
|
public static void getSet(int propertyIndex, int valueIndex, UnicodeSet set) {
|
||
|
}
|
||
|
|
||
|
|
||
|
/* OPEN ISSUES:
|
||
|
- Don't like the names of the functions. Any better options? test => hasValue? hasPropertyValue?
|
||
|
- Should getSet really ADD to the set (avoiding the clear?) and be called addProperties?
|
||
|
Maybe faster sometimes, but might also be more errorprone.
|
||
|
*/
|
||
|
|
||
|
}
|
||
|
}
|