ICU-12450 move com.ibm.icu.dev.util.BagFormatter, CaseIterator, FileUtilities, ICUPropertyFactory, TransliteratorUtilities, UnicodeProperty, UnicodePropertySymbolTable to org.unicode.cldr.util

X-SVN-Rev: 38623
This commit is contained in:
Markus Scherer 2016-04-16 00:09:04 +00:00
parent 9d12e081bc
commit 0ba7b2e17e
11 changed files with 274 additions and 5084 deletions

View File

@ -1,119 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2011-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.translit;
import java.util.List;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.ICUPropertyFactory;
import com.ibm.icu.dev.util.UnicodeProperty;
import com.ibm.icu.dev.util.UnicodeProperty.Factory;
import com.ibm.icu.dev.util.UnicodePropertySymbolTable;
import com.ibm.icu.text.UnicodeSet;
/**
* @author markdavis
*
*/
public class TestUnicodeProperty extends TestFmwk{
public static void main(String[] args) {
new TestUnicodeProperty().run(args);
}
static final UnicodeSet casedLetter = new UnicodeSet("[:gc=cased letter:]");
static final UnicodeSet letter = new UnicodeSet("[:gc=L:]");
public void TestBasic() {
Factory factory = ICUPropertyFactory.make();
UnicodeProperty property = factory.getProperty("gc");
List values = property.getAvailableValues();
assertTrue("Values contain GC values", values.contains("Unassigned"));
final UnicodeSet lu = property.getSet("Lu");
if (!assertTrue("Gc=L contains 'A'", lu.contains('A'))) {
errln("Contents:\t" + lu.complement().complement().toPattern(false));
}
}
public void TestSymbolTable() {
Factory factory = ICUPropertyFactory.make();
UnicodePropertySymbolTable upst = new UnicodePropertySymbolTable(factory);
UnicodeSet.setDefaultXSymbolTable(upst);
try {
final UnicodeSet luSet = new UnicodeSet("[:gc=L:]");
assertTrue("Gc=L contains 'A'", luSet.contains('A'));
assertTrue("Gc=L contains 'Z'", luSet.contains('Z'));
assertFalse("Gc=L contains 'a'", luSet.contains('1'));
UnicodeSet casedLetter2 = new UnicodeSet("[:gc=cased letter:]");
assertEquals("gc=lc are equal", casedLetter, casedLetter2);
} finally {
// restore the world
UnicodeSet.setDefaultXSymbolTable(null);
}
}
public void TestSymbolTable2() {
Factory factory = new MyUnicodePropertyFactory();
UnicodePropertySymbolTable upst = new UnicodePropertySymbolTable(factory);
UnicodeSet.setDefaultXSymbolTable(upst);
try {
final UnicodeSet luSet = new UnicodeSet("[:gc=L:]");
assertFalse("Gc=L contains 'A'", luSet.contains('A'));
if (!assertTrue("Gc=L contains 'Z'", luSet.contains('Z'))) {
errln("Contents:\t" + luSet.complement().complement().toPattern(false));
}
assertFalse("Gc=L contains 'a'", luSet.contains('1'));
UnicodeSet casedLetter2 = new UnicodeSet("[:gc=cased letter:]");
assertNotEquals("gc=lc should not be equal", casedLetter, casedLetter2);
} finally {
// restore the world
UnicodeSet.setDefaultXSymbolTable(null);
}
}
/**
* For testing, override to set A-M to Cn.
*/
static class MyUnicodeGCProperty extends UnicodeProperty.SimpleProperty {
UnicodeProperty icuProperty = ICUPropertyFactory.make().getProperty("Gc");
{
setName(icuProperty.getName());
setType(icuProperty.getType());
}
@Override
protected String _getValue(int codepoint) {
if (codepoint >= 'A' && codepoint <= 'M') {
return "Unassigned";
} else {
return icuProperty.getValue(codepoint);
}
}
@Override
protected List _getValueAliases(String valueAlias, List result) {
return icuProperty.getValueAliases(valueAlias, result);
}
@Override
public List _getNameAliases(List result) {
return icuProperty.getNameAliases();
}
}
/**
* For testing, override to set A-Z to Cn.
*/
static class MyUnicodePropertyFactory extends ICUPropertyFactory {
private MyUnicodePropertyFactory() {
add(new MyUnicodeGCProperty());
}
}
static class MyUnicodePropertySymbolTable extends UnicodePropertySymbolTable {
public MyUnicodePropertySymbolTable(Factory factory) {
super(factory);
}
}
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
* Copyright (C) 1996-2015, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.translit;
@ -10,17 +10,26 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.TestBoilerplate;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.dev.util.UnicodeMapIterator;
import com.ibm.icu.dev.util.UnicodeMap.EntryRange;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
@ -294,4 +303,265 @@ public class UnicodeMapTest extends TestFmwk {
assertNull("original-def", test.get("def"));
assertEquals("copy-def", (Integer) 4, copy.get("def"));
}
// Exclusive upper bound of the code points used by TestUnicodeMapRandom and compared by check().
private static final int LIMIT = 0x15; // limit to make testing more realistic in terms of collisions
// Number of random puts performed by TestUnicodeMapRandom.
private static final int ITERATIONS = 1000000;
// When true, TestUnicodeMapRandom logs every put.
private static final boolean SHOW_PROGRESS = false;
// When true, TestUnicodeMapRandom prints a marker line at iteration 144.
private static final boolean DEBUG = false;
// Sorted record of the "hex<TAB>value" lines for the puts made by TestUnicodeMapRandom; check() dumps it on a mismatch.
SortedSet<String> log = new TreeSet<String>();
// Pool of values that the random puts choose from.
static String[] TEST_VALUES = {"A", "B", "C", "D", "E", "F"};
// Shared generator; TestUnicodeMapRandom re-seeds it with 12345 before use.
static Random random = new Random(12345);
/**
 * Applies ITERATIONS random single-code-point puts to a UnicodeMap and to a HashMap in
 * lockstep, comparing the two after every put and comparing the iterator output at the end.
 */
public void TestUnicodeMapRandom() {
    random.setSeed(12345); // fixed seed: reproducible results
    logln("Comparing against HashMap");
    final UnicodeMap<String> unicodeMap = new UnicodeMap<String>();
    final Map<Integer, String> reference = new HashMap<Integer, String>();
    int counter = 0;
    while (counter < ITERATIONS) {
        final int codePoint = random.nextInt(LIMIT);
        final String value = TEST_VALUES[random.nextInt(TEST_VALUES.length)];
        final String logline = Utility.hex(codePoint) + "\t" + value;
        if (SHOW_PROGRESS) {
            logln(counter + "\t" + logline);
        }
        log.add(logline);
        if (DEBUG && counter == 144) {
            System.out.println(" debug");
        }
        // Same change to both maps, then compare.
        unicodeMap.put(codePoint, value);
        reference.put(codePoint, value);
        check(unicodeMap, reference, counter);
        ++counter;
    }
    checkNext(unicodeMap, reference, LIMIT);
}
// Highest code point visited by TestUnicodeMapGeneralCategory (its loop bound is inclusive).
private static final int SET_LIMIT = 0x10FFFF;
// Property whose value names TestUnicodeMapGeneralCategory stores in the maps.
private static final int propEnum = UProperty.GENERAL_CATEGORY;
/**
 * Fills a UnicodeMap and a TreeMap with the long General_Category name of every code
 * point up to SET_LIMIT, then compares them through the iterators, through check(),
 * through their value sets, and through the key set of each value.
 *
 * @throws IllegalArgumentException ("Halting") as soon as a value set or key set differs
 */
public void TestUnicodeMapGeneralCategory() {
    logln("Setting General Category");
    // The maps are created once; the earlier version built a raw UnicodeMap and a
    // HashMap first and discarded both immediately.
    UnicodeMap<String> map1 = new UnicodeMap<String>();
    Map<Integer, String> map2 = new TreeMap<Integer,String>();

    for (int cp = 0; cp <= SET_LIMIT; ++cp) {
        int enumValue = UCharacter.getIntPropertyValue(cp, propEnum);
        //if (enumValue <= 0) continue; // for smaller set
        String value = UCharacter.getPropertyValueName(propEnum,enumValue, UProperty.NameChoice.LONG);
        map1.put(cp, value);
        map2.put(cp, value);
    }
    checkNext(map1, map2, Integer.MAX_VALUE);

    logln("Comparing General Category");
    check(map1, map2, -1);

    logln("Comparing Values");
    Set<String> values1 = map1.getAvailableValues(new TreeSet<String>());
    Set<String> values2 = new TreeSet<String>(map2.values());
    if (!TestBoilerplate.verifySetsIdentical(this, values1, values2)) {
        throw new IllegalArgumentException("Halting");
    }

    logln("Comparing Sets");
    for (String value : values1) {
        logln(value == null ? "null" : value);
        UnicodeSet set1 = map1.keySet(value);
        UnicodeSet set2 = TestBoilerplate.getSet(map2, value);
        if (!TestBoilerplate.verifySetsIdentical(this, set1, set2)) {
            throw new IllegalArgumentException("Halting");
        }
    }
}
/**
 * Runs the boilerplate checks, passing "TestMain" plus "-verbose" when this test is
 * itself running verbosely.
 */
public void testBoilerplate() {
    List<String> argList = new ArrayList<String>();
    argList.add("TestMain");
    if (params.verbose) {
        argList.add("-verbose");
    }
    String[] args = argList.toArray(new String[argList.size()]);
    new UnicodeMapBoilerplate().run(args);
    // TODO: the following is not being reached
    new UnicodeSetBoilerplate().run(args);
}
/**
 * Smoke test: hashCode() and stringKeys() on a freshly created map must not throw.
 * The results themselves are deliberately unused.
 */
public void TestAUnicodeMap2() {
    UnicodeMap emptyMap = new UnicodeMap();
    @SuppressWarnings("unused")
    int ignoredHash = emptyMap.hashCode(); // must not NPE
    @SuppressWarnings("unused")
    Set ignoredKeys = emptyMap.stringKeys(); // must not NPE
}
/**
 * Round-trips a map with code point and string keys through addInverseTo() and
 * putAllInverse(), and expects the result to equal the original map.
 */
public void TestAUnicodeMapInverse() {
    UnicodeMap<Character> original = new UnicodeMap<Character>()
            .putAll('a', 'z', 'b')
            .put("ab", 'c')
            .put('x', 'b')
            .put("xy", 'c');

    Map<Character, UnicodeSet> inverse = new HashMap<Character, UnicodeSet>();
    original.addInverseTo(inverse);

    UnicodeMap<Character> roundTripped = new UnicodeMap().putAllInverse(inverse);
    assertEquals("", original, roundTripped);
}
/**
 * Rebuilds a code point to value map from a UnicodeMapIterator over {@code map1}, once
 * with nextRange() and once with next(), and compares each rebuilt map with
 * {@code map2} via checkMap().
 *
 * @param map1 UnicodeMap to iterate
 * @param map2 reference map
 * @param limit not used by the current implementation
 */
private void checkNext(UnicodeMap<String> map1, Map<Integer,String> map2, int limit) {
    UnicodeMapIterator<String> mi = new UnicodeMapIterator<String>(map1);

    // Pass 1: range iteration, expanded to single code points.
    logln("Comparing nextRange");
    Map byRange = new TreeMap();
    while (mi.nextRange()) {
        logln(Utility.hex(mi.codepoint) + ".." + Utility.hex(mi.codepointEnd) + " => " + mi.value);
        int cp = mi.codepoint;
        while (cp <= mi.codepointEnd) {
            byRange.put(cp, mi.value);
            ++cp;
        }
    }
    checkMap(map2, byRange);

    // Pass 2: element-by-element iteration over the same iterator after a reset.
    logln("Comparing next");
    mi.reset();
    Map byElement = new TreeMap();
    while (mi.next()) {
        byElement.put(mi.codepoint, mi.value);
    }
    checkMap(map2, byElement);
}
/**
 * Compares the value of every code point below LIMIT in both maps and, for each
 * mismatch, reports the difference followed by dumps of the UnicodeMap, the put log
 * and the reference map.
 *
 * @param map1 UnicodeMap under test
 * @param map2 reference map
 * @param counter caller-supplied number, used only to label the error output
 */
public void check(UnicodeMap<String> map1, Map<Integer,String> map2, int counter) {
    for (int cp = 0; cp < LIMIT; ++cp) {
        final String fromUnicodeMap = map1.getValue(cp);
        final String fromReference = map2.get(cp);
        if (UnicodeMap.areEqual(fromUnicodeMap, fromReference)) {
            continue;
        }
        errln(counter + " Difference at " + Utility.hex(cp)
                + "\t UnicodeMap: " + fromUnicodeMap
                + "\t HashMap: " + fromReference);
        errln("UnicodeMap: " + map1);
        errln("Log: " + TestBoilerplate.show(log));
        errln("HashMap: " + TestBoilerplate.show(map2));
    }
}
/**
 * Reports an error listing the entries that differ between the two maps; does nothing
 * when the maps are equal. Each direction of the difference is produced by
 * getEntries() with a limit of 20.
 */
void checkMap(Map m1, Map m2) {
    if (!m1.equals(m2)) {
        StringBuilder report = new StringBuilder();
        Set firstEntries = m1.entrySet();
        Set secondEntries = m2.entrySet();
        getEntries("\r\nIn First, and not Second", firstEntries, secondEntries, report, 20);
        getEntries("\r\nIn Second, and not First", secondEntries, firstEntries, report, 20);
        errln(report.toString());
    }
}
/**
 * Orders map entries by key, then by value. A null entry, key or value sorts before
 * any non-null one.
 */
static Comparator<Map.Entry<Integer, String>> ENTRY_COMPARATOR = new Comparator<Map.Entry<Integer, String>>() {
    public int compare(Map.Entry<Integer, String> o1, Map.Entry<Integer, String> o2) {
        if (o1 == o2) {
            return 0;
        } else if (o1 == null) {
            return -1;
        } else if (o2 == null) {
            return 1;
        }
        final int byKey = compare2(o1.getKey(), o2.getKey());
        return byKey != 0 ? byKey : compare2(o1.getValue(), o2.getValue());
    }

    /** Null-tolerant compareTo: identical references are equal, null sorts first. */
    private <T extends Comparable> int compare2(T o1, T o2) {
        if (o1 == o2) {
            return 0;
        } else if (o1 == null) {
            return -1;
        } else if (o2 == null) {
            return 1;
        }
        return o1.compareTo(o2);
    }
};
/**
 * Appends to {@code buffer} a titled listing of the entries that are in
 * {@code m1entries} but not in {@code m2entries}, sorted with ENTRY_COMPARATOR.
 * The header line carries the full count of such entries; at most {@code limit}
 * of them are then written, one "key => value" line each.
 *
 * @param title text written before the count
 * @param m1entries entries to list
 * @param m2entries entries to exclude
 * @param buffer receives the output
 * @param limit maximum number of entries to write
 */
private void getEntries(String title, Set<Map.Entry<Integer,String>> m1entries, Set<Map.Entry<Integer, String>> m2entries, StringBuilder buffer, int limit) {
    Set<Map.Entry<Integer, String>> m1_m2 = new TreeSet<Map.Entry<Integer, String>>(ENTRY_COMPARATOR);
    m1_m2.addAll(m1entries);
    m1_m2.removeAll(m2entries);
    buffer.append(title).append(": ").append(m1_m2.size()).append("\r\n");
    int remaining = limit;
    for (Map.Entry<Integer, String> entry : m1_m2) {
        // Stop after 'limit' entries; the former 'limit-- < 0' test let limit + 1 through.
        if (remaining-- <= 0) {
            return;
        }
        buffer.append(entry.getKey()).append(" => ")
                .append(entry.getValue()).append("\r\n");
    }
}
/** Boilerplate driver whose test objects are small, randomly filled UnicodeMaps. */
static class UnicodeMapBoilerplate extends TestBoilerplate {

    /*
     * @see com.ibm.icu.dev.test.TestBoilerplate#_hasSameBehavior(java.lang.Object, java.lang.Object)
     */
    protected boolean _hasSameBehavior(Object a, Object b) {
        // we are pretty confident in the equals method, so won't bother with this right now.
        return true;
    }

    /*
     * Adds one UnicodeMap built from 50 random puts on code points below 25; declines
     * (returns false) once the list holds more than 30 objects.
     */
    protected boolean _addTestObject(List list) {
        if (list.size() <= 30) {
            UnicodeMap map = new UnicodeMap();
            int remaining = 50;
            while (remaining-- > 0) {
                int codePoint = random.nextInt(25);
                String value = TEST_VALUES[random.nextInt(TEST_VALUES.length)];
                map.put(codePoint, value);
            }
            list.add(map);
            return true;
        }
        return false;
    }
}
/** Boilerplate driver whose test objects are random 10-character strings. */
static class StringBoilerplate extends TestBoilerplate {

    /*
     * @see com.ibm.icu.dev.test.TestBoilerplate#_hasSameBehavior(java.lang.Object, java.lang.Object)
     */
    protected boolean _hasSameBehavior(Object a, Object b) {
        // we are pretty confident in the equals method, so won't bother with this right now.
        return true;
    }

    /*
     * Adds one string of 10 random chars, each below 0xFF; declines (returns false)
     * once the list holds more than 31 objects.
     */
    protected boolean _addTestObject(List list) {
        if (list.size() <= 31) {
            char[] chars = new char[10];
            for (int i = 0; i < chars.length; ++i) {
                chars[i] = (char) random.nextInt(0xFF);
            }
            list.add(new String(chars));
            return true;
        }
        return false;
    }
}
/** Boilerplate driver whose test objects are derived from random UnicodeSets. */
static class UnicodeSetBoilerplate extends TestBoilerplate {

    /*
     * @see com.ibm.icu.dev.test.TestBoilerplate#_hasSameBehavior(java.lang.Object, java.lang.Object)
     */
    protected boolean _hasSameBehavior(Object a, Object b) {
        // we are pretty confident in the equals method, so won't bother with this right now.
        return true;
    }

    /*
     * Builds a UnicodeSet from 50 random code points below 100 and adds its string
     * form; declines (returns false) once the list holds more than 32 objects.
     */
    protected boolean _addTestObject(List list) {
        if (list.size() <= 32) {
            UnicodeSet set = new UnicodeSet();
            int remaining = 50;
            while (remaining-- > 0) {
                set.add(random.nextInt(100));
            }
            // NOTE(review): the object added is set.toString(), not the set itself;
            // confirm whether the UnicodeSet was meant to be the test object.
            list.add(set.toString());
            return true;
        }
        return false;
    }
}
}

View File

@ -1,248 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2002-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.util;
// TODO integrate this into the test framework
import java.io.IOException;
import java.io.PrintWriter;
import java.text.Collator;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.dev.util.BagFormatter;
import com.ibm.icu.dev.util.FileUtilities;
import com.ibm.icu.dev.util.ICUPropertyFactory;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.dev.util.UnicodeProperty;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
// TODO change to use test framework
/**
 * Console driver that exercises {@link BagFormatter} and {@link UnicodeProperty}:
 * it prints property and value aliases and the names of the characters in a few
 * sets. It is started through {@link #main(String[])} and is not hooked into the
 * test framework.
 */
public class TestBagFormatter {

    /** Prints the property aliases of the stock ICU property factory. */
    static final void generatePropertyAliases(boolean showValues) {
        generatePropertyAliases(showValues, ICUPropertyFactory.make());
    }

    /**
     * Prints, grouped by property type, each property's name and name aliases.
     *
     * @param showValues when true, also print each property's values: the UnicodeMap
     *        sorted with NumberComparator for numeric types, the value aliases otherwise
     * @param ups factory that supplies the properties
     */
    static final void generatePropertyAliases(boolean showValues, UnicodeProperty.Factory ups) {
        Collator order = Collator.getInstance(Locale.ENGLISH);
        TreeSet props = new TreeSet(order);
        TreeSet values = new TreeSet(order);
        BagFormatter bf = new BagFormatter();
        props.addAll(ups.getAvailableNames());
        for (int i = UnicodeProperty.BINARY; i < UnicodeProperty.LIMIT_TYPE; ++i) {
            System.out.println(UnicodeProperty.getTypeName(i));
            Iterator it = props.iterator();
            while (it.hasNext()) {
                String propAlias = (String)it.next();
                UnicodeProperty up = ups.getProperty(propAlias);
                int type = up.getType();
                if (type != i) continue;
                System.out.println();
                System.out.println(propAlias + "\t" + bf.join(up.getNameAliases()));
                if (!showValues) continue;
                if (type == UnicodeProperty.NUMERIC || type == UnicodeProperty.EXTENDED_NUMERIC) {
                    UnicodeMap um = new UnicodeMap();
                    um.putAll(up.getUnicodeMap());
                    System.out.println(um.toString(new NumberComparator()));
                    continue;
                }
                // 'values' is reused across properties; one clear right before refilling
                // is enough (a second, redundant clear used to precede the numeric branch).
                values.clear();
                values.addAll(up.getAvailableValues());
                Iterator it2 = values.iterator();
                while (it2.hasNext()) {
                    String valueAlias = (String)it2.next();
                    System.out.println("\t" + bf.join(valueAlias + "\t" + up.getValueAliases(valueAlias)));
                }
            }
        }
    }

    /**
     * Compares two strings by the double value they parse to. A null argument sorts
     * after any non-null one.
     */
    static class NumberComparator implements Comparator {
        public int compare(Object o1, Object o2) {
            if (o1 == o2) return 0;
            if (o1 == null) return 1;
            if (o2 == null) return -1;
            double n1 = Double.parseDouble((String)o1);
            double n2 = Double.parseDouble((String)o2);
            return n1 < n2 ? -1 : n1 > n2 ? 1 : 0;
        }
    }

    /**
     * Prints the property aliases, then the character names of several sets, framed
     * by "Start" and "End" lines.
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Start");
        try {
            //readCharacters();
            UnicodeProperty prop = ICUPropertyFactory.make().getProperty("Canonicalcombiningclass");
            prop.getAvailableValues();

            generatePropertyAliases(true);

            BagFormatter bf = new BagFormatter();

            UnicodeSet us = new UnicodeSet("[:gc=nd:]");
            FileUtilities.CONSOLE.println("[:gc=nd:]");
            bf.showSetNames(FileUtilities.CONSOLE,us);

            us = new UnicodeSet("[:numeric_value=2:]");
            FileUtilities.CONSOLE.println("[:numeric_value=2:]");
            bf.showSetNames(FileUtilities.CONSOLE,us);

            us = new UnicodeSet("[:numeric_type=numeric:]");
            FileUtilities.CONSOLE.println("[:numeric_type=numeric:]");
            bf.showSetNames(FileUtilities.CONSOLE,us);

            UnicodeProperty.Factory ups = ICUPropertyFactory.make();
            us = ups.getSet("gc=mn", null, null);
            FileUtilities.CONSOLE.println("gc=mn");
            bf.showSetNames(FileUtilities.CONSOLE, us);

            // Everything below this return is commented-out experimentation.
            if (true) return;
            //showNames("Name", ".*MARK.*");
            //showNames("NFD", "a.+");
            //showNames("NFD", false);
            //showNames("Lowercase_Mapping", false);
            //TestUnicodePropertySource.test(true);
            //showNames(".*\\ \\-.*");
            //checkHTML();
            //testIsRTL();
            //TestTokenizer.test();
            //RandomCollator.generate("collationTest.txt", null);
            //TestPick.test();
            //printRandoms();
            //if (true) return;
            //testLocales();
            //if (true) return;
            /*
            TestCollator tc = new TestCollator();
            tc.test(RuleBasedCollator.getInstance(),1000);
            */
            /*
            StringBuffer sb = new StringBuffer();
            for (int i = 0; i < 100; ++i) {
            sb.setLength(0);
            rc.nextRule(sb);
            System.out.println(sb);
            }
            */
        } finally {
            System.out.println("End");
        }
    }

    /**
     * Writes one tab-separated line (display country, country, display language,
     * language) per Collator locale that has a country, in default-collator order,
     * to "countries.txt".
     *
     * @throws IOException if the output file cannot be opened
     */
    static void testLocales() throws IOException {
        Locale[] locales = Collator.getAvailableLocales();
        Set s = new TreeSet(Collator.getInstance());
        for (int i = 0; i < locales.length; ++i) {
            String lang = locales[i].getLanguage();
            String dlang = locales[i].getDisplayLanguage();
            String country = locales[i].getCountry();
            String dcountry = locales[i].getDisplayCountry();
            if (country.equals("")) continue;
            s.add(""
                + "\t" + dcountry
                + "\t" + country
                + "\t" + dlang
                + "\t" + lang
                );
        }
        //CollectionFormatter cf = new CollectionFormatter();
        PrintWriter pw = FileUtilities.openUTF8Writer("", "countries.txt");
        try {
            Iterator it = s.iterator();
            while (it.hasNext()) {
                pw.println(it.next());
            }
        } finally {
            // Close the writer even if the loop above throws.
            pw.close();
        }
    }

    /*
     * Use the number of significant digits to get a rounding value.
     */
    /* static final double LOG10 = Math.log(10);
    public static void useSignificantDigits(double value, int digits) {
    double log10 = Math.log(value)/LOG10; // log[e]
    }*/

    /** Letters whose bidi class is R or AL. */
    static final UnicodeSet RTL = new UnicodeSet("[[:L:]&[[:bidi class=R:][:bidi class=AL:]]]");

    /**
     * Returns whether the first script reported for the locale (Latin when none is
     * reported) has at least one character in {@link #RTL}.
     */
    static boolean isRTL(Locale loc) {
        // in 2.8 we can use the exemplar characters, but for 2.6 we have to work around it
        int[] scripts = UScript.getCode(loc);
        return new UnicodeSet()
            .applyIntPropertyValue(UProperty.SCRIPT, scripts == null ? UScript.LATIN : scripts[0])
            .retainAll(RTL).size() != 0;
    }

    /** Prints every available locale's display name prefixed with "R " or "L ", sorted. */
    static void testIsRTL() {
        Locale[] locales = Locale.getAvailableLocales();
        Set s = new TreeSet();
        for (int i = 0; i < locales.length; ++i) {
            s.add((isRTL(locales[i]) ? "R " : "L ") + locales[i].getDisplayName());
        }
        Iterator it = s.iterator();
        while (it.hasNext()) {
            System.out.println(it.next());
        }
    }

    /** Forward rules that replace the characters &lt; &amp; &gt; and " by HTML entities. */
    static final Transliterator toHTML = Transliterator.createFromRules(
        "any-html",
        "'<' > '&lt;' ;" +
        "'&' > '&amp;' ;" +
        "'>' > '&gt;' ;" +
        "'\"' > '&quot;' ; ",
        Transliterator.FORWARD);

    /** Reverse rules that turn the four entities, in either letter case, back into characters. */
    static final Transliterator fromHTML = Transliterator.createFromRules(
        "html-any",
        "'<' < '&'[lL][Tt]';' ;" +
        "'&' < '&'[aA][mM][pP]';' ;" +
        "'>' < '&'[gG][tT]';' ;" +
        "'\"' < '&'[qQ][uU][oO][tT]';' ; ",
        Transliterator.REVERSE);

    /** Prints a sample string, its toHTML form, and that form converted back by fromHTML. */
    static void checkHTML() {
        String foo = "& n < b < \"ab\"";
        String fii = toHTML.transliterate(foo);
        System.out.println("in: " + foo);
        System.out.println("out: " + fii);
        System.out.println("in*: " + fromHTML.transliterate(fii));
        System.out.println("IN*: " + fromHTML.transliterate(fii.toUpperCase()));
    }
    /*
    static void showNames(String propAlias, boolean matches) {
    BagFormatter bf = new BagFormatter();
    UnicodeSet stuff;
    stuff = new UnicodePropertySource.ICU()
    .setPropertyAlias(propAlias)
    .getPropertySet(matches, null);
    System.out.println(bf.showSetNames(propAlias + " with " + matches, stuff));
    }
    static void showNames(String propAlias, String pattern) {
    BagFormatter bf = new BagFormatter();
    UnicodeSet stuff;
    stuff = new UnicodePropertySource.ICU()
    .setPropertyAlias(propAlias)
    .getPropertySet(Pattern.compile(pattern).matcher(""), null);
    System.out.println(bf.showSetNames(propAlias + "with " + pattern, stuff));
    }
    */
}

View File

@ -6,176 +6,19 @@
*/
package com.ibm.icu.dev.test.util;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.TestBoilerplate;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.dev.util.ICUPropertyFactory;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.dev.util.UnicodeMapIterator;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UnicodeSet;
public class TestUtilities extends TestFmwk {
// Exclusive upper bound of the code points used by TestUnicodeMapRandom and compared by check().
static final int LIMIT = 0x15; // limit to make testing more realistic in terms of collisions
// Number of random puts performed by TestUnicodeMapRandom.
static final int ITERATIONS = 1000000;
// When true, TestUnicodeMapRandom logs every put.
static final boolean SHOW_PROGRESS = false;
// When true, TestUnicodeMapRandom prints a marker line at iteration 144.
static final boolean DEBUG = false;
/** Command-line entry point: runs this test class with the given arguments. */
public static void main(String[] args) throws Exception {
    final TestUtilities test = new TestUtilities();
    test.run(args);
}
// Sorted record of the "hex<TAB>value" lines for the puts made by TestUnicodeMapRandom; check() dumps it on a mismatch.
SortedSet<String> log = new TreeSet<String>();
// Pool of values that the random puts choose from.
static String[] TEST_VALUES = {"A", "B", "C", "D", "E", "F"};
// Shared generator; TestUnicodeMapRandom re-seeds it with 12345 before use.
static Random random = new Random(12345);
/**
 * Applies ITERATIONS random single-code-point puts to a UnicodeMap and to a HashMap in
 * lockstep, comparing the two after every put and comparing the iterator output at the end.
 */
public void TestUnicodeMapRandom() {
    random.setSeed(12345); // fixed seed: reproducible results
    logln("Comparing against HashMap");
    final UnicodeMap<String> unicodeMap = new UnicodeMap<String>();
    final Map<Integer, String> reference = new HashMap<Integer, String>();
    int counter = 0;
    while (counter < ITERATIONS) {
        final int codePoint = random.nextInt(LIMIT);
        final String value = TEST_VALUES[random.nextInt(TEST_VALUES.length)];
        final String logline = Utility.hex(codePoint) + "\t" + value;
        if (SHOW_PROGRESS) {
            logln(counter + "\t" + logline);
        }
        log.add(logline);
        if (DEBUG && counter == 144) {
            System.out.println(" debug");
        }
        // Same change to both maps, then compare.
        unicodeMap.put(codePoint, value);
        reference.put(codePoint, value);
        check(unicodeMap, reference, counter);
        ++counter;
    }
    checkNext(unicodeMap, reference, LIMIT);
}
/**
 * Fills a UnicodeMap and a TreeMap with the long General_Category name of every code
 * point up to SET_LIMIT, then compares them through the iterators, through check(),
 * through their value sets, and through the key set of each value.
 *
 * @throws IllegalArgumentException ("Halting") as soon as a value set or key set differs
 */
public void TestUnicodeMapGeneralCategory() {
    logln("Setting General Category");
    // The maps are created once; the earlier version built a raw UnicodeMap and a
    // HashMap first and discarded both immediately.
    UnicodeMap<String> map1 = new UnicodeMap<String>();
    Map<Integer, String> map2 = new TreeMap<Integer,String>();

    for (int cp = 0; cp <= SET_LIMIT; ++cp) {
        int enumValue = UCharacter.getIntPropertyValue(cp, propEnum);
        //if (enumValue <= 0) continue; // for smaller set
        String value = UCharacter.getPropertyValueName(propEnum,enumValue, UProperty.NameChoice.LONG);
        map1.put(cp, value);
        map2.put(cp, value);
    }
    checkNext(map1, map2, Integer.MAX_VALUE);

    logln("Comparing General Category");
    check(map1, map2, -1);

    logln("Comparing Values");
    Set<String> values1 = map1.getAvailableValues(new TreeSet<String>());
    Set<String> values2 = new TreeSet<String>(map2.values());
    if (!TestBoilerplate.verifySetsIdentical(this, values1, values2)) {
        throw new IllegalArgumentException("Halting");
    }

    logln("Comparing Sets");
    for (String value : values1) {
        logln(value == null ? "null" : value);
        UnicodeSet set1 = map1.keySet(value);
        UnicodeSet set2 = TestBoilerplate.getSet(map2, value);
        if (!TestBoilerplate.verifySetsIdentical(this, set1, set2)) {
            throw new IllegalArgumentException("Halting");
        }
    }
}
// Value maps of the "script" and "general_category" properties, obtained from ICUPropertyFactory.
// NOTE(review): getUnicodeMap_internal() presumably exposes the property's own map rather than a copy; confirm before mutating either constant.
static final UnicodeMap<String> SCRIPTS = ICUPropertyFactory.make().getProperty("script").getUnicodeMap_internal();
static final UnicodeMap<String> GC = ICUPropertyFactory.make().getProperty("general_category").getUnicodeMap_internal();
/**
 * Composes a thawed copy of SCRIPTS with GC using a composer that joins both values
 * with "_", then checks for every code point that the composed value equals what the
 * composer yields for that code point's script and general category. Stops at the
 * first mismatch.
 */
public void TestUnicodeMapCompose() {
    logln("Getting Scripts");

    UnicodeMap.Composer<String> composer = new UnicodeMap.Composer<String>() {
        @Override
        public String compose(int codepoint, String string, String a, String b) {
            return a.toString() + "_" + b.toString();
        }
    };

    logln("Trying Compose");

    UnicodeMap<String> composed = ((UnicodeMap)SCRIPTS.cloneAsThawed()).composeWith(GC, composer);
    String last = "";
    // Inclusive bound: the former 'i < 0x10FFFF' never checked U+10FFFF itself.
    for (int i = 0; i <= 0x10FFFF; ++i) {
        String comp = composed.getValue(i);
        String gc = GC.getValue(i);
        String sc = SCRIPTS.getValue(i);
        if (!comp.equals(composer.compose(i, null, sc, gc))) {
            errln("Failed compose at: " + i);
            break;
        }
        // Log only where the composed value changes.
        if (!last.equals(comp)) {
            logln(Utility.hex(i) + "\t" + comp);
            last = comp;
        }
    }
}
/**
 * Runs the boilerplate checks, passing "TestMain" plus "-verbose" when this test is
 * itself running verbosely.
 */
public void testBoilerplate() {
    List<String> argList = new ArrayList<String>();
    argList.add("TestMain");
    if (params.verbose) {
        argList.add("-verbose");
    }
    String[] args = argList.toArray(new String[argList.size()]);
    new UnicodeMapBoilerplate().run(args);
    // TODO: the following is not being reached
    new UnicodeSetBoilerplate().run(args);
}
/**
 * Smoke test: hashCode() and stringKeys() on a freshly created map must not throw.
 * The results themselves are deliberately unused.
 */
public void TestAUnicodeMap2() {
    UnicodeMap emptyMap = new UnicodeMap();
    @SuppressWarnings("unused")
    int ignoredHash = emptyMap.hashCode(); // must not NPE
    @SuppressWarnings("unused")
    Set ignoredKeys = emptyMap.stringKeys(); // must not NPE
}
/**
 * Round-trips a map with code point and string keys through addInverseTo() and
 * putAllInverse(), and expects the result to equal the original map.
 */
public void TestAUnicodeMapInverse() {
    UnicodeMap<Character> original = new UnicodeMap<Character>()
            .putAll('a', 'z', 'b')
            .put("ab", 'c')
            .put('x', 'b')
            .put("xy", 'c');

    Map<Character, UnicodeSet> inverse = new HashMap<Character, UnicodeSet>();
    original.addInverseTo(inverse);

    UnicodeMap<Character> roundTripped = new UnicodeMap().putAllInverse(inverse);
    assertEquals("", original, roundTripped);
}
public void TestCollectionUtilitySpeed() {
TreeSet ts1 = new TreeSet();
TreeSet ts2 = new TreeSet();
@ -201,7 +44,7 @@ public class TestUtilities extends TestFmwk {
private void timeAndCompare(TreeSet ts1, TreeSet ts2, int iterations, boolean expected, double factorOfStandard) {
double utilityTimeSorted = timeUtilityContainsAll(iterations, ts1, ts2, expected)/(double)iterations;
double standardTimeSorted = timeStandardContainsAll(iterations, ts1, ts2, expected)/(double)iterations;
if (utilityTimeSorted < standardTimeSorted*factorOfStandard) {
logln("Sorted: Utility time (" + utilityTimeSorted + ") << Standard duration (" + standardTimeSorted + "); " + 100*(utilityTimeSorted/standardTimeSorted) + "%");
} else {
@ -245,7 +88,7 @@ public class TestUtilities extends TestFmwk {
}
return utilityTime;
}
public void TestCollectionUtilities() {
String[][] test = {{"a", "c", "e", "g", "h", "z"}, {"b", "d", "f", "h", "w"}, { "a", "b" }, { "a", "d" }, {"d"}, {}}; //
int resultMask = 0;
@ -312,252 +155,4 @@ public class TestUtilities extends TestFmwk {
errln("Fails relation: " + a + " \t" + RelationName[relation] + " \t" + b);
}
}
/**
 * Rebuilds a code point to value map from a UnicodeMapIterator over {@code map1}, once
 * with nextRange() and once with next(), and compares each rebuilt map with
 * {@code map2} via checkMap().
 *
 * @param map1 UnicodeMap to iterate
 * @param map2 reference map
 * @param limit not used by the current implementation
 */
private void checkNext(UnicodeMap<String> map1, Map<Integer,String> map2, int limit) {
    UnicodeMapIterator<String> mi = new UnicodeMapIterator<String>(map1);

    // Pass 1: range iteration, expanded to single code points.
    logln("Comparing nextRange");
    Map byRange = new TreeMap();
    while (mi.nextRange()) {
        logln(Utility.hex(mi.codepoint) + ".." + Utility.hex(mi.codepointEnd) + " => " + mi.value);
        int cp = mi.codepoint;
        while (cp <= mi.codepointEnd) {
            byRange.put(cp, mi.value);
            ++cp;
        }
    }
    checkMap(map2, byRange);

    // Pass 2: element-by-element iteration over the same iterator after a reset.
    logln("Comparing next");
    mi.reset();
    Map byElement = new TreeMap();
    while (mi.next()) {
        byElement.put(mi.codepoint, mi.value);
    }
    checkMap(map2, byElement);
}
/**
 * Compares the value of every code point below LIMIT in both maps and, for each
 * mismatch, reports the difference followed by dumps of the UnicodeMap, the put log
 * and the reference map.
 *
 * @param map1 UnicodeMap under test
 * @param map2 reference map
 * @param counter caller-supplied number, used only to label the error output
 */
public void check(UnicodeMap<String> map1, Map<Integer,String> map2, int counter) {
    for (int cp = 0; cp < LIMIT; ++cp) {
        final String fromUnicodeMap = map1.getValue(cp);
        final String fromReference = map2.get(cp);
        if (UnicodeMap.areEqual(fromUnicodeMap, fromReference)) {
            continue;
        }
        errln(counter + " Difference at " + Utility.hex(cp)
                + "\t UnicodeMap: " + fromUnicodeMap
                + "\t HashMap: " + fromReference);
        errln("UnicodeMap: " + map1);
        errln("Log: " + TestBoilerplate.show(log));
        errln("HashMap: " + TestBoilerplate.show(map2));
    }
}
/**
 * Reports an error listing the entries that differ between the two maps; does nothing
 * when the maps are equal. Each direction of the difference is produced by
 * getEntries() with a limit of 20.
 */
void checkMap(Map m1, Map m2) {
    if (!m1.equals(m2)) {
        StringBuilder report = new StringBuilder();
        Set firstEntries = m1.entrySet();
        Set secondEntries = m2.entrySet();
        getEntries("\r\nIn First, and not Second", firstEntries, secondEntries, report, 20);
        getEntries("\r\nIn Second, and not First", secondEntries, firstEntries, report, 20);
        errln(report.toString());
    }
}
/**
 * Orders map entries by key, then by value. A null entry, key or value sorts before
 * any non-null one.
 */
static Comparator<Map.Entry<Integer, String>> ENTRY_COMPARATOR = new Comparator<Map.Entry<Integer, String>>() {
    public int compare(Map.Entry<Integer, String> o1, Map.Entry<Integer, String> o2) {
        if (o1 == o2) {
            return 0;
        } else if (o1 == null) {
            return -1;
        } else if (o2 == null) {
            return 1;
        }
        final int byKey = compare2(o1.getKey(), o2.getKey());
        return byKey != 0 ? byKey : compare2(o1.getValue(), o2.getValue());
    }

    /** Null-tolerant compareTo: identical references are equal, null sorts first. */
    private <T extends Comparable> int compare2(T o1, T o2) {
        if (o1 == o2) {
            return 0;
        } else if (o1 == null) {
            return -1;
        } else if (o2 == null) {
            return 1;
        }
        return o1.compareTo(o2);
    }
};
/**
 * Appends to {@code buffer} a titled listing of the entries that are in
 * {@code m1entries} but not in {@code m2entries}, sorted with ENTRY_COMPARATOR.
 * The header line carries the full count of such entries; at most {@code limit}
 * of them are then written, one "key => value" line each.
 *
 * @param title text written before the count
 * @param m1entries entries to list
 * @param m2entries entries to exclude
 * @param buffer receives the output
 * @param limit maximum number of entries to write
 */
private void getEntries(String title, Set<Map.Entry<Integer,String>> m1entries, Set<Map.Entry<Integer, String>> m2entries, StringBuilder buffer, int limit) {
    Set<Map.Entry<Integer, String>> m1_m2 = new TreeSet<Map.Entry<Integer, String>>(ENTRY_COMPARATOR);
    m1_m2.addAll(m1entries);
    m1_m2.removeAll(m2entries);
    buffer.append(title).append(": ").append(m1_m2.size()).append("\r\n");
    int remaining = limit;
    for (Map.Entry<Integer, String> entry : m1_m2) {
        // Stop after 'limit' entries; the former 'limit-- < 0' test let limit + 1 through.
        if (remaining-- <= 0) {
            return;
        }
        buffer.append(entry.getKey()).append(" => ")
                .append(entry.getValue()).append("\r\n");
    }
}
// Highest code point visited by the loops that fill the maps (their bound is inclusive).
static final int SET_LIMIT = 0x10FFFF;
// Exclusive upper bound of the lookup loop in _checkGetTime.
static final int CHECK_LIMIT = 0xFFFF;
// Formats the timing ratios logged by TestTime.
static final NumberFormat pf = NumberFormat.getPercentInstance();
// Formats the per-iteration times logged by checkSetTime and checkGetTime.
static final NumberFormat nf = NumberFormat.getInstance();
/**
 * Logs how long puts take on a HashMap and a TreeMap relative to a UnicodeMap, and,
 * unless this is a short run (inclusion below 10), how long lookups take on a HashMap,
 * the ICU property API and a TreeMap relative to a UnicodeMap.
 */
public void TestTime() {
    final boolean shortTest = getInclusion() < 10;
    final int warmup = shortTest ? 1 : 20;

    // Put timings; type 0 = UnicodeMap, 1 = HashMap, 3 = TreeMap.
    double umTime = checkSetTime(warmup, 0);
    double hashTime = checkSetTime(warmup, 1);
    logln("Percentage: " + pf.format(hashTime/umTime));
    double treeTime = checkSetTime(warmup, 3);
    logln("Percentage: " + pf.format(treeTime/umTime));

    if (!shortTest) {
        // Lookup timings; type 2 = ICU property API.
        umTime = checkGetTime(1000, 0);
        hashTime = checkGetTime(1000, 1);
        logln("Percentage: " + pf.format(hashTime/umTime));
        double icuTime = checkGetTime(1000, 2);
        logln("Percentage: " + pf.format(icuTime/umTime));
        treeTime = checkGetTime(1000, 3);
        logln("Percentage: " + pf.format(treeTime/umTime));
    }
}
// Property whose values the map-filling and timing loops read from UCharacter.
int propEnum = UProperty.GENERAL_CATEGORY;
/**
 * Times puts for one map type: runs a single discarded pass first, then the measured
 * passes, and logs the result under the map type's name.
 *
 * @param iterations number of measured passes
 * @param type 0 = UnicodeMap, 1 = HashMap, 2 = ICU, anything else = TreeMap (label only)
 * @return the value returned by _checkSetTime for the measured passes
 */
double checkSetTime(int iterations, int type) {
    _checkSetTime(1, type); // result discarded
    final double seconds = _checkSetTime(iterations, type);
    final String label;
    switch (type) {
    case 0:
        label = "UnicodeMap";
        break;
    case 1:
        label = "HashMap";
        break;
    case 2:
        label = "ICU";
        break;
    default:
        label = "TreeMap";
        break;
    }
    logln(label + "\t" + nf.format(seconds));
    return seconds;
}
/**
 * Measures how long it takes to put the long General_Category name of every code
 * point with a positive enum value into one map type.
 *
 * @param iterations number of full passes over the code point range
 * @param type 0 = UnicodeMap, 1 = HashMap, 3 = TreeMap; other values put nothing
 * @return elapsed seconds per pass
 */
double _checkSetTime(int iterations, int type) {
    // Work on a thawed copy: putting into SCRIPTS itself (as the previous version did
    // for type 0) overwrites the shared static script map with General_Category names.
    UnicodeMap<String> map1 = (UnicodeMap<String>) SCRIPTS.cloneAsThawed();
    Map<Integer,String> map2 = map1.putAllCodepointsInto(new HashMap<Integer,String>());
    Map<Integer, String> map3 = new TreeMap<Integer, String>(map2);
    System.gc();
    double start = System.currentTimeMillis();
    for (int j = 0; j < iterations; ++j) {
        for (int cp = 0; cp <= SET_LIMIT; ++cp) {
            int enumValue = UCharacter.getIntPropertyValue(cp, propEnum);
            if (enumValue <= 0) continue; // for smaller set
            String value = UCharacter.getPropertyValueName(propEnum,enumValue, UProperty.NameChoice.LONG);
            switch(type) {
            case 0: map1.put(cp, value); break;
            case 1: map2.put(cp, value); break;
            case 3: map3.put(cp, value); break;
            }
        }
    }
    double end = System.currentTimeMillis();
    // milliseconds -> seconds, averaged over the passes
    return (end-start)/1000/iterations;
}
/**
 * Creates the three maps, runs _checkGetTime once with a single iteration as a
 * warm-up, then again with {@code iterations}, logs the second result next to
 * the name of the access path being timed, and returns it.
 *
 * @param iterations number of passes for the measured run
 * @param type 0 = UnicodeMap, 1 = HashMap, 2 = ICU, anything else = TreeMap
 * @return the value returned by the measured _checkGetTime call
 */
double checkGetTime(int iterations, int type) {
    // NOTE(review): the UnicodeMap and TreeMap are created empty here, and the
    // HashMap is filled from the empty UnicodeMap, so the lookups presumably
    // all miss -- confirm that this is what the test means to measure.
    UnicodeMap<String> unicodeMap = new UnicodeMap<String>();
    Map<Integer,String> hashMap = unicodeMap.putAllCodepointsInto(new HashMap<Integer,String>());
    Map<Integer, String> treeMap = new TreeMap<Integer, String>();
    _checkGetTime(unicodeMap, hashMap, treeMap, 1, type); // warmup
    final double measured = _checkGetTime(unicodeMap, hashMap, treeMap, iterations, type);
    final String label;
    switch (type) {
    case 0:
        label = "UnicodeMap";
        break;
    case 1:
        label = "HashMap";
        break;
    case 2:
        label = "ICU";
        break;
    default:
        label = "TreeMap";
        break;
    }
    logln(label + "\t" + nf.format(measured));
    return measured;
}
/**
 * Times one lookup per code point from 0 up to, but not including, CHECK_LIMIT.
 *
 * @param map1 queried with getValue when {@code type} is 0
 * @param map2 queried with get when {@code type} is 1
 * @param map3 queried with get when {@code type} is 3
 * @param iterations number of full passes over the code point range
 * @param type selects the access path; 2 asks UCharacter for the long name of
 *             the {@code propEnum} value instead of using a map, and any other
 *             unlisted value performs no lookup
 * @return elapsed wall-clock seconds divided by {@code iterations}
 */
double _checkGetTime(UnicodeMap<String> map1, Map<Integer,String> map2, Map<Integer,String> map3, int iterations, int type) {
    System.gc();
    double started = System.currentTimeMillis();
    for (int pass = 0; pass < iterations; ++pass) {
        for (int cp = 0; cp < CHECK_LIMIT; ++cp) {
            if (type == 0) {
                map1.getValue(cp);
            } else if (type == 1) {
                map2.get(cp);
            } else if (type == 2) {
                int enumValue = UCharacter.getIntPropertyValue(cp, propEnum);
                UCharacter.getPropertyValueName(propEnum, enumValue, UProperty.NameChoice.LONG);
            } else if (type == 3) {
                map3.get(cp);
            }
        }
    }
    double finished = System.currentTimeMillis();
    return (finished - started) / 1000 / iterations;
}
/**
 * TestBoilerplate specialization that supplies randomly filled UnicodeMap
 * instances as test objects.
 */
static class UnicodeMapBoilerplate extends TestBoilerplate {
    /**
     * Always reports that the two objects behave the same; the equals method
     * is trusted, so no further comparison is made.
     */
    protected boolean _hasSameBehavior(Object a, Object b) {
        return true;
    }

    /**
     * Adds one UnicodeMap to {@code list}, built from 50 random puts of a
     * TEST_VALUES entry at a code point in [0, 25).
     *
     * @return false, without adding anything, once the list holds more than
     *         30 objects; true otherwise
     */
    protected boolean _addTestObject(List list) {
        if (list.size() > 30) {
            return false;
        }
        UnicodeMap generated = new UnicodeMap();
        int remaining = 50;
        while (remaining-- > 0) {
            // Draw the code point first, then the value, to keep the random sequence.
            int codePoint = random.nextInt(25);
            int valueIndex = random.nextInt(TEST_VALUES.length);
            generated.put(codePoint, TEST_VALUES[valueIndex]);
        }
        list.add(generated);
        return true;
    }
}
/**
 * TestBoilerplate specialization that supplies random 10-character strings as
 * test objects.
 */
static class StringBoilerplate extends TestBoilerplate {
    /**
     * Always reports that the two objects behave the same; the equals method
     * is trusted, so no further comparison is made.
     */
    protected boolean _hasSameBehavior(Object a, Object b) {
        return true;
    }

    /**
     * Adds one String of 10 random chars, each in [0, 0xFF), to {@code list}.
     *
     * @return false, without adding anything, once the list holds more than
     *         31 objects; true otherwise
     */
    protected boolean _addTestObject(List list) {
        if (list.size() > 31) {
            return false;
        }
        char[] chars = new char[10];
        for (int index = 0; index < chars.length; ++index) {
            chars[index] = (char) random.nextInt(0xFF);
        }
        list.add(new String(chars));
        return true;
    }
}
/**
 * TestBoilerplate specialization whose test objects are derived from randomly
 * filled UnicodeSet instances.
 */
static class UnicodeSetBoilerplate extends TestBoilerplate {
    /**
     * Always reports that the two objects behave the same; the equals method
     * is trusted, so no further comparison is made.
     */
    protected boolean _hasSameBehavior(Object a, Object b) {
        return true;
    }

    /**
     * Builds a UnicodeSet from 50 random code points in [0, 100) and adds its
     * string form to {@code list}.
     *
     * @return false, without adding anything, once the list holds more than
     *         32 objects; true otherwise
     */
    protected boolean _addTestObject(List list) {
        if (list.size() > 32) {
            return false;
        }
        UnicodeSet generated = new UnicodeSet();
        int remaining = 50;
        while (remaining-- > 0) {
            generated.add(random.nextInt(100));
        }
        // NOTE(review): the set's toString() is stored, not the UnicodeSet
        // itself, so the boilerplate checks run on Strings -- confirm intent.
        String asText = generated.toString();
        list.add(asText);
        return true;
    }
}
}

View File

@ -1,563 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
// copied from the Transliterator demo
package com.ibm.icu.dev.util;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Incrementally returns the set of all strings that case-fold to the same value.
*/
public class CaseIterator {
// Transliterators used only for console output (by main and by the DUMP / GENERATE paths).
private static Transliterator toName = Transliterator.getInstance("[:^ascii:] Any-Name");
private static Transliterator toHex = Transliterator.getInstance("[:^ascii:] Any-Hex");
private static Transliterator toHex2 = Transliterator.getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");
// Global tables, filled by the static initializer below (could be precompiled).
// fromCaseFold: case-folded string -> all strings that fold to it
//               (a Set while being built, replaced by String[] at the end of initialization).
// toCaseFold:   string -> its case-folded form (folded forms map to themselves).
// maxLength:    length, in chars, of the longest case-folded form seen.
private static Map fromCaseFold = new HashMap();
private static Map toCaseFold = new HashMap();
private static int maxLength = 0;
// The exception list below is generated on the console by turning on the GENERATE flag,
// which MUST be false for normal operation.
// Once the list is generated, it is pasted in here.
// A bit of a kludge, but this bootstrapping is the easiest way
// to get around certain complications in the data.
private static final boolean GENERATE = false;
// When true, the static initializer prints maxLength and both tables to System.out.
private static final boolean DUMP = false;
private static String[][] exceptionList = {
// a\N{MODIFIER LETTER RIGHT HALF RING}
{"a\u02BE","A\u02BE","a\u02BE",},
// ff
{"ff","FF","Ff","fF","ff",},
// ffi
{"ffi","FFI","FFi","FfI","Ffi","F\uFB01","fFI","fFi","ffI","ffi","f\uFB01","\uFB00I","\uFB00i",},
// ffl
{"ffl","FFL","FFl","FfL","Ffl","F\uFB02","fFL","fFl","ffL","ffl","f\uFB02","\uFB00L","\uFB00l",},
// fi
{"fi","FI","Fi","fI","fi",},
// fl
{"fl","FL","Fl","fL","fl",},
// h\N{COMBINING MACRON BELOW}
{"h\u0331","H\u0331","h\u0331",},
// i\N{COMBINING DOT ABOVE}
{"i\u0307","I\u0307","i\u0307",},
// j\N{COMBINING CARON}
{"j\u030C","J\u030C","j\u030C",},
// ss
{"ss","SS","Ss","S\u017F","sS","ss","s\u017F","\u017FS","\u017Fs","\u017F\u017F",},
// st
{"st","ST","St","sT","st","\u017FT","\u017Ft",},
// t\N{COMBINING DIAERESIS}
{"t\u0308","T\u0308","t\u0308",},
// w\N{COMBINING RING ABOVE}
{"w\u030A","W\u030A","w\u030A",},
// y\N{COMBINING RING ABOVE}
{"y\u030A","Y\u030A","y\u030A",},
// \N{MODIFIER LETTER APOSTROPHE}n
{"\u02BCn","\u02BCN","\u02BCn",},
// \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
{"\u03AC\u03B9","\u0386\u0345","\u0386\u0399","\u0386\u03B9","\u0386\u1FBE","\u03AC\u0345","\u03AC\u0399","\u03AC\u03B9","\u03AC\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
{"\u03AE\u03B9","\u0389\u0345","\u0389\u0399","\u0389\u03B9","\u0389\u1FBE","\u03AE\u0345","\u03AE\u0399","\u03AE\u03B9","\u03AE\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
{"\u03B1\u0342","\u0391\u0342","\u03B1\u0342",},
// \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u03B1\u0342\u03B9","\u0391\u0342\u0345","\u0391\u0342\u0399","\u0391\u0342\u03B9","\u0391\u0342\u1FBE",
"\u03B1\u0342\u0345","\u03B1\u0342\u0399","\u03B1\u0342\u03B9","\u03B1\u0342\u1FBE","\u1FB6\u0345",
"\u1FB6\u0399","\u1FB6\u03B9","\u1FB6\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
{"\u03B1\u03B9","\u0391\u0345","\u0391\u0399","\u0391\u03B9","\u0391\u1FBE","\u03B1\u0345","\u03B1\u0399","\u03B1\u03B9","\u03B1\u1FBE",},
// \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
{"\u03B7\u0342","\u0397\u0342","\u03B7\u0342",},
// \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u03B7\u0342\u03B9","\u0397\u0342\u0345","\u0397\u0342\u0399","\u0397\u0342\u03B9","\u0397\u0342\u1FBE",
"\u03B7\u0342\u0345","\u03B7\u0342\u0399","\u03B7\u0342\u03B9","\u03B7\u0342\u1FBE","\u1FC6\u0345","\u1FC6\u0399",
"\u1FC6\u03B9","\u1FC6\u1FBE",},
// \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
{"\u03B7\u03B9","\u0397\u0345","\u0397\u0399","\u0397\u03B9","\u0397\u1FBE","\u03B7\u0345","\u03B7\u0399","\u03B7\u03B9","\u03B7\u1FBE",},
// \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
{"\u03B9\u0308\u0300","\u0345\u0308\u0300","\u0399\u0308\u0300","\u03B9\u0308\u0300","\u1FBE\u0308\u0300",},
// \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
{"\u03B9\u0308\u0301","\u0345\u0308\u0301","\u0399\u0308\u0301","\u03B9\u0308\u0301","\u1FBE\u0308\u0301",},
// \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
{"\u03B9\u0308\u0342","\u0345\u0308\u0342","\u0399\u0308\u0342","\u03B9\u0308\u0342","\u1FBE\u0308\u0342",},
// \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
{"\u03B9\u0342","\u0345\u0342","\u0399\u0342","\u03B9\u0342","\u1FBE\u0342",},
// \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
{"\u03C1\u0313","\u03A1\u0313","\u03C1\u0313","\u03F1\u0313",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
{"\u03C5\u0308\u0300","\u03A5\u0308\u0300","\u03C5\u0308\u0300",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
{"\u03C5\u0308\u0301","\u03A5\u0308\u0301","\u03C5\u0308\u0301",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
{"\u03C5\u0308\u0342","\u03A5\u0308\u0342","\u03C5\u0308\u0342",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
{"\u03C5\u0313","\u03A5\u0313","\u03C5\u0313",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
{"\u03C5\u0313\u0300","\u03A5\u0313\u0300","\u03C5\u0313\u0300","\u1F50\u0300",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
{"\u03C5\u0313\u0301","\u03A5\u0313\u0301","\u03C5\u0313\u0301","\u1F50\u0301",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
{"\u03C5\u0313\u0342","\u03A5\u0313\u0342","\u03C5\u0313\u0342","\u1F50\u0342",},
// \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
{"\u03C5\u0342","\u03A5\u0342","\u03C5\u0342",},
// \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
{"\u03C9\u0342","\u03A9\u0342","\u03C9\u0342","\u2126\u0342",},
// \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u03C9\u0342\u03B9","\u03A9\u0342\u0345","\u03A9\u0342\u0399","\u03A9\u0342\u03B9","\u03A9\u0342\u1FBE","\u03C9\u0342\u0345","\u03C9\u0342\u0399","\u03C9\u0342\u03B9","\u03C9\u0342\u1FBE","\u1FF6\u0345",
"\u1FF6\u0399","\u1FF6\u03B9","\u1FF6\u1FBE","\u2126\u0342\u0345","\u2126\u0342\u0399","\u2126\u0342\u03B9","\u2126\u0342\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
{"\u03C9\u03B9","\u03A9\u0345","\u03A9\u0399","\u03A9\u03B9","\u03A9\u1FBE","\u03C9\u0345","\u03C9\u0399","\u03C9\u03B9","\u03C9\u1FBE","\u2126\u0345","\u2126\u0399","\u2126\u03B9","\u2126\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
{"\u03CE\u03B9","\u038F\u0345","\u038F\u0399","\u038F\u03B9","\u038F\u1FBE","\u03CE\u0345","\u03CE\u0399","\u03CE\u03B9","\u03CE\u1FBE",},
// \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
{"\u0565\u0582","\u0535\u0552","\u0535\u0582","\u0565\u0552","\u0565\u0582",},
// \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
{"\u0574\u0565","\u0544\u0535","\u0544\u0565","\u0574\u0535","\u0574\u0565",},
// \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
{"\u0574\u056B","\u0544\u053B","\u0544\u056B","\u0574\u053B","\u0574\u056B",},
// \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
{"\u0574\u056D","\u0544\u053D","\u0544\u056D","\u0574\u053D","\u0574\u056D",},
// \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
{"\u0574\u0576","\u0544\u0546","\u0544\u0576","\u0574\u0546","\u0574\u0576",},
// \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
{"\u057E\u0576","\u054E\u0546","\u054E\u0576","\u057E\u0546","\u057E\u0576",},
// \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
{"\u1F00\u03B9","\u1F00\u0345","\u1F00\u0399","\u1F00\u03B9","\u1F00\u1FBE","\u1F08\u0345","\u1F08\u0399","\u1F08\u03B9","\u1F08\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F01\u03B9","\u1F01\u0345","\u1F01\u0399","\u1F01\u03B9","\u1F01\u1FBE","\u1F09\u0345","\u1F09\u0399","\u1F09\u03B9","\u1F09\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F02\u03B9","\u1F02\u0345","\u1F02\u0399","\u1F02\u03B9","\u1F02\u1FBE","\u1F0A\u0345","\u1F0A\u0399","\u1F0A\u03B9","\u1F0A\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F03\u03B9","\u1F03\u0345","\u1F03\u0399","\u1F03\u03B9","\u1F03\u1FBE","\u1F0B\u0345","\u1F0B\u0399","\u1F0B\u03B9","\u1F0B\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F04\u03B9","\u1F04\u0345","\u1F04\u0399","\u1F04\u03B9","\u1F04\u1FBE","\u1F0C\u0345","\u1F0C\u0399","\u1F0C\u03B9","\u1F0C\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F05\u03B9","\u1F05\u0345","\u1F05\u0399","\u1F05\u03B9","\u1F05\u1FBE","\u1F0D\u0345","\u1F0D\u0399","\u1F0D\u03B9","\u1F0D\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u1F06\u03B9","\u1F06\u0345","\u1F06\u0399","\u1F06\u03B9","\u1F06\u1FBE","\u1F0E\u0345","\u1F0E\u0399","\u1F0E\u03B9","\u1F0E\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u1F07\u03B9","\u1F07\u0345","\u1F07\u0399","\u1F07\u03B9","\u1F07\u1FBE","\u1F0F\u0345","\u1F0F\u0399","\u1F0F\u03B9","\u1F0F\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
{"\u1F20\u03B9","\u1F20\u0345","\u1F20\u0399","\u1F20\u03B9","\u1F20\u1FBE","\u1F28\u0345","\u1F28\u0399","\u1F28\u03B9","\u1F28\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F21\u03B9","\u1F21\u0345","\u1F21\u0399","\u1F21\u03B9","\u1F21\u1FBE","\u1F29\u0345","\u1F29\u0399","\u1F29\u03B9","\u1F29\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F22\u03B9","\u1F22\u0345","\u1F22\u0399","\u1F22\u03B9","\u1F22\u1FBE","\u1F2A\u0345","\u1F2A\u0399","\u1F2A\u03B9","\u1F2A\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F23\u03B9","\u1F23\u0345","\u1F23\u0399","\u1F23\u03B9","\u1F23\u1FBE","\u1F2B\u0345","\u1F2B\u0399","\u1F2B\u03B9","\u1F2B\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F24\u03B9","\u1F24\u0345","\u1F24\u0399","\u1F24\u03B9","\u1F24\u1FBE","\u1F2C\u0345","\u1F2C\u0399","\u1F2C\u03B9","\u1F2C\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F25\u03B9","\u1F25\u0345","\u1F25\u0399","\u1F25\u03B9","\u1F25\u1FBE","\u1F2D\u0345","\u1F2D\u0399","\u1F2D\u03B9","\u1F2D\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u1F26\u03B9","\u1F26\u0345","\u1F26\u0399","\u1F26\u03B9","\u1F26\u1FBE","\u1F2E\u0345","\u1F2E\u0399","\u1F2E\u03B9","\u1F2E\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u1F27\u03B9","\u1F27\u0345","\u1F27\u0399","\u1F27\u03B9","\u1F27\u1FBE","\u1F2F\u0345","\u1F2F\u0399","\u1F2F\u03B9","\u1F2F\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
{"\u1F60\u03B9","\u1F60\u0345","\u1F60\u0399","\u1F60\u03B9","\u1F60\u1FBE","\u1F68\u0345","\u1F68\u0399","\u1F68\u03B9","\u1F68\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F61\u03B9","\u1F61\u0345","\u1F61\u0399","\u1F61\u03B9","\u1F61\u1FBE","\u1F69\u0345","\u1F69\u0399","\u1F69\u03B9","\u1F69\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F62\u03B9","\u1F62\u0345","\u1F62\u0399","\u1F62\u03B9","\u1F62\u1FBE","\u1F6A\u0345","\u1F6A\u0399","\u1F6A\u03B9","\u1F6A\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F63\u03B9","\u1F63\u0345","\u1F63\u0399","\u1F63\u03B9","\u1F63\u1FBE","\u1F6B\u0345","\u1F6B\u0399","\u1F6B\u03B9","\u1F6B\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F64\u03B9","\u1F64\u0345","\u1F64\u0399","\u1F64\u03B9","\u1F64\u1FBE","\u1F6C\u0345","\u1F6C\u0399","\u1F6C\u03B9","\u1F6C\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F65\u03B9","\u1F65\u0345","\u1F65\u0399","\u1F65\u03B9","\u1F65\u1FBE","\u1F6D\u0345","\u1F6D\u0399","\u1F6D\u03B9","\u1F6D\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u1F66\u03B9","\u1F66\u0345","\u1F66\u0399","\u1F66\u03B9","\u1F66\u1FBE","\u1F6E\u0345","\u1F6E\u0399","\u1F6E\u03B9","\u1F6E\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
{"\u1F67\u03B9","\u1F67\u0345","\u1F67\u0399","\u1F67\u03B9","\u1F67\u1FBE","\u1F6F\u0345","\u1F6F\u0399","\u1F6F\u03B9","\u1F6F\u1FBE",},
// \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F70\u03B9","\u1F70\u0345","\u1F70\u0399","\u1F70\u03B9","\u1F70\u1FBE","\u1FBA\u0345","\u1FBA\u0399","\u1FBA\u03B9","\u1FBA\u1FBE",},
// \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F74\u03B9","\u1F74\u0345","\u1F74\u0399","\u1F74\u03B9","\u1F74\u1FBE","\u1FCA\u0345","\u1FCA\u0399","\u1FCA\u03B9","\u1FCA\u1FBE",},
// \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
{"\u1F7C\u03B9","\u1F7C\u0345","\u1F7C\u0399","\u1F7C\u03B9","\u1F7C\u1FBE","\u1FFA\u0345","\u1FFA\u0399","\u1FFA\u03B9","\u1FFA\u1FBE",},
};
// This initializes the data used to generate the case-equivalents.
// Steps, in order:
//   1. unless GENERATE is on, seed fromCaseFold with the pasted-in exception closures;
//   2. walk every code point and record its full case folding in both tables;
//   3. optionally dump the tables (DUMP);
//   4. replace every Set in fromCaseFold by a String[] so that reset()/next() can index it;
//   5. if GENERATE is on, print a fresh exception list for pasting back into this file.
static {
// Gather up the exceptions in a form we can use
if (!GENERATE) {
for (int i = 0; i < exceptionList.length; ++i) {
String[] exception = exceptionList[i];
Set s = new HashSet();
// there has to be some method to do the following, but I can't find it in the collections
for (int j = 0; j < exception.length; ++j) {
s.add(exception[j]);
}
// the first element of each row is the case-folded key
fromCaseFold.put(exception[0], s);
}
}
// walk through all the characters, and at every case fold result,
// put a set of all the characters that map to that result
boolean defaultmapping = true; // false for turkish
for (int i = 0; i <= 0x10FFFF; ++i) {
int cat = UCharacter.getType(i);
if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) continue;
String cp = UTF16.valueOf(i);
String mapped = UCharacter.foldCase(cp, defaultmapping);
// characters that fold to themselves are not recorded
if (mapped.equals(cp)) continue;
if (maxLength < mapped.length()) maxLength = mapped.length();
// at this point, have different case folding
Set s = (Set) fromCaseFold.get(mapped);
if (s == null) {
s = new HashSet();
s.add(mapped); // add the case fold result itself
fromCaseFold.put(mapped, s);
}
s.add(cp);
toCaseFold.put(cp, mapped);
toCaseFold.put(mapped, mapped); // add mapping to self
}
// Emit the final data
if (DUMP) {
System.out.println("maxLength = " + maxLength);
System.out.println("\nfromCaseFold:");
Iterator it = fromCaseFold.keySet().iterator();
while (it.hasNext()) {
Object key = it.next();
System.out.print(" " + toHex2.transliterate((String)key) + ": ");
Set s = (Set) fromCaseFold.get(key);
Iterator it2 = s.iterator();
boolean first = true;
while (it2.hasNext()) {
if (first) {
first = false;
} else {
System.out.print(", ");
}
System.out.print(toHex2.transliterate((String)it2.next()));
}
System.out.println("");
}
System.out.println("\ntoCaseFold:");
it = toCaseFold.keySet().iterator();
while (it.hasNext()) {
String key = (String) it.next();
String value = (String) toCaseFold.get(key);
System.out.println(" " + toHex2.transliterate(key) + ": " + toHex2.transliterate(value));
}
}
// Now convert all those sets into linear arrays
// We can't do this in place in Java, so make a temporary target array
// Note: This could be transformed into a single array, with offsets into it.
// Might be best choice in C.
Map fromCaseFold2 = new HashMap();
Iterator it = fromCaseFold.keySet().iterator();
while (it.hasNext()) {
Object key = it.next();
Set s = (Set) fromCaseFold.get(key);
String[] temp = new String[s.size()];
s.toArray(temp);
fromCaseFold2.put(key, temp);
}
fromCaseFold = fromCaseFold2;
// We have processed everything, so the iterator will now work
// The following is normally OFF.
// It is here to generate (under the GENERATE flag) the static exception list.
// It must be at the very end of initialization, so that the iterator is functional.
// (easiest to do it that way)
if (GENERATE) {
// first get small set of items that have multiple characters
Set multichars = new TreeSet();
it = fromCaseFold.keySet().iterator();
while (it.hasNext()) {
String key = (String) it.next();
if (UTF16.countCodePoint(key) < 2) continue;
multichars.add(key);
}
// now we will go through each of them.
CaseIterator ci = new CaseIterator();
it = multichars.iterator();
while (it.hasNext()) {
String key = (String) it.next();
// here is a nasty complication. Take 'ffi' ligature. We
// can't just close it, since we would miss the combination
// that includes the 'fi' => "fi" ligature
// so first do a pass through, and add substring combinations
// we call this a 'partial closure'
Set partialClosure = new TreeSet();
partialClosure.add(key);
if (UTF16.countCodePoint(key) > 2) {
Iterator multiIt2 = multichars.iterator();
while (multiIt2.hasNext()) {
String otherKey = (String) multiIt2.next();
if (otherKey.length() >= key.length()) continue;
int pos = -1;
while (true) {
// The following is not completely general
// but works for the actual cased stuff,
// and should work for future characters, since we won't have
// more ligatures & other oddities.
pos = key.indexOf(otherKey, pos+1);
if (pos < 0) break;
int endPos = pos + otherKey.length();
// we know we have a proper substring,
// so get the combinations
String[] choices = (String[]) fromCaseFold.get(otherKey);
for (int ii = 0; ii < choices.length; ++ii) {
String patchwork = key.substring(0, pos)
+ choices[ii]
+ key.substring(endPos);
partialClosure.add(patchwork);
}
}
}
}
// now, for each thing in the partial closure, get its
// case closure and add it to the final result.
Set closure = new TreeSet(); // this will be the real closure
Iterator partialIt = partialClosure.iterator();
while (partialIt.hasNext()) {
String key2 = (String) partialIt.next();
ci.reset(key2);
for (String temp = ci.next(); temp != null; temp = ci.next()) {
closure.add(temp);
}
// form closure
/*String[] choices = (String[]) fromCaseFold.get(key2);
for (int i = 0; i < choices.length; ++i) {
ci.reset(choices[i]);
String temp;
while (null != (temp = ci.next())) {
closure.add(temp);
}
}
*/
}
// print it out, so that it can be cut and pasted back into this document.
// (one comment line naming the key, then one row in exceptionList syntax)
Iterator it2 = closure.iterator();
System.out.println("\t// " + toName.transliterate(key));
System.out.print("\t{\"" + toHex.transliterate(key) + "\",");
while (it2.hasNext()) {
String item = (String)it2.next();
System.out.print("\"" + toHex.transliterate(item) + "\",");
}
System.out.println("},");
}
}
}
// ============ PRIVATE CLASS DATA ============
// The pieces the source was split into by reset(String); not changed during iteration.
// count is the number of pieces in use; variants[i] lists the alternatives for piece i.
private int count = 0;
private String[][] variants;
// State information, changes during iteration.
// done is set by next() once the last combination has been produced;
// counts[i] is the index into variants[i] that the next result will use.
private boolean done = false;
private int[] counts;
// Internal buffer reused by next() to assemble each result.
private StringBuffer nextBuffer = new StringBuffer();
// ========================
/**
 * Switches the iterator to a new source string and restarts the iteration.
 * The source is cut, left to right, into pieces; for each piece the list of
 * equivalent strings is remembered so that next() can combine them.
 * @param source The string to get case variants for
 */
public void reset(String source) {
    final int length = source.length();
    // One slot per char is an upper bound on the number of pieces.
    counts = new int[length];
    variants = new String[length][];
    count = 0;

    // TODO: could optimize this later to coalesce all single string pieces
    int start = 0;
    while (start < length) {
        String piece = null;
        String folded = null;
        if (GENERATE) {
            // do exactly one CP
            piece = UTF16.valueOf(source, start);
            folded = (String) toCaseFold.get(piece);
        } else {
            // Try the *longest* candidate first and shrink until one is known.
            int end = Math.min(start + maxLength, length);
            for (; end > start && folded == null; --end) {
                piece = source.substring(start, end);
                folded = (String) toCaseFold.get(piece);
            }
        }

        if (folded != null) {
            variants[count++] = (String[]) fromCaseFold.get(folded);
        } else {
            // Nothing matched: the piece is a single code point with no alternatives.
            piece = UTF16.valueOf(source, start);
            variants[count++] = new String[] {piece};
        }
        start += piece.length();
    }
    reset();
}
/**
 * Rewinds the iteration to the first variant of the current source.
 */
public void reset() {
    done = false;
    // Only the first 'count' slots are in use; point each back at its first alternative.
    int slot = count;
    while (--slot >= 0) {
        counts[slot] = 0;
    }
}
/**
 * Produces the next case variant of the source.
 * @return next case variant. Each variant will case-fold to the same value as the source will.
 * When the iteration is done, null is returned.
 */
public String next() {
    if (done) {
        return null;
    }

    // TODO Optimize so we keep the piece before and after the current position
    // so we don't have so much concatenation
    // Assemble the current combination: one alternative per piece.
    nextBuffer.setLength(0);
    for (int piece = 0; piece < count; ++piece) {
        nextBuffer.append(variants[piece][counts[piece]]);
    }

    // Advance like an odometer: bump the last piece, and carry into the
    // previous one each time a piece runs out of alternatives.
    int slot = count - 1;
    while (slot >= 0) {
        if (++counts[slot] < variants[slot].length) {
            break;
        }
        counts[slot] = 0;
        --slot;
    }

    // Carrying past the first piece means every combination has been produced.
    if (slot < 0) {
        done = true;
    }
    return nextBuffer.toString();
}
/**
 * Temporary test, just to see how the stuff works: prints every case variant
 * of two sample strings, then computes and prints the set of code points whose
 * only case variant is themselves, and how that set differs from [:^lc:].
 */
static public void main(String[] args) {
    final String[] samples = {"fiss", "h\u03a3"};
    final CaseIterator iterator = new CaseIterator();

    for (int s = 0; s < samples.length; ++s) {
        final String sample = samples[s];
        System.out.println();
        System.out.println("Testing: " + toName.transliterate(sample));
        System.out.println();
        iterator.reset(sample);
        int total = 0;
        String variant;
        while ((variant = iterator.next()) != null) {
            System.out.println(toName.transliterate(variant));
            total++;
        }
        System.out.println("Total: " + total);
    }

    // generate a list of all caseless characters -- characters whose
    // case closure is themselves.
    final UnicodeSet caseless = new UnicodeSet();
    for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint) {
        final String text = UTF16.valueOf(codePoint);
        iterator.reset(text);
        int seen = 0;
        String last = null;
        String variant = iterator.next();
        while (variant != null) {
            last = variant;
            // A second variant already proves the character is not caseless.
            if (++seen > 1) {
                break;
            }
            variant = iterator.next();
        }
        if (seen == 1 && last.equals(text)) {
            caseless.add(codePoint);
        }
    }
    System.out.println("caseless = " + caseless.toPattern(true));

    final UnicodeSet notLowercase = new UnicodeSet("[:^lc:]");
    final UnicodeSet difference = new UnicodeSet();
    difference.set(notLowercase);
    difference.removeAll(caseless);
    System.out.println("[:^lc:] - caseless = " + difference.toPattern(true));
    difference.set(caseless);
    difference.removeAll(notLowercase);
    System.out.println("caseless - [:^lc:] = " + difference.toPattern(true));
}
}

View File

@ -1,147 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2002-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Locale;
/**
 * Static helpers for opening text files with an explicit encoding, copying
 * their contents to a writer with optional search-and-replace, and doing
 * simple literal string replacement.
 */
public class FileUtilities {
    /** True when the system property SHOW_FILES is set; enables the "Opening/Creating File" log lines. */
    public static final boolean SHOW_FILES;
    static {
        boolean showFiles = false;
        try {
            showFiles = System.getProperty("SHOW_FILES") != null;
        } catch (SecurityException ignored) {
            // Not permitted to read system properties: leave file logging off.
        }
        SHOW_FILES = showFiles;
    }

    /** Auto-flushing writer on System.out. */
    public static final PrintWriter CONSOLE = new PrintWriter(System.out,true);

    private static PrintWriter log = CONSOLE;

    /**
     * Opens a file for reading as UTF-8.
     *
     * @see #openReader(String, String, String)
     */
    public static BufferedReader openUTF8Reader(String dir, String filename) throws IOException {
        return openReader(dir, filename, "UTF-8");
    }

    /**
     * Opens a file for reading with the given encoding.
     *
     * @param dir directory containing the file; null or empty means that
     *            {@code filename} is used on its own
     * @param filename name of the file
     * @param encoding charset name understood by InputStreamReader
     * @return a buffered reader; the caller must close it
     * @throws IOException if the file cannot be opened or the encoding is not supported
     */
    public static BufferedReader openReader(String dir, String filename, String encoding) throws IOException {
        File file = toFile(dir, filename);
        logFile("Opening File: ", file);
        FileInputStream in = new FileInputStream(file);
        boolean opened = false;
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(in, encoding),
                    4*1024);
            opened = true;
            return reader;
        } finally {
            // Do not leak the file handle when the encoding is rejected.
            if (!opened) {
                closeQuietly(in);
            }
        }
    }

    /**
     * Opens a file for writing as UTF-8.
     *
     * @see #openWriter(String, String, String)
     */
    public static PrintWriter openUTF8Writer(String dir, String filename) throws IOException {
        return openWriter(dir, filename, "UTF-8");
    }

    /**
     * Opens a file for writing with the given encoding, creating missing
     * parent directories first. An existing file is overwritten.
     *
     * @param dir directory for the file; null or empty means that
     *            {@code filename} is used on its own, as in openReader
     * @param filename name of the file
     * @param encoding charset name understood by OutputStreamWriter
     * @return a buffered print writer; the caller must close it
     * @throws IOException if the file cannot be created or the encoding is not supported
     */
    public static PrintWriter openWriter(String dir, String filename, String encoding) throws IOException {
        File file = toFile(dir, filename);
        logFile("Creating File: ", file);
        String parentName = file.getParent();
        if (parentName != null) {
            // A failure here surfaces when the file itself is opened below.
            new File(parentName).mkdirs();
        }
        FileOutputStream out = new FileOutputStream(file);
        boolean opened = false;
        try {
            PrintWriter writer = new PrintWriter(
                    new BufferedWriter(
                            new OutputStreamWriter(out, encoding),
                            4*1024));
            opened = true;
            return writer;
        } finally {
            // Do not leak the file handle when the encoding is rejected.
            if (!opened) {
                closeQuietly(out);
            }
        }
    }

    /**
     * Copies every line of a file to {@code output} unchanged.
     *
     * @see #appendFile(String, String, PrintWriter, String[])
     */
    public static void appendFile(String filename, String encoding, PrintWriter output) throws IOException {
        appendFile(filename, encoding, output, null);
    }

    /**
     * Copies every line of a file to {@code output}, applying the replacements
     * described for appendBufferedReader.
     *
     * @param filename path of the file to read
     * @param encoding charset name of the file
     * @param output receives the lines; it is not closed
     * @param replacementList search/replacement pairs, or null for none
     * @throws IOException if the file cannot be opened or read
     */
    public static void appendFile(String filename, String encoding, PrintWriter output, String[] replacementList) throws IOException {
        BufferedReader br = openReader("", filename, encoding);
        // appendBufferedReader closes the reader on both success and failure.
        appendBufferedReader(br, output, replacementList);
    }

    /**
     * Copies every line read from {@code br} to {@code output} and then closes
     * {@code br}, including when reading fails.
     *
     * @param br source of lines; always closed by this method
     * @param output receives each line via println; it is not closed
     * @param replacementList flat list of pairs [search0, replacement0,
     *            search1, replacement1, ...] applied to each line in order,
     *            or null for none; its length must be even
     * @throws IOException if reading or closing fails
     */
    public static void appendBufferedReader(BufferedReader br,
            PrintWriter output, String[] replacementList) throws IOException {
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (replacementList != null) {
                    for (int i = 0; i < replacementList.length; i += 2) {
                        line = replace(line, replacementList[i], replacementList[i+1]);
                    }
                }
                output.println(line);
            }
        } finally {
            br.close();
        }
    }

    /**
     * Replaces all occurrences of piece with replacement, and returns new String.
     * Matching is literal, left to right and non-overlapping; replaced text is
     * not searched again.
     *
     * @param source string to search; null is returned as null
     * @param piece text to look for; an empty piece matches nothing, so
     *            {@code source} is returned unchanged
     * @param replacement text substituted for each occurrence
     * @return {@code source} itself when nothing matches, otherwise a new string
     */
    public static String replace(String source, String piece, String replacement) {
        // An empty piece used to match at every position and never terminate.
        if (source == null || piece.length() == 0 || source.length() < piece.length()) return source;
        int pos = source.indexOf(piece);
        if (pos < 0) return source;
        // Build the result in one pass instead of re-copying the string per match.
        StringBuilder result = new StringBuilder(source.length() + replacement.length());
        int copiedTo = 0;
        do {
            result.append(source, copiedTo, pos).append(replacement);
            copiedTo = pos + piece.length();
            pos = source.indexOf(piece, copiedTo);
        } while (pos >= 0);
        result.append(source, copiedTo, source.length());
        return result.toString();
    }

    /**
     * Applies every {search, replacement} row of {@code replacements} to
     * {@code source}, in order.
     */
    public static String replace(String source, String[][] replacements) {
        return replace(source, replacements, replacements.length);
    }

    /**
     * Applies the first {@code count} {search, replacement} rows of
     * {@code replacements} to {@code source}, in order.
     */
    public static String replace(String source, String[][] replacements, int count) {
        for (int i = 0; i < count; ++i) {
            source = replace(source, replacements[i][0], replacements[i][1]);
        }
        return source;
    }

    /**
     * Applies every row of {@code replacements} to {@code source}, in order.
     *
     * @param reverse when true each row is used backwards: column 1 is
     *            searched for and replaced by column 0
     */
    public static String replace(String source, String[][] replacements, boolean reverse) {
        if (!reverse) return replace(source, replacements);
        for (int i = 0; i < replacements.length; ++i) {
            source = replace(source, replacements[i][1], replacements[i][0]);
        }
        return source;
    }

    /**
     * Turns text into an anchor name: lowercases it (English rules), collapses
     * every run of characters that are neither letters nor digits into a
     * single "_", and drops one leading and one trailing "_".
     */
    public static String anchorize(String source) {
        String result = source.toLowerCase(Locale.ENGLISH).replaceAll("[^\\p{L}\\p{N}]+", "_");
        if (result.endsWith("_")) result = result.substring(0,result.length()-1);
        if (result.startsWith("_")) result = result.substring(1);
        return result;
    }

    /**
     * Builds the File for dir/filename. new File("", name) would resolve
     * against a system-dependent default directory rather than the current
     * one, so an empty (or null) dir means "filename on its own".
     */
    private static File toFile(String dir, String filename) {
        return dir == null || dir.length() == 0 ? new File(filename) : new File(dir, filename);
    }

    /** Logs prefix + canonical path when SHOW_FILES is on and a log is set. */
    private static void logFile(String prefix, File file) throws IOException {
        if (SHOW_FILES && log != null) {
            log.println(prefix
                    + file.getCanonicalPath());
        }
    }

    /** Closes a stream on a failure path without hiding the original exception. */
    private static void closeQuietly(java.io.Closeable stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            // The exception that triggered the cleanup is the one worth reporting.
        }
    }
}

View File

@ -1,556 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2002-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.VersionInfo;
/**
* Provides a general interface for Unicode Properties, and
* extracting sets based on those values.
* @author Davis
*/
public class ICUPropertyFactory extends UnicodeProperty.Factory {
static class ICUProperty extends UnicodeProperty {
// Property selector this instance answers for: a UProperty constant or one of
// the factory's extra pseudo-property numbers.
protected int propEnum = Integer.MIN_VALUE;
/**
 * Creates the property wrapper for propEnum under the name propName, deriving
 * its type from the enum value. Default_Ignorable_Code_Point, Bidi_Class and
 * General_Category are marked as not having uniform unassigned values; every
 * other property is marked uniform.
 */
protected ICUProperty(String propName, int propEnum) {
    setName(propName);
    this.propEnum = propEnum;
    setType(internalGetPropertyType(propEnum));
    switch (propEnum) {
    case UProperty.DEFAULT_IGNORABLE_CODE_POINT:
    case UProperty.BIDI_CLASS:
    case UProperty.GENERAL_CATEGORY:
        setUniformUnassigned(false);
        break;
    default:
        setUniformUnassigned(true);
        break;
    }
}
// Set once the "Fail:" diagnostic below has been printed, so it is printed
// at most once per property instance.
boolean shownException = false;
/**
 * Returns the value of this property for codePoint as a String.
 * <p>
 * String-valued properties and the extra pseudo-properties (isNFC ... isCased)
 * are handled by the switch. Anything else below UProperty.INT_LIMIT is looked
 * up as an integer property and converted to its long value name, falling back
 * to the decimal enum value when no name is available. Properties below
 * UProperty.DOUBLE_LIMIT return the numeric value as text, or null when the
 * code point has none. All remaining properties return null.
 */
public String _getValue(int codePoint) {
switch (propEnum) {
case UProperty.AGE:
return getAge(codePoint);
case UProperty.BIDI_MIRRORING_GLYPH:
return UTF16.valueOf(UCharacter.getMirror(codePoint));
case UProperty.CASE_FOLDING:
return UCharacter.foldCase(UTF16.valueOf(codePoint), true);
case UProperty.ISO_COMMENT:
return UCharacter.getISOComment(codePoint);
case UProperty.LOWERCASE_MAPPING:
return UCharacter.toLowerCase(Locale.ENGLISH, UTF16.valueOf(codePoint));
case UProperty.NAME:
return UCharacter.getName(codePoint);
case UProperty.SIMPLE_CASE_FOLDING:
return UTF16.valueOf(UCharacter.foldCase(codePoint, true));
case UProperty.SIMPLE_LOWERCASE_MAPPING:
return UTF16.valueOf(UCharacter.toLowerCase(codePoint));
case UProperty.SIMPLE_TITLECASE_MAPPING:
return UTF16.valueOf(UCharacter.toTitleCase(codePoint));
case UProperty.SIMPLE_UPPERCASE_MAPPING:
return UTF16.valueOf(UCharacter.toUpperCase(codePoint));
case UProperty.TITLECASE_MAPPING:
return UCharacter.toTitleCase(Locale.ENGLISH, UTF16.valueOf(codePoint), null);
case UProperty.UNICODE_1_NAME:
return UCharacter.getName1_0(codePoint);
case UProperty.UPPERCASE_MAPPING:
return UCharacter.toUpperCase(Locale.ENGLISH, UTF16.valueOf(codePoint));
// case NFC: return Normalizer.normalize(codePoint, Normalizer.NFC);
// case NFD: return Normalizer.normalize(codePoint, Normalizer.NFD);
// case NFKC: return Normalizer.normalize(codePoint, Normalizer.NFKC);
// case NFKD: return Normalizer.normalize(codePoint, Normalizer.NFKD);
// The "is..." pseudo-properties below yield "true"/"false": whether the
// code point is left unchanged by the named normalization or case mapping.
case isNFC:
return String.valueOf(Normalizer.normalize(codePoint, Normalizer.NFC).equals(UTF16.valueOf(codePoint)));
case isNFD:
return String.valueOf(Normalizer.normalize(codePoint, Normalizer.NFD).equals(UTF16.valueOf(codePoint)));
case isNFKC:
return String
.valueOf(Normalizer.normalize(codePoint, Normalizer.NFKC).equals(UTF16.valueOf(codePoint)));
case isNFKD:
return String
.valueOf(Normalizer.normalize(codePoint, Normalizer.NFKD).equals(UTF16.valueOf(codePoint)));
case isLowercase:
return String.valueOf(UCharacter.toLowerCase(Locale.ENGLISH, UTF16.valueOf(codePoint)).equals(
UTF16.valueOf(codePoint)));
case isUppercase:
return String.valueOf(UCharacter.toUpperCase(Locale.ENGLISH, UTF16.valueOf(codePoint)).equals(
UTF16.valueOf(codePoint)));
case isTitlecase:
return String.valueOf(UCharacter.toTitleCase(Locale.ENGLISH, UTF16.valueOf(codePoint), null).equals(
UTF16.valueOf(codePoint)));
case isCasefolded:
return String.valueOf(UCharacter.foldCase(UTF16.valueOf(codePoint), true).equals(
UTF16.valueOf(codePoint)));
// NOTE(review): the expression for isCased is identical to the one for
// isLowercase above, so both pseudo-properties always report the same
// value. Confirm whether that is intended.
case isCased:
return String.valueOf(UCharacter.toLowerCase(Locale.ENGLISH, UTF16.valueOf(codePoint)).equals(
UTF16.valueOf(codePoint)));
case UProperty.SCRIPT_EXTENSIONS:
return getStringScriptExtensions(codePoint);
}
if (propEnum < UProperty.INT_LIMIT) {
// enumValue stays -1 if the lookup itself throws; the method then returns "-1".
int enumValue = -1;
String value = null;
try {
enumValue = UCharacter.getIntPropertyValue(codePoint, propEnum);
if (enumValue >= 0)
value = fixedGetPropertyValueName(propEnum, enumValue, UProperty.NameChoice.LONG);
} catch (IllegalArgumentException e) {
// Best effort: report the first failure on stdout and keep going.
if (!shownException) {
System.out.println("Fail: " + getName() + ", " + Integer.toHexString(codePoint));
shownException = true;
}
}
return value != null ? value : String.valueOf(enumValue);
} else if (propEnum < UProperty.DOUBLE_LIMIT) {
// Every property in this range is answered with the Unicode numeric value.
double num = UCharacter.getUnicodeNumericValue(codePoint);
if (num == UCharacter.NO_NUMERIC_VALUE)
return null;
return Double.toString(num);
// TODO: Fix HACK -- API deficient
}
return null;
}
/**
 * Returns the age of codePoint as text: "unassigned" for version 0.0.0.0,
 * otherwise the version string with a trailing ".0.0" removed when present.
 */
private String getAge(int codePoint) {
    final String version = UCharacter.getAge(codePoint).toString();
    if ("0.0.0.0".equals(version)) {
        return "unassigned";
    }
    final String redundantTail = ".0.0";
    return version.endsWith(redundantTail)
            ? version.substring(0, version.length() - redundantTail.length())
            : version;
}
/**
 * Returns a value alias of this property for the requested name choice.
 * <p>
 * For properties at or above UProperty.STRING_START or UProperty.DOUBLE_START
 * the only alias is the placeholder "&lt;string&gt;" or "&lt;number&gt;", returned for the
 * LONG choice; other choices up to LONG yield null and larger ones throw.
 * For all other properties the value is identified by valueAlias when it is
 * given (and is not "&lt;integer&gt;"), otherwise by valueEnum.
 *
 * @param valueAlias null if unused.
 * @param valueEnum -1 if unused
 * @param nameChoice a UProperty.NameChoice value
 * @return the alias, "&lt;integer&gt;" when a LONG name is requested but neither a
 *         long nor a short name exists (null instead for combining-class
 *         properties), or null when no alias exists for the choice
 */
private String getFixedValueAlias(String valueAlias, int valueEnum, int nameChoice) {
    String placeholder = null;
    if (propEnum >= UProperty.STRING_START) {
        placeholder = "<string>";
    } else if (propEnum >= UProperty.DOUBLE_START) {
        placeholder = "<number>";
    }
    if (placeholder != null) {
        if (nameChoice > UProperty.NameChoice.LONG) {
            throw new IllegalArgumentException();
        }
        return nameChoice == UProperty.NameChoice.LONG ? placeholder : null;
    }

    int effectiveEnum = valueEnum;
    if (valueAlias != null && !valueAlias.equals("<integer>")) {
        effectiveEnum = fixedGetPropertyValueEnum(propEnum, valueAlias);
    }

    // These values are defined unevenly: a long name may be missing.
    String name = fixedGetPropertyValueName(propEnum, effectiveEnum, nameChoice);
    if (name != null) {
        return name;
    }
    if (nameChoice != UProperty.NameChoice.LONG) {
        return null;
    }
    // HACK: no long name, so fall back to the short one.
    name = fixedGetPropertyValueName(propEnum, effectiveEnum, UProperty.NameChoice.SHORT);
    if (name != null) {
        return name;
    }
    return isCombiningClassProperty() ? null : "<integer>";
}
/**
 * Tells whether this property is one of the three canonical combining class
 * properties (plain, lead or trail).
 */
public boolean isCombiningClassProperty() {
    switch (propEnum) {
    case UProperty.CANONICAL_COMBINING_CLASS:
    case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
    case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
        return true;
    default:
        return false;
    }
}
/**
 * Resolves valueAlias to its numeric value for propEnum. Any property number
 * below BINARY_LIMIT is looked up as UProperty.ALPHABETIC instead. If the ICU
 * lookup throws, valueAlias is parsed as a decimal integer.
 */
private static int fixedGetPropertyValueEnum(int propEnum, String valueAlias) {
    final int lookupProperty = propEnum < BINARY_LIMIT ? UProperty.ALPHABETIC : propEnum;
    try {
        return UCharacter.getPropertyValueEnum(lookupProperty, valueAlias);
    } catch (Exception e) {
        // Not a known alias: treat the text as the number itself.
        return Integer.parseInt(valueAlias);
    }
}
// Cache from the name ICU returns to its adjusted form; keyed by name only.
static Map fixSkeleton = new HashMap();
/**
 * Returns ICU's name for the given property value after adjustment:
 * Joining_Group names are lowercased (English locale) and every name is
 * passed through regularize(name, true). Adjusted names are cached in
 * fixSkeleton.
 */
private static String fixedGetPropertyValueName(int propEnum, int valueEnum, int nameChoice) {
    final String icuName = UCharacter.getPropertyValueName(propEnum, valueEnum, nameChoice);
    final String cached = (String) fixSkeleton.get(icuName);
    if (cached != null) {
        return cached;
    }
    String adjusted = icuName;
    if (propEnum == UProperty.JOINING_GROUP && adjusted != null) {
        adjusted = adjusted.toLowerCase(Locale.ENGLISH);
    }
    adjusted = regularize(adjusted, true);
    fixSkeleton.put(icuName, adjusted);
    return adjusted;
}
/**
 * Adds this property's names to result (a new list when result is null) and
 * returns it. An extra pseudo-property contributes its single name from
 * Binary_Extras; any other property contributes its short and long names.
 */
public List _getNameAliases(List result) {
    final List aliases = (result != null) ? result : new ArrayList();
    final String extraName = Binary_Extras.get(propEnum);
    if (extraName == null) {
        addUnique(getFixedPropertyName(propEnum, UProperty.NameChoice.SHORT), aliases);
        addUnique(getFixedPropertyName(propEnum, UProperty.NameChoice.LONG), aliases);
    } else {
        addUnique(extraName, aliases);
    }
    return aliases;
}
/**
 * Returns ICU's name for the property propName, or null when ICU rejects the
 * property/nameChoice combination.
 *
 * @param propName the property number to look up (previously this argument
 *        was ignored and the instance's own propEnum was always used)
 * @param nameChoice a UProperty.NameChoice value
 * @return the property name, or null if ICU throws IllegalArgumentException
 */
public String getFixedPropertyName(int propName, int nameChoice) {
    try {
        return UCharacter.getPropertyName(propName, nameChoice);
    } catch (IllegalArgumentException e) {
        // Unknown property or name choice: report "no name" instead of failing.
        return null;
    }
}
// Maps a combining-class value to its number as text: keyed by ICU's long
// alias when one exists, otherwise by the number itself.
private static Map cccHack = new HashMap();
// The numbers (as text) of combining-class values that have no long alias.
private static Set cccExtras = new HashSet();
static {
    int cccValue = 0;
    while (cccValue <= 255) {
        final String number = String.valueOf(cccValue);
        final String longAlias = UCharacter.getPropertyValueName(
                UProperty.CANONICAL_COMBINING_CLASS, cccValue, UProperty.NameChoice.LONG);
        if (longAlias == null) {
            cccHack.put(number, number);
            cccExtras.add(number);
        } else {
            cccHack.put(longAlias, number);
        }
        ++cccValue;
    }
}
/**
 * Adds the values this property can take to result (a new list when result is
 * null) and returns it.
 * <ul>
 * <li>Age: the distinct age strings from getAges().</li>
 * <li>Properties below UProperty.INT_LIMIT: one alias per integer value
 * between ICU's minimum and maximum, preferring the long alias, then the short
 * one, then (combining-class properties only) the number itself.</li>
 * <li>Properties in the double range: the values of this property's UnicodeMap.</li>
 * <li>Everything else: the single alias getFixedValueAlias returns for -1.</li>
 * </ul>
 */
public List _getAvailableValues(List result) {
if (result == null)
result = new ArrayList();
if (propEnum == UProperty.AGE) {
addAllUnique(getAges(), result);
return result;
}
if (propEnum < UProperty.INT_LIMIT) {
// NOTE(review): this overwrites the instance field, and nothing in this
// method restores it. getFixedValueAlias (called below) reads the field,
// which is presumably why the HACK is written this way, but after this
// call an extra pseudo-property keeps propEnum == BINARY_START. Confirm
// that later _getValue calls are not affected.
if (Binary_Extras.isInRange(propEnum)) {
propEnum = UProperty.BINARY_START; // HACK
}
int start = UCharacter.getIntPropertyMinValue(propEnum);
int end = UCharacter.getIntPropertyMaxValue(propEnum);
for (int i = start; i <= end; ++i) {
String alias = getFixedValueAlias(null, i, UProperty.NameChoice.LONG);
String alias2 = getFixedValueAlias(null, i, UProperty.NameChoice.SHORT);
// Fall back from long alias to short alias to the bare number.
if (alias == null) {
alias = alias2;
if (alias == null && isCombiningClassProperty()) {
alias = String.valueOf(i);
}
}
// System.out.println(propertyAlias + "\t" + i + ":\t" + alias);
addUnique(alias, result);
}
} else if (propEnum >= UProperty.DOUBLE_START && propEnum < UProperty.DOUBLE_LIMIT) {
UnicodeMap map = getUnicodeMap();
Collection values = map.values();
addAllUnique(values, result);
} else {
String alias = getFixedValueAlias(null, -1, UProperty.NameChoice.LONG);
addUnique(alias, result);
}
return result;
}
// Lazily built, sorted list of the distinct age strings; see getAges().
static String[] AGES = null;
/**
 * Returns the distinct values of getAge(int) over every code point from
 * U+0000 through U+10FFFF inclusive, in TreeSet (natural String) order.
 * The array is computed on first use and then reused.
 */
private String[] getAges() {
    if (AGES == null) {
        Set ages = new TreeSet();
        // Inclusive upper bound: U+10FFFF is the last code point and was
        // previously skipped by a "<" comparison.
        for (int i = 0; i <= 0x10FFFF; ++i) {
            ages.add(getAge(i));
        }
        AGES = (String[]) ages.toArray(new String[ages.size()]);
    }
    return AGES;
}
/**
 * Adds the aliases of the value valueAlias to result (a new list when result
 * is null) and returns it.
 * <p>
 * Age values have only themselves as alias. Combining-class properties first
 * contribute the number recorded in cccHack. Numeric properties then add the
 * value itself plus, for values ending in ".0", the value without that suffix.
 * All other properties add the alias for each name choice starting at SHORT,
 * stopping at the first name choice for which the lookup throws.
 */
public List _getValueAliases(String valueAlias, List result) {
    final List aliases = (result == null) ? new ArrayList() : result;
    if (propEnum == UProperty.AGE) {
        addUnique(valueAlias, aliases);
        return aliases;
    }
    if (isCombiningClassProperty()) {
        addUnique(cccHack.get(valueAlias), aliases); // add number
    }
    final int type = getType();
    final boolean numeric = type == UnicodeProperty.NUMERIC || type == EXTENDED_NUMERIC;
    if (!numeric) {
        int nameChoice = UProperty.NameChoice.SHORT;
        while (true) {
            try {
                addUnique(getFixedValueAlias(valueAlias, -1, nameChoice), aliases);
            } catch (Exception e) {
                // The lookup throwing is what ends the enumeration of name choices.
                break;
            }
            ++nameChoice;
        }
        return aliases;
    }
    addUnique(valueAlias, aliases);
    if (valueAlias.endsWith(".0")) {
        addUnique(valueAlias.substring(0, valueAlias.length() - 2), aliases);
    }
    return aliases;
}
/**
 * Maps a property number to a UnicodeProperty type constant. A handful of
 * properties are classified explicitly (CATALOG, MISC, EXTENDED_STRING);
 * every other property is classified by the UProperty range it falls in,
 * with the gaps between ranges reported as the EXTENDED_* variant of the
 * range below them.
 */
private int internalGetPropertyType(int prop) {
    switch (prop) {
    case UProperty.AGE:
    case UProperty.BLOCK:
    case UProperty.SCRIPT:
        return UnicodeProperty.CATALOG;
    case UProperty.ISO_COMMENT:
    case UProperty.NAME:
    case UProperty.UNICODE_1_NAME:
    case UProperty.SCRIPT_EXTENSIONS:
        return UnicodeProperty.MISC;
    case UProperty.BIDI_MIRRORING_GLYPH:
    case UProperty.CASE_FOLDING:
    case UProperty.LOWERCASE_MAPPING:
    case UProperty.SIMPLE_CASE_FOLDING:
    case UProperty.SIMPLE_LOWERCASE_MAPPING:
    case UProperty.SIMPLE_TITLECASE_MAPPING:
    case UProperty.SIMPLE_UPPERCASE_MAPPING:
    case UProperty.TITLECASE_MAPPING:
    case UProperty.UPPERCASE_MAPPING:
        return UnicodeProperty.EXTENDED_STRING;
    default:
        break;
    }
    // Each row is {exclusive upper bound, type for properties below that bound};
    // rows are tested in order, so the first bound above prop decides.
    final int[][] boundaries = {
        {UProperty.BINARY_START, UnicodeProperty.UNKNOWN},
        {UProperty.BINARY_LIMIT, UnicodeProperty.BINARY},
        {UProperty.INT_START, UnicodeProperty.EXTENDED_BINARY},
        {UProperty.INT_LIMIT, UnicodeProperty.ENUMERATED},
        {UProperty.DOUBLE_START, UnicodeProperty.EXTENDED_ENUMERATED},
        {UProperty.DOUBLE_LIMIT, UnicodeProperty.NUMERIC},
        {UProperty.STRING_START, UnicodeProperty.EXTENDED_NUMERIC},
        {UProperty.STRING_LIMIT, UnicodeProperty.STRING},
    };
    for (int row = 0; row < boundaries.length; ++row) {
        if (prop < boundaries[row][0]) {
            return boundaries[row][1];
        }
    }
    return UnicodeProperty.EXTENDED_STRING;
}
/**
 * Returns the version string of the ICU library in use
 * (VersionInfo.ICU_VERSION).
 */
public String _getVersion() {
    final VersionInfo icuVersion = VersionInfo.ICU_VERSION;
    return icuVersion.toString();
}
}
/*{
matchIterator = new UnicodeSetIterator(
new UnicodeSet("[^[:Cn:]-[:Default_Ignorable_Code_Point:]]"));
}*/
/*
* Other Missing Functions:
Expands_On_NFC
Expands_On_NFD
Expands_On_NFKC
Expands_On_NFKD
Composition_Exclusion
Decomposition_Mapping
FC_NFKC_Closure
ISO_Comment
NFC_Quick_Check
NFD_Quick_Check
NFKC_Quick_Check
NFKD_Quick_Check
Special_Case_Condition
Unicode_Radical_Stroke
*/
/**
 * Names of the extra pseudo-binary properties offered by this factory.
 * The name at index i is numbered UProperty.BINARY_LIMIT + i, which must
 * agree with the constants declared below.
 */
static final Names Binary_Extras = new Names(
    UProperty.BINARY_LIMIT,
    new String[] {
        "isNFC", "isNFD", "isNFKC", "isNFKD",
        "isLowercase", "isUppercase", "isTitlecase", "isCasefolded", "isCased",
    });
// A matching String_Extras table ("toNFC", "toNFD", "toNFKC", "toNKFD", based
// at UProperty.STRING_LIMIT with constants NFC, NFD, NFKC, NFKD) is disabled.

// Property numbers of the extras, in the same order as the names above.
static final int isNFC = UProperty.BINARY_LIMIT;
static final int isNFD = UProperty.BINARY_LIMIT + 1;
static final int isNFKC = UProperty.BINARY_LIMIT + 2;
static final int isNFKD = UProperty.BINARY_LIMIT + 3;
static final int isLowercase = UProperty.BINARY_LIMIT + 4;
static final int isUppercase = UProperty.BINARY_LIMIT + 5;
static final int isTitlecase = UProperty.BINARY_LIMIT + 6;
static final int isCasefolded = UProperty.BINARY_LIMIT + 7;
static final int isCased = UProperty.BINARY_LIMIT + 8;
// First number after the extras.
static final int BINARY_LIMIT = UProperty.BINARY_LIMIT + 9;
/**
 * Builds the factory by creating and registering one property for every
 * alias reported by getInternalAvailablePropertyAliases.
 */
protected ICUPropertyFactory() {
    final Collection aliases = getInternalAvailablePropertyAliases(new ArrayList());
    for (Iterator cursor = aliases.iterator(); cursor.hasNext();) {
        final String alias = (String) cursor.next();
        add(getInternalProperty(alias));
    }
}
// Scratch bit set shared by all calls; access is serialized by the
// synchronized method below.
static BitSet BITSET = new BitSet();
/**
 * Returns the Script_Extensions value of codePoint as text. When
 * UScript.getScriptExtensions reports a non-negative result, that script's
 * name is returned; otherwise the names of all scripts set in the bit set
 * are returned, ordered by their short names and separated by single spaces.
 */
public static synchronized String getStringScriptExtensions(int codePoint) {
    final int singleScript = UScript.getScriptExtensions(codePoint, BITSET);
    if (singleScript >= 0) {
        return UScript.getName(singleScript);
    }
    // Keyed by short name so that iteration order follows the short form.
    final TreeMap<String,String> namesByShortName = new TreeMap<String,String>();
    int scriptCode = BITSET.nextSetBit(0);
    while (scriptCode >= 0) {
        namesByShortName.put(UScript.getShortName(scriptCode), UScript.getName(scriptCode));
        scriptCode = BITSET.nextSetBit(scriptCode + 1);
    }
    return CollectionUtilities.join(namesByShortName.values(), " ");
}
// The one shared factory instance, created on first request.
private static ICUPropertyFactory singleton = null;
/**
 * Returns the shared ICUPropertyFactory, creating it on the first call.
 */
public static synchronized ICUPropertyFactory make() {
    if (singleton == null) {
        singleton = new ICUPropertyFactory();
    }
    return singleton;
}
/**
 * Appends to result the long name of every ICU property in the binary, int,
 * double, string and "other" ranges, followed by the names of the extra
 * pseudo-properties, and returns result.
 */
public List getInternalAvailablePropertyAliases(List result) {
    final int[] rangeStarts = {
        UProperty.BINARY_START,
        UProperty.INT_START,
        UProperty.DOUBLE_START,
        UProperty.STRING_START,
        UProperty.OTHER_PROPERTY_START,
    };
    final int[] rangeLimits = {
        UProperty.BINARY_LIMIT,
        UProperty.INT_LIMIT,
        UProperty.DOUBLE_LIMIT,
        UProperty.STRING_LIMIT,
        UProperty.OTHER_PROPERTY_LIMIT,
    };
    for (int range = 0; range < rangeStarts.length; ++range) {
        for (int prop = rangeStarts[range]; prop < rangeLimits[range]; ++prop) {
            final String longName = UCharacter.getPropertyName(prop, UProperty.NameChoice.LONG);
            UnicodeProperty.addUnique(longName, result);
            // NOTE(review): looks redundant after addUnique; kept as in the original.
            if (!result.contains(longName)) {
                result.add(longName);
            }
        }
    }
    // result.addAll(String_Extras.getNames());
    result.addAll(Binary_Extras.getNames());
    return result;
}
/**
 * Creates the property object for propertyAlias. The alias is first looked
 * up among the extra pseudo-properties; if it is not one of them, ICU's
 * UCharacter.getPropertyEnum supplies the property number.
 */
public UnicodeProperty getInternalProperty(String propertyAlias) {
    final int extraNumber = Binary_Extras.get(propertyAlias);
    // (A disabled String_Extras lookup used to sit between these two steps.)
    final int resolvedEnum = (extraNumber >= 0)
            ? extraNumber
            : UCharacter.getPropertyEnum(propertyAlias);
    return new ICUProperty(propertyAlias, resolvedEnum);
}
/*
* (non-Javadoc)
*
* @see com.ibm.icu.dev.test.util.UnicodePropertySource#getProperty(java.lang.String)
*/
// TODO file bug on getPropertyValueName for Canonical_Combining_Class
/**
 * A table of names assigned consecutive numbers starting at a base value:
 * the name at index i has the number base + i.
 */
public static class Names {
    private String[] names;
    private int base;

    /**
     * @param base the number of the first name
     * @param names the names, in number order; the array is used as given
     */
    public Names(int base, String[] names) {
        this.base = base;
        this.names = names;
    }

    /**
     * Returns the number of the first name equal to name ignoring case,
     * or -1 when there is none.
     */
    public int get(String name) {
        int index = 0;
        for (String candidate : names) {
            if (name.equalsIgnoreCase(candidate)) {
                return base + index;
            }
            ++index;
        }
        return -1;
    }

    /** Returns the name with the given number, or null when out of range. */
    public String get(int number) {
        final int index = number - base;
        final boolean inside = index >= 0 && index < names.length;
        return inside ? names[index] : null;
    }

    /** Tells whether number belongs to one of the names. */
    public boolean isInRange(int number) {
        final int index = number - base;
        return !(index < 0 || index >= names.length);
    }

    /** Returns the names as a list view of the underlying array. */
    public List getNames() {
        return Arrays.asList(names);
    }
}
}

View File

@ -1,149 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2002-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.util;
import java.io.BufferedReader;
import java.io.IOException;
import com.ibm.icu.text.Transliterator;
public class TransliteratorUtilities {
// When true, each successful registration is reported on stdout.
public static boolean DEBUG = false;
/**
 * Reads transliteration rules from a file in dir and registers them in both
 * directions. The file name is id with '-' replaced by '_' plus ".txt".
 * An id of the form "A-B" registers the forward rules as "A-B" and the
 * reverse rules as "B-A"; an id without '-' registers "Any-id" and "id-Any".
 * Any existing registration under either ID is removed first.
 *
 * @throws IllegalArgumentException if the rules file cannot be read
 */
public static void registerTransliteratorFromFile(String dir, String id) {
    try {
        final String filename = id.replace('-', '_') + ".txt";
        final String rules = getFileContents(dir, filename);
        final int dash = id.indexOf('-');
        final String reverseId;
        if (dash >= 0) {
            reverseId = id.substring(dash+1) + "-" + id.substring(0, dash);
        } else {
            reverseId = id + "-Any";
            id = "Any-" + id;
        }

        final Transliterator forward = Transliterator.createFromRules(id, rules, Transliterator.FORWARD);
        Transliterator.unregister(id);
        Transliterator.registerInstance(forward);

        final Transliterator backward = Transliterator.createFromRules(reverseId, rules, Transliterator.REVERSE);
        Transliterator.unregister(reverseId);
        Transliterator.registerInstance(backward);

        if (DEBUG) {
            System.out.println("Registered new Transliterator: " + id + ", " + reverseId);
        }
    } catch (IOException e) {
//#if defined(FOUNDATION10) || defined(J2SE13)
//## throw (IllegalArgumentException) new IllegalArgumentException("Can't open " + dir + ", " + id+" "+ e.getMessage());
//#else
        throw (IllegalArgumentException) new IllegalArgumentException("Can't open " + dir + ", " + id).initCause(e);
//#endif
    }
}
/**
 * Reads the whole file dir/filename through FileUtilities.openUTF8Reader and
 * returns its contents with every line terminated by "\r\n". A U+FEFF
 * character at the start of a line is dropped.
 *
 * @param dir directory passed to the reader factory
 * @param filename file name passed to the reader factory
 * @return the file contents
 * @throws IOException if the file cannot be opened or read; the reader is
 *         closed in either case
 */
public static String getFileContents(String dir, String filename) throws IOException {
//#if defined(FOUNDATION10) || defined(J2SE13)
//## BufferedReader br = TestUtil.openUTF8Reader(dir, filename);
//#else
    BufferedReader br = FileUtilities.openUTF8Reader(dir, filename);
//#endif
    // try/finally so the reader is released even when readLine() fails.
    try {
        StringBuffer buffer = new StringBuffer();
        while (true) {
            String line = br.readLine();
            if (line == null) break;
            if (line.length() > 0 && line.charAt(0) == '\uFEFF') line = line.substring(1);
            buffer.append(line).append("\r\n");
        }
        return buffer.toString();
    } finally {
        br.close();
    }
}
// Rules shared by all escapers below. Rules written with ">" map a literal
// character to its entity; rules written with "<" map an entity name (matched
// case-insensitively, letter by letter) back to the character. Only '<' and
// '&' have a ">" rule here.
// NOTE(review): the two leading "::" lines are presumably reverse-only
// hex-unescaping steps; confirm against the ICU rule syntax.
private static final String BASE_RULES =
":: (hex-any/xml);" +
":: (hex-any/xml10);" +
"'<' > '&lt;' ;" +
"'<' < '&'[lL][Tt]';' ;" +
"'&' > '&amp;' ;" +
"'&' < '&'[aA][mM][pP]';' ;" +
"'>' < '&'[gG][tT]';' ;" +
"'\"' < '&'[qQ][uU][oO][tT]';' ; " +
"'' < '&'[aA][pP][oO][sS]';' ; ";
// Adds the '>' to "&gt;" rule on top of the base rules.
private static final String CONTENT_RULES =
"'>' > '&gt;' ;";
// Base + content rules, plus the '"' to "&quot;" rule.
private static final String HTML_RULES = BASE_RULES + CONTENT_RULES +
"'\"' > '&quot;' ; ";
// HTML rules followed by a hex/unicode step restricted to the listed set
// (control, separator, whitespace and default-ignorable characters).
private static final String HTML_RULES_CONTROLS = HTML_RULES +
":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";
// HTML rules followed by an any-hex/xml step restricted to control and
// non-ASCII characters.
private static final String HTML_RULES_ASCII = HTML_RULES +
":: [[:C:][:^ASCII:]] any-hex/xml ; ";
// HTML rules plus the rule that produces "&apos;".
private static final String XML_RULES = HTML_RULES +
"'' > '&apos;' ; "
;
/*
Background for the rules above (this text appears to be quoted from the XML
specification):
The ampersand character (&) and the left angle bracket (<) MUST NOT appear
in their literal form, except when used as markup delimiters, or within a
comment, a processing instruction, or a CDATA section. If they are needed
elsewhere, they MUST be escaped using either numeric character references or
the strings "&amp;" and "&lt;" respectively. The right angle bracket (>) MAY
be represented using the string "&gt;", and MUST, for compatibility, be
escaped using either "&gt;" or a character reference when it appears in the string
"]]>" in content, when that string is not marking the end of a CDATA section.
In the content of elements, character data is any string of characters which does
not contain the start-delimiter of any markup and does not include the
CDATA-section-close delimiter, "]]>". In a CDATA section, character data is
any string of characters not including the CDATA-section-close delimiter,
"]]>".
To allow attribute values to contain both single and double quotes, the
apostrophe or single-quote character (') MAY be represented as "&apos;", and
the double-quote character (") as "&quot;".
*/
// Ready-made transliterators built from the rule sets above; "to..." use the
// rules in the FORWARD direction and "from..." in REVERSE.
// NOTE(review): toHTML, toHTMLControl and toHTMLAscii are all created with
// the same ID "any-html"; confirm that sharing the ID is intended.
public static final Transliterator toXML = Transliterator.createFromRules(
"any-xml", XML_RULES, Transliterator.FORWARD);
public static final Transliterator fromXML = Transliterator.createFromRules(
"xml-any", XML_RULES, Transliterator.REVERSE);
public static final Transliterator toHTML = Transliterator.createFromRules(
"any-html", HTML_RULES, Transliterator.FORWARD);
public static final Transliterator toHTMLControl = Transliterator.createFromRules(
"any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);
public static final Transliterator toHTMLAscii = Transliterator.createFromRules(
"any-html", HTML_RULES_ASCII, Transliterator.FORWARD);
public static final Transliterator fromHTML = Transliterator.createFromRules(
"html-any", HTML_RULES, Transliterator.REVERSE);
}

View File

@ -1,248 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2012, Google, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.util;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import com.ibm.icu.dev.util.UnicodeProperty.PatternMatcher;
import com.ibm.icu.impl.UnicodeRegex;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Allows for overriding the parsing of UnicodeSet property patterns.
* <p>
* WARNING: If this UnicodePropertySymbolTable is used with {@code UnicodeSet.setDefaultXSymbolTable}, and the
* Unassigned characters (gc=Cn) are different from those in ICU, you MUST call
* {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
* with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
*
* @author markdavis
*/
public class UnicodePropertySymbolTable extends UnicodeSet.XSymbolTable {
// Regex preprocessor configured to resolve properties through this table.
UnicodeRegex unicodeRegex;
// Source of the properties this table resolves names against.
final UnicodeProperty.Factory factory;
/**
 * Creates a symbol table that resolves property names through factory.
 */
public UnicodePropertySymbolTable(UnicodeProperty.Factory factory) {
    final UnicodeRegex regex = new UnicodeRegex();
    unicodeRegex = regex.setSymbolTable(this);
    this.factory = factory;
}
// public boolean applyPropertyAlias0(String propertyName,
// String propertyValue, UnicodeSet result) {
// if (!propertyName.contains("*")) {
// return applyPropertyAlias(propertyName, propertyValue, result);
// }
// String[] propertyNames = propertyName.split("[*]");
// for (int i = propertyNames.length - 1; i >= 0; ++i) {
// String pname = propertyNames[i];
//
// }
// return null;
// }
/**
 * Resolves a property expression into result.
 * <p>
 * propertyName may embed an operator: text after the first ':' or U+2260
 * (not-equal sign) is moved into the value, joined to any existing value
 * with "=", and U+2260 coming first inverts the match. A trailing '!' on the
 * name toggles inversion. When the trimmed value is empty, the name is tried
 * in turn as a General_Category value, as a Script value, as a property with
 * value "Yes", and finally as a property with an empty value; failures of
 * the first three attempts are ignored.
 *
 * @return true if the expression was resolved; result is complemented when
 *         the match is inverted
 */
public boolean applyPropertyAlias(String propertyName,
        String propertyValue, UnicodeSet result) {
    boolean invert = false;
    final int notEqualAt = propertyName.indexOf('\u2260');
    final int colonAt = propertyName.indexOf(':');
    if (notEqualAt >= 0 || colonAt >= 0) {
        final int nameLength = propertyName.length();
        final int notEqualPos = notEqualAt < 0 ? nameLength : notEqualAt;
        final int colonPos = colonAt < 0 ? nameLength : colonAt;
        final boolean notEqualFirst = notEqualPos < colonPos;
        final int opPos = notEqualFirst ? notEqualPos : colonPos;
        if (propertyValue.length() == 0) {
            propertyValue = propertyName.substring(opPos+1);
        } else {
            propertyValue = propertyName.substring(opPos+1) + "=" + propertyValue;
        }
        propertyName = propertyName.substring(0,opPos);
        invert = notEqualFirst;
    }
    if (propertyName.endsWith("!")) {
        propertyName = propertyName.substring(0, propertyName.length() - 1);
        invert = !invert;
    }
    propertyValue = propertyValue.trim();

    final boolean status;
    if (propertyValue.length() != 0) {
        status = applyPropertyAlias0(propertyName, propertyValue, result);
    } else {
        // Each later attempt runs only if all earlier ones failed.
        status = applyIgnoringErrors("gc", propertyName, result)
                || applyIgnoringErrors("sc", propertyName, result)
                || applyIgnoringErrors(propertyName, "Yes", result)
                || applyPropertyAlias0(propertyName, "", result);
    }
    if (status && invert) {
        result.complement();
    }
    return status;
}

// Calls applyPropertyAlias0, treating any exception as "did not match".
private boolean applyIgnoringErrors(String name, String value, UnicodeSet result) {
    try {
        return applyPropertyAlias0(name, value, result);
    } catch (Exception e) {
        return false;
    }
}
/**
 * Expands the grouping General_Category values (for example "l"/"letter")
 * into the two-letter categories they stand for. Keys are lowercase with no
 * separators; short and long spellings of a group share the same array.
 */
static final HashMap<String,String[]> GC_REMAP = new HashMap<String,String[]>();
// Static initializer: the map is filled exactly once at class initialization,
// rather than being refilled by every constructed instance.
static {
    GC_REMAP.put("c", "Cc Cf Cn Co Cs".split(" "));
    GC_REMAP.put("other", GC_REMAP.get("c"));
    GC_REMAP.put("l", "Ll Lm Lo Lt Lu".split(" "));
    GC_REMAP.put("letter", GC_REMAP.get("l"));
    GC_REMAP.put("lc", "Ll Lt Lu".split(" "));
    GC_REMAP.put("casedletter", GC_REMAP.get("lc"));
    GC_REMAP.put("m", "Mc Me Mn".split(" "));
    GC_REMAP.put("mark", GC_REMAP.get("m"));
    GC_REMAP.put("n", "Nd Nl No".split(" "));
    GC_REMAP.put("number", GC_REMAP.get("n"));
    GC_REMAP.put("p", "Pc Pd Pe Pf Pi Po Ps".split(" "));
    GC_REMAP.put("punctuation", GC_REMAP.get("p"));
    GC_REMAP.put("punct", GC_REMAP.get("p"));
    GC_REMAP.put("s", "Sc Sk Sm So".split(" "));
    GC_REMAP.put("symbol", GC_REMAP.get("s"));
    GC_REMAP.put("z", "Zl Zp Zs".split(" "));
    GC_REMAP.put("separator", GC_REMAP.get("z"));
}
/**
 * Replaces the contents of result with the code points for which the property
 * propertyName matches propertyValue.
 * <p>
 * propertyValue is interpreted as follows:
 * <ul>
 * <li>for General_Category, a grouping value listed in GC_REMAP expands to
 * the union of its member categories;</li>
 * <li>{@code /regex/} matches property values against the regular expression;</li>
 * <li>{@code @cp@} selects code points whose value equals the code point itself;</li>
 * <li>{@code @other@} selects code points whose value equals their value for
 * the property named other;</li>
 * <li>anything else must be a valid value of the property.</li>
 * </ul>
 * For Age, a plain value is matched through ComparisonMatcher with
 * Relation.geq, and a regex match on a value also includes every available
 * value that does not sort after it.
 *
 * @return true when result has been filled
 * @throws IllegalArgumentException if the factory has no property named
 *         propertyName, or if a plain value is not valid for the property
 */
public boolean applyPropertyAlias0(String propertyName,
        String propertyValue, UnicodeSet result) {
    result.clear();
    UnicodeProperty prop = factory.getProperty(propertyName);
    // Check before the first dereference: the original called prop.getName()
    // ahead of its null test, so an unknown property surfaced as a
    // NullPointerException and this message could never be produced.
    if (prop == null) {
        throw new IllegalArgumentException("Illegal property: " + propertyName);
    }
    String canonicalName = prop.getName();
    boolean isAge = UnicodeProperty.equalNames("Age", canonicalName);

    // Hack for special GC values
    if (canonicalName.equals("General_Category")) {
        String[] parts = GC_REMAP.get(UnicodeProperty.toSkeleton(propertyValue));
        if (parts != null) {
            for (String part : parts) {
                prop.getSet(part, result);
            }
            return true;
        }
    }

    // "/.../" : regular expression over the property's values.
    PatternMatcher patternMatcher = null;
    if (propertyValue.length() > 1 && propertyValue.startsWith("/") && propertyValue.endsWith("/")) {
        String fixedRegex = unicodeRegex.transform(propertyValue.substring(1, propertyValue.length() - 1));
        patternMatcher = new UnicodeProperty.RegexMatcher().set(fixedRegex);
    }

    // "@...@" : compare against another property, or against the code point itself.
    UnicodeProperty otherProperty = null;
    boolean testCp = false;
    if (propertyValue.length() > 1 && propertyValue.startsWith("@") && propertyValue.endsWith("@")) {
        String otherPropName = propertyValue.substring(1, propertyValue.length() - 1).trim();
        if ("cp".equalsIgnoreCase(otherPropName)) {
            testCp = true;
        } else {
            otherProperty = factory.getProperty(otherPropName);
        }
    }

    UnicodeSet set;
    if (testCp) {
        set = new UnicodeSet();
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if (UnicodeProperty.equals(i, prop.getValue(i))) {
                set.add(i);
            }
        }
    } else if (otherProperty != null) {
        set = new UnicodeSet();
        for (int i = 0; i <= 0x10FFFF; ++i) {
            String v1 = prop.getValue(i);
            String v2 = otherProperty.getValue(i);
            if (UnicodeProperty.equals(v1, v2)) {
                set.add(i);
            }
        }
    } else if (patternMatcher == null) {
        if (!isValid(prop, propertyValue)) {
            throw new IllegalArgumentException("The value '" + propertyValue + "' is illegal. Values for " + propertyName
                    + " must be in "
                    + prop.getAvailableValues() + " or in " + prop.getValueAliases());
        }
        if (isAge) {
            set = prop.getSet(new ComparisonMatcher(propertyValue, Relation.geq));
        } else {
            set = prop.getSet(propertyValue);
        }
    } else if (isAge) {
        set = new UnicodeSet();
        List<String> values = prop.getAvailableValues();
        for (String value : values) {
            if (patternMatcher.matches(value)) {
                // NOTE(review): values are compared as plain strings here;
                // confirm this orders multi-digit versions as intended.
                for (String other : values) {
                    if (other.compareTo(value) <= 0) {
                        set.addAll(prop.getSet(other));
                    }
                }
            }
        }
    } else {
        set = prop.getSet(patternMatcher);
    }
    result.addAll(set);
    return true;
}
/**
 * Tells whether propertyValue is accepted by prop, as decided by
 * prop.isValidValue. (A special case for General_Category was sketched here
 * but never implemented.)
 */
private boolean isValid(UnicodeProperty prop, String propertyValue) {
    final boolean accepted = prop.isValidValue(propertyValue);
    return accepted;
}
public enum Relation {less, leq, equal, geq, greater}
/**
 * A PatternMatcher that accepts a value when comparing the stored pattern to
 * the value (pattern on the left) satisfies the configured Relation.
 */
public static class ComparisonMatcher implements PatternMatcher {
    Relation relation;
    // Shared comparator used for every pattern/value comparison.
    static Comparator comparator = new UTF16.StringComparator(true, false,0);
    String pattern;

    /**
     * @param pattern the left-hand side of every comparison
     * @param comparator the relation that must hold between pattern and value
     */
    public ComparisonMatcher(String pattern, Relation comparator) {
        this.pattern = pattern;
        this.relation = comparator;
    }

    /**
     * Compares the pattern with value.toString() and reports whether the
     * result satisfies the relation; "equal" is handled by the default branch.
     */
    public boolean matches(Object value) {
        final int order = comparator.compare(pattern, value.toString());
        switch (relation) {
        case less:
            return order < 0;
        case leq:
            return order <= 0;
        case geq:
            return order >= 0;
        case greater:
            return order > 0;
        default:
            return order == 0;
        }
    }

    /** Replaces the stored pattern and returns this matcher. */
    public PatternMatcher set(String pattern) {
        this.pattern = pattern;
        return this;
    }
}
}