ICU-0 fixes for generation of Unicode 4.1.0 properties

X-SVN-Rev: 16858
This commit is contained in:
Mark Davis 2004-11-13 23:10:32 +00:00
parent 4c8340b33f
commit bd1094eaca
16 changed files with 1415 additions and 543 deletions

View File

@ -0,0 +1,27 @@
/*
*******************************************************************************
* Copyright (C) 2002-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.util;
/**
 * Strategy for deciding whether two objects are equal and for producing
 * hash codes that are consistent with that notion of equality.
 */
public interface Equator {
/**
* Equality function. Implementations must handle null arguments
* and must be able to compare any two objects that could be compared.
* Must obey the normal rules of symmetry: a=b => b=a
* and transitivity: a=b & b=c => a=c
* @param a first object to compare; may be null
* @param b second object to compare; may be null
* @return true if a and b are equal
*/
public boolean isEqual(Object a, Object b);
/**
* Must be consistent with isEqual: a=b => getHashCode(a)=getHashCode(b)
* @param object the object to hash
* @return a hash code for the object
*/
public int getHashCode(Object object);
}

View File

@ -0,0 +1,247 @@
/*
*******************************************************************************
* Copyright (C) 2002-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.ListIterator;
import java.util.Set;
import java.util.List;
import java.util.TreeSet;
/**
 * A list with unique items. It does not permit multiple items to be added, and does not support (at
 * least for now) adding elements at a position. (Support may be added later). Also should add support
 * for Equator.
 * <p>
 * Elements are kept twice: in a list (insertion order) and in a TreeSet built on the supplied
 * comparator (uniqueness and membership). Two elements are duplicates when the comparator
 * returns 0 for them.
 * <p>
 * Limitation: iterators and list iterators are handed out directly from the backing list, so
 * removing or inserting through them is not reflected in the backing set.
 * @author davis
 */
public class ListSet implements Set, List {
    /** Elements in insertion order. */
    List list = new ArrayList();
    /** The same elements, used for uniqueness and membership tests. */
    Set set;
    /** Decides which elements count as duplicates. */
    Comparator comparator;

    /**
     * @param comparator ordering used to detect duplicates
     */
    ListSet(Comparator comparator) {
        this.comparator = comparator;
        set = new TreeSet(comparator);
    }

    /**
     * Positional insertion is not supported.
     * @param index
     * @param element
     * @throws UnsupportedOperationException always
     */
    public void add(int index, Object element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Appends the element unless an equivalent one is already present.
     * @param o element to add
     * @return true if the element was added
     */
    public boolean add(Object o) {
        boolean result = set.add(o);
        if (result) list.add(o);
        return result;
    }

    /**
     * Positional insertion is not supported.
     * @param index
     * @param c
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public boolean addAll(int index, Collection c) {
        throw new UnsupportedOperationException();
    }

    /**
     * Appends every element of c that is not already present, in iteration order.
     * @param c elements to add
     * @return true if at least one element was added
     */
    public boolean addAll(Collection c) {
        boolean changed = false;
        for (Iterator it = c.iterator(); it.hasNext();) {
            // Always consume the next element; a short-circuiting "||" here
            // would stop advancing the iterator after the first successful add.
            if (add(it.next())) changed = true;
        }
        return changed;
    }

    /**
     * Removes all elements.
     */
    public void clear() {
        // Both views must be emptied, otherwise cleared elements would still
        // be reported by contains() and could never be added again.
        list.clear();
        set.clear();
    }

    /**
     * @param o element to look for
     * @return true if an equivalent element is present
     */
    public boolean contains(Object o) {
        return set.contains(o);
    }

    /**
     * @param c elements to look for
     * @return true if every element of c is present
     */
    public boolean containsAll(Collection c) {
        return set.containsAll(c);
    }

    /* (non-Javadoc)
     * @see java.lang.Object#equals(java.lang.Object)
     */
    public boolean equals(Object obj) {
        if (this == obj) return true;
        if (!(obj instanceof ListSet)) return false; // also covers null
        ListSet other = (ListSet) obj;
        try {
            return list.equals(other.list) && set.equals(other.set);
        } catch (ClassCastException e) {
            // a comparator could not handle the other set's elements
            return false;
        }
    }

    /**
     * @param index position in insertion order
     * @return the element at that position
     */
    public Object get(int index) {
        return list.get(index);
    }

    /* (non-Javadoc)
     * @see java.lang.Object#hashCode()
     */
    public int hashCode() {
        return list.hashCode();
    }

    /**
     * @param o element to look for
     * @return index of the first element the comparator considers equal to o, or -1
     */
    public int indexOf(Object o) {
        for (int i = 0; i < list.size(); ++i) {
            if (0 == comparator.compare(list.get(i), o)) return i;
        }
        return -1;
    }

    /**
     * @return true if there are no elements
     */
    public boolean isEmpty() {
        return list.isEmpty();
    }

    /**
     * @return an iterator over the elements in insertion order
     */
    public Iterator iterator() {
        return list.iterator();
    }

    /**
     * @param o element to look for
     * @return index of the last element the comparator considers equal to o, or -1
     */
    public int lastIndexOf(Object o) {
        for (int i = list.size() - 1; i >= 0; --i) {
            if (0 == comparator.compare(list.get(i), o)) return i;
        }
        return -1;
    }

    /**
     * @return a list iterator over the elements in insertion order
     */
    public ListIterator listIterator() {
        return list.listIterator();
    }

    /**
     * @param index starting position
     * @return a list iterator starting at index
     */
    public ListIterator listIterator(int index) {
        return list.listIterator(index);
    }

    /**
     * Positional removal is not supported.
     * @param index
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public Object remove(int index) {
        throw new UnsupportedOperationException();
    }

    /**
     * @param o element to remove
     * @return true if an equivalent element was present and has been removed
     */
    public boolean remove(Object o) {
        boolean result = set.remove(o);
        if (!result) return false;
        return matchListToSet();
    }

    /**
     * @param c elements to remove
     * @return true if anything was removed
     */
    public boolean removeAll(Collection c) {
        boolean result = set.removeAll(c);
        if (!result) return false;
        return matchListToSet();
    }

    /**
     * @param c elements to keep
     * @return true if anything was removed
     */
    public boolean retainAll(Collection c) {
        boolean result = set.retainAll(c);
        if (!result) return false;
        return matchListToSet();
    }

    /**
     * Drops from the list every element that is no longer in the set.
     * @return always true
     */
    private boolean matchListToSet() {
        for (Iterator it = list.iterator(); it.hasNext();) {
            Object o = it.next();
            if (!set.contains(o)) it.remove();
        }
        return true;
    }

    /**
     * Positional replacement is not supported.
     * @param index
     * @param element
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public Object set(int index, Object element) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the number of elements
     */
    public int size() {
        return list.size();
    }

    /**
     * Returns an independent copy (not a view) of the given range.
     * @param fromIndex start of the range, inclusive
     * @param toIndex end of the range, exclusive
     * @return a new ListSet with the same comparator holding the elements of the range
     */
    public List subList(int fromIndex, int toIndex) {
        ListSet result = new ListSet(comparator);
        // addAll, not add: the range's elements are wanted, not the range as one element
        result.addAll(list.subList(fromIndex, toIndex));
        return result;
    }

    /**
     * Set and List each supply their own default spliterator() on Java 8 and later,
     * so a class implementing both has to choose one explicitly. The list's is used
     * because it reports the elements in insertion order.
     * @return a spliterator over the elements in insertion order
     */
    public java.util.Spliterator spliterator() {
        return list.spliterator();
    }

    /**
     * @return the elements in insertion order
     */
    public Object[] toArray() {
        return list.toArray();
    }

    /**
     * @param a array to fill if it is large enough
     * @return the elements in insertion order
     */
    public Object[] toArray(Object[] a) {
        return list.toArray(a);
    }

    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    public String toString() {
        return list.toString();
    }
}

View File

@ -39,7 +39,7 @@ public class TestUtilities extends TestFmwk {
UnicodeMap map1 = new UnicodeMap();
Map map2 = new HashMap();
Map map3 = new TreeMap();
UnicodeMap.Equator equator = UnicodeMap.SIMPLE_EQUATOR;
Comparator equator = UnicodeMap.SIMPLE_EQUATOR;
SortedSet log = new TreeSet();
static String[] TEST_VALUES = {null, "A", "B", "C", "D", "E", "F"};
static Random random = new Random(12345);
@ -126,7 +126,7 @@ public class TestUtilities extends TestFmwk {
map3 = new TreeMap();
Object lastValue = new Object();
while (mi.next()) {
if (!UnicodeMap.SIMPLE_EQUATOR.isEqual(lastValue, mi.value)) {
if (UnicodeMap.SIMPLE_EQUATOR.compare(lastValue, mi.value) != 0) {
// System.out.println("Change: " + Utility.hex(mi.codepoint) + " => " + mi.value);
lastValue = mi.value;
}
@ -140,7 +140,7 @@ public class TestUtilities extends TestFmwk {
for (int i = 0; i < LIMIT; ++i) {
Object value1 = map1.getValue(i);
Object value2 = map2.get(new Integer(i));
if (!equator.isEqual(value1, value2)) {
if (equator.compare(value1, value2) != 0) {
errln(counter + " Difference at " + Utility.hex(i)
+ "\t UnicodeMap: " + value1
+ "\t HashMap: " + value2);

View File

@ -10,6 +10,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
@ -30,11 +31,16 @@ public final class UnicodeMap implements Cloneable {
private int length = 2;
private int[] transitions = {0,0x110000,0,0,0,0,0,0,0,0};
private Object[] values = new Object[10];
private boolean errorOnReset = false;
private ListSet availableValues;
boolean staleAvailableValues = false;
private int lastIndex = 0;
public UnicodeMap(Equator equator) {
public UnicodeMap(Comparator equator) {
this.equator = equator;
availableValues = new ListSet(equator);
}
public UnicodeMap() {
@ -49,7 +55,7 @@ public final class UnicodeMap implements Cloneable {
if (length != that.length || !equator.equals(that.equator)) return false;
for (int i = 0; i < length-1; ++i) {
if (transitions[i] != that.transitions[i]) return false;
if (!equator.isEqual(values[i], that.values[i])) return false;
if (!areEqual(values[i], that.values[i])) return false;
}
return true;
} catch (ClassCastException e) {
@ -57,12 +63,22 @@ public final class UnicodeMap implements Cloneable {
}
}
/**
 * Returns the hash code used for a stored value when hashing the map.
 * @param o a stored value; may be null, since unset ranges hold null
 * @return 0 for null, otherwise the value's own hash code
 */
public int getHashCode(Object o) {
    // hashCode() passes values[i], which can be null; avoid a NullPointerException
    return o == null ? 0 : o.hashCode();
}
/**
 * Tests two values for equality using this map's comparator.
 * @param a first value
 * @param b second value
 * @return true if the comparator reports 0 for the pair
 */
public boolean areEqual(Object a, Object b) {
    final int order = equator.compare(a, b);
    return 0 == order;
}
public int hashCode() {
int result = length;
// TODO might want to abbreviate this for speed.
for (int i = 0; i < length-1; ++i) {
result = 37*result + transitions[i];
result = 37*result + equator.getHashCode(values[i]);
result = 37*result + getHashCode(values[i]);
}
return result;
}
@ -75,6 +91,8 @@ public final class UnicodeMap implements Cloneable {
that.length = length;
that.transitions = (int[]) transitions.clone();
that.values = (Object[]) values.clone();
that.equator = equator;
that.availableValues = new ListSet(equator);
return that;
}
@ -87,7 +105,7 @@ public final class UnicodeMap implements Cloneable {
throw new IllegalArgumentException("Invariant failed: Lengths bad");
}
for (int i = 1; i < length-1; ++i) {
if (equator.isEqual(values[i-1], values[i])) {
if (areEqual(values[i-1], values[i])) {
throw new IllegalArgumentException("Invariant failed: values shared at "
+ "\t" + Utility.hex(i-1) + ": <" + values[i-1] + ">"
+ "\t" + Utility.hex(i) + ": <" + values[i] + ">"
@ -107,39 +125,20 @@ public final class UnicodeMap implements Cloneable {
}
}
public interface Equator {
/**
* Comparator function. If overridden, must handle case of null,
* and compare any two objects that could be compared.
* Must obey normal rules of symmetry: a=b => b=a
* and transitivity: a=b & b=c => a=b)
* @param a
* @param b
* @return true if a and b are equal
*/
public boolean isEqual(Object a, Object b);
/**
* Must obey normal rules: a=b => getHashCode(a)=getHashCode(b)
* @param object
* @return a hash code for the object
*/
public int getHashCode(Object object);
}
private static final class SimpleEquator implements Equator {
public boolean isEqual(Object a, Object b) {
if (a == b) return true;
if (a == null || b == null) return false;
return a.equals(b);
private static final class SimpleEquator implements Comparator {
public int compare(Object a, Object b) {
if (a == b) return 0;
if (a == null) return -1;
if (b == null) return 1;
return ((Comparable)a).compareTo((Comparable)b);
}
public int getHashCode(Object a) {
if (a == null) return 0;
return a.hashCode();
}
}
public static Equator SIMPLE_EQUATOR = new SimpleEquator();
private Equator equator = SIMPLE_EQUATOR;
public static Comparator SIMPLE_EQUATOR = new SimpleEquator();
private Comparator equator = SIMPLE_EQUATOR;
/**
* Finds an index such that inversionList[i] <= codepoint < inversionList[i+1]
@ -261,7 +260,16 @@ public final class UnicodeMap implements Cloneable {
}
int limitIndex = baseIndex + 1;
// cases are (a) value is already set
if (equator.isEqual(values[baseIndex], value)) return this;
if (areEqual(values[baseIndex], value)) return this;
if (errorOnReset && values[baseIndex] != null) {
throw new IllegalArgumentException("Attempt to reset value for " + Utility.hex(codepoint)
+ " when that is disallowed. Old: " + values[baseIndex] + "; New: " + value);
}
// adjust the available values
staleAvailableValues = true;
availableValues.add(value); // add if not there already
int baseCP = transitions[baseIndex];
int limitCP = transitions[limitIndex];
// we now start walking through the difference case,
@ -271,12 +279,12 @@ public final class UnicodeMap implements Cloneable {
if (baseCP == codepoint) {
// CASE: At very start of range
boolean connectsWithPrevious =
baseIndex != 0 && equator.isEqual(value, values[baseIndex-1]);
baseIndex != 0 && areEqual(value, values[baseIndex-1]);
if (limitCP == codepoint + 1) {
// CASE: Single codepoint range
boolean connectsWithFollowing =
baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]);
baseIndex < length - 1 && areEqual(value, values[limitIndex]);
if (connectsWithPrevious) {
// A1a connects with previous & following, so remove index
@ -308,7 +316,7 @@ public final class UnicodeMap implements Cloneable {
// CASE: at end of range
// if connects, just back up range
boolean connectsWithFollowing =
baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]);
baseIndex < length - 1 && areEqual(value, values[limitIndex]);
if (connectsWithFollowing) {
--transitions[limitIndex];
@ -396,6 +404,8 @@ public final class UnicodeMap implements Cloneable {
* @return this (for chaining)
*/
public UnicodeMap setMissing(Object value) {
staleAvailableValues = true;
availableValues.add(value);
for (int i = 0; i < length; ++i) {
if (values[i] == null) values[i] = value;
}
@ -412,7 +422,7 @@ public final class UnicodeMap implements Cloneable {
public UnicodeSet getSet(Object value, UnicodeSet result) {
if (result == null) result = new UnicodeSet();
for (int i = 0; i < length - 1; ++i) {
if (equator.isEqual(value, values[i])) {
if (areEqual(value, values[i])) {
result.add(transitions[i], transitions[i+1]-1);
}
}
@ -429,13 +439,18 @@ public final class UnicodeMap implements Cloneable {
* @return result
*/
public Collection getAvailableValues(Collection result) {
if (result == null) result = new ArrayList(1);
for (int i = 0; i < length - 1; ++i) {
Object value = values[i];
if (value == null) continue;
if (result.contains(value)) continue;
result.add(value);
}
if (staleAvailableValues) {
// collect all the current values
// retain them in the availableValues
Set temp = new TreeSet(equator);
for (int i = 0; i < length - 1; ++i) {
temp.add(values[i]);
}
availableValues.retainAll(temp);
staleAvailableValues = false;
}
if (result == null) result = new ArrayList(1);
result.addAll(availableValues);
return result;
}
@ -539,4 +554,16 @@ public final class UnicodeMap implements Cloneable {
}
return result.toString();
}
/**
* Reports whether replacing an existing value is treated as an error.
* @return the current errorOnReset flag; when true, an attempt to replace an
* existing non-null value with a different one is rejected with an
* IllegalArgumentException
*/
public boolean isErrorOnReset() {
return errorOnReset;
}
/**
* Turns the reset check on or off.
* @param errorOnReset true to reject, with an IllegalArgumentException, any
* attempt to replace an existing non-null value with a different one
*/
public void setErrorOnReset(boolean errorOnReset) {
this.errorOnReset = errorOnReset;
}
}

View File

@ -110,6 +110,10 @@ public abstract class UnicodeProperty extends UnicodeLabel {
return _getValue(codepoint);
}
//public String getValue(int codepoint, boolean isShort) {
// return getValue(codepoint);
//}
public List getNameAliases(List result) {
if (result == null) result = new ArrayList(1);
return _getNameAliases(result);
@ -118,6 +122,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
if (result == null) result = new ArrayList(1);
result = _getValueAliases(valueAlias, result);
if (!result.contains(valueAlias) && type < NUMERIC) {
result = _getValueAliases(valueAlias, result); // for debugging
throw new IllegalArgumentException(
"Internal error: " + getName() + " doesn't contain " + valueAlias
+ ": " + new BagFormatter().join(result));
@ -146,6 +151,372 @@ public abstract class UnicodeProperty extends UnicodeLabel {
return getAvailableValues(null);
}
/**
 * Looks up the value for a code point, optionally mapping it to its first value alias.
 * @param codepoint the code point to look up
 * @param getShortest true to return the first value alias instead of the stored value
 * @return the stored value when it is null, when getShortest is false, or when the
 * property type is MISC or higher; otherwise the first value alias of the stored value
 */
public final String getValue(int codepoint, boolean getShortest) {
    final String stored = getValue(codepoint);
    final boolean wantsAlias = getShortest && stored != null && type < MISC;
    return wantsAlias ? getFirstValueAlias(stored) : stored;
}
/**
 * Returns the first entry of the name alias list, caching it after the first call.
 * @return the first name alias
 */
public final String getFirstNameAlias() {
    if (firstNameAlias != null) {
        return firstNameAlias;
    }
    firstNameAlias = (String) getNameAliases().get(0);
    return firstNameAlias;
}
/**
 * Returns the first alias recorded for a value, building the alias cache on first use.
 * @param value a property value
 * @return the cached first alias, or null if the cache has no entry for the value
 */
public final String getFirstValueAlias(String value) {
    if (valueToFirstValueAlias == null) {
        _getFirstValueAliasCache();
    }
    Object alias = valueToFirstValueAlias.get(value);
    return (String) alias;
}
/**
 * Builds the value-to-first-alias cache and records the widest value and the
 * widest first alias seen, for use by getFirstValueAlias and getMaxWidth.
 * @throws IllegalArgumentException if an available value has no first alias
 */
private void _getFirstValueAliasCache() {
    maxValueWidth = 0;
    maxFirstValueAliasWidth = 0;
    valueToFirstValueAlias = new HashMap(1);
    Iterator it = getAvailableValues().iterator();
    while (it.hasNext()) {
        String value = (String) it.next();
        List aliases = getValueAliases(value);
        // An empty alias list is the same internal error as a null first entry;
        // report it as such instead of letting get(0) fail with an index error.
        String first = aliases.isEmpty() ? null : (String) aliases.get(0);
        if (first == null) { // internal error
            throw new IllegalArgumentException("Value not in value aliases: " + value);
        }
        if (DEBUG && CHECK_NAME.equals(getName())) {
            System.out.println("First Alias: " + getName() + ": " + value + " => "
                + first + new BagFormatter().join(getValueAliases(value)));
        }
        valueToFirstValueAlias.put(value, first);
        if (value.length() > maxValueWidth) {
            maxValueWidth = value.length();
        }
        if (first.length() > maxFirstValueAliasWidth) {
            maxFirstValueAliasWidth = first.length();
        }
    }
}
private int maxValueWidth = -1;
private int maxFirstValueAliasWidth = -1;
/**
 * Returns the length of the longest value, or of the longest first value alias.
 * The widths are computed on first use by building the first-alias cache.
 * @param getShortest true for the widest first alias, false for the widest value
 * @return the requested maximum width
 */
public int getMaxWidth(boolean getShortest) {
    if (maxValueWidth < 0) {
        _getFirstValueAliasCache();
    }
    return getShortest ? maxFirstValueAliasWidth : maxValueWidth;
}
/**
 * Convenience overload: collects matching code points into a new set.
 * @param propertyValue the value to match
 * @return the set of matching code points
 */
public final UnicodeSet getSet(String propertyValue) {
return getSet(propertyValue,null);
}
/**
 * Convenience overload: collects code points accepted by the matcher into a new set.
 * @param matcher decides which values are accepted
 * @return the set of matching code points
 */
public final UnicodeSet getSet(Matcher matcher) {
return getSet(matcher,null);
}
/**
 * Wraps the value in a SimpleMatcher and delegates to the matcher-based overload.
 * Properties of a string or misc type are matched without a comparator (null);
 * all others are matched with PROPERTY_COMPARATOR.
 * @param propertyValue the value to match
 * @param result set to add to, or null to create a new one
 * @return the set the matches were added to
 */
public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
return getSet(new SimpleMatcher(propertyValue,
isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR),
result);
}
private UnicodeMap unicodeMap = null;
public static final String UNUSED = "??";
/**
 * Adds to result every code point whose value is accepted by the matcher.
 * For string or misc typed properties each code point's value is tested directly.
 * For all other properties each available value is tested through its aliases
 * (both the alias itself and its skeleton), and the whole range set for the first
 * value that matches is added.
 * @param matcher decides which values are accepted
 * @param result set to add to, or null to create a new one
 * @return the set the matches were added to
 */
public final UnicodeSet getSet(Matcher matcher, UnicodeSet result) {
    if (result == null) {
        result = new UnicodeSet();
    }
    if (isType(STRING_OR_MISC_MASK)) {
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            String cpValue = getValue(cp);
            if (cpValue == null) {
                continue;
            }
            if (matcher.matches(cpValue)) {
                result.add(cp);
            }
        }
        return result;
    }
    List aliasBuffer = new ArrayList(1); // reused across values to avoid reallocating
    UnicodeMap map = getUnicodeMap();
    for (Iterator values = map.getAvailableValues(null).iterator(); values.hasNext();) {
        String value = (String) values.next();
        aliasBuffer.clear();
        Iterator aliases = getValueAliases(value, aliasBuffer).iterator();
        boolean matched = false;
        // stop at the first alias that matches, either as written or as a skeleton
        while (!matched && aliases.hasNext()) {
            String alias = (String) aliases.next();
            matched = matcher.matches(alias) || matcher.matches(toSkeleton(alias));
        }
        if (matched) {
            map.getSet(value, result);
        }
    }
    return result;
}
/*
public UnicodeSet getMatchSet(UnicodeSet result) {
if (result == null) result = new UnicodeSet();
addAll(matchIterator, result);
return result;
}
public void setMatchSet(UnicodeSet set) {
matchIterator = new UnicodeSetIterator(set);
}
*/
/**
 * Debugging utility: captures the current call stack as text by printing the
 * stack trace of a freshly created (never thrown) exception.
 * @return the stack trace text, prefixed with a fixed label
 */
public static String getStack() {
    StringWriter buffer = new StringWriter();
    PrintWriter out = new PrintWriter(buffer);
    new Exception().printStackTrace(out);
    out.flush();
    return "Showing Stack with fake " + buffer.toString();
}
// TODO use this instead of plain strings
/**
 * A property or value name that keeps a display form and compares by skeleton.
 * Two names are equal when their skeletons are equal.
 */
public static class Name implements Comparable {
    private static Map skeletonCache;
    /** Form used for comparison, equality and hashing. */
    private String skeleton;
    /** Form returned by toString. */
    private String pretty;
    /**
     * Style codes for the constructor. Declared static so that callers can name a
     * style before any instance exists; access through an instance still compiles.
     */
    public static final int RAW = 0, TITLE = 1, NORMAL = 2;

    /**
     * @param name the source name; null is treated as the empty string
     * @param style RAW keeps the name unchanged for both forms; any other style
     * regularizes the name (title-casing word starts when style is TITLE) and
     * derives the skeleton from the regularized form
     */
    public Name(String name, int style) {
        if (name == null) name = "";
        if (style == RAW) {
            skeleton = pretty = name;
        } else {
            pretty = regularize(name, style == TITLE);
            skeleton = toSkeleton(pretty);
        }
    }

    /**
     * Orders names by skeleton.
     * @throws ClassCastException if o is not a Name
     */
    public int compareTo(Object o) {
        return skeleton.compareTo(((Name) o).skeleton);
    }

    /**
     * @return true if o is a Name with the same skeleton; false for null or any other type
     */
    public boolean equals(Object o) {
        if (this == o) return true;
        // equals must answer false, not throw, for null and foreign types
        if (!(o instanceof Name)) return false;
        return skeleton.equals(((Name) o).skeleton);
    }

    public int hashCode() {
        return skeleton.hashCode();
    }

    /**
     * @return the display form of the name
     */
    public String toString() {
        return pretty;
    }
}
/**
* Returns the map from code points to this property's values. The map is
* built by _getUnicodeMap() on the first call and the same instance is
* returned afterwards.
* @return the unicode map
*/
protected UnicodeMap getUnicodeMap() {
if (unicodeMap == null) unicodeMap = _getUnicodeMap();
return unicodeMap;
}
/**
 * Builds a map from every code point (0 through 0x10FFFF) to its value as
 * reported by getValue. When debugging is enabled for this property's name,
 * also prints the property name, class, version, a stack trace and the map.
 * @return the newly built map
 */
protected UnicodeMap _getUnicodeMap() {
    UnicodeMap map = new UnicodeMap();
    int cp = 0;
    while (cp <= 0x10FFFF) {
        map.put(cp, getValue(cp));
        ++cp;
    }
    final boolean trace = DEBUG && CHECK_NAME.equals(getName());
    if (trace) {
        System.out.println(getName() + ":\t" + getClass().getName()
            + "\t" + getVersion());
        System.out.println(getStack());
        System.out.println(map);
    }
    return map;
}
/**
 * Adds obj to result unless obj is null or result already contains it.
 * Really ought to create a Collection UniqueList, that forces uniqueness. But for now...
 * @param obj the candidate element; null is ignored
 * @param result the collection to add to
 * @return result, for chaining
 */
public static Collection addUnique(Object obj, Collection result) {
    if (obj == null || result.contains(obj)) {
        return result;
    }
    result.add(obj);
    return result;
}
/**
* Utility for managing property & non-string value aliases.
* Orders two names with compareNames; both arguments are cast to String,
* so passing anything else raises a ClassCastException.
*/
public static final Comparator PROPERTY_COMPARATOR = new Comparator() {
public int compare(Object o1, Object o2) {
return compareNames((String)o1, (String)o2);
}
};
/**
 * Utility for managing property & non-string value aliases.
 * Two names are equal when they are the same reference (including both null)
 * or when their skeletons are equal.
 * @param a first name; may be null
 * @param b second name; may be null
 * @return true if the names are equivalent
 */
// TODO optimize
public static boolean equalNames(String a, String b) {
    if (a == b) {
        return true;
    }
    if (a == null) {
        return false;
    }
    final String left = toSkeleton(a);
    final String right = toSkeleton(b);
    return left.equals(right);
}
/**
 * Utility for managing property & non-string value aliases.
 * Orders names by skeleton; null sorts before any non-null name.
 * @param a first name; may be null
 * @param b second name; may be null
 * @return negative, zero or positive as a sorts before, with or after b
 */
// TODO optimize
public static int compareNames(String a, String b) {
    if (a == b) {
        return 0;
    }
    if (a == null || b == null) {
        return a == null ? -1 : 1;
    }
    final String left = toSkeleton(a);
    return left.compareTo(toSkeleton(b));
}
/**
 * Utility for managing property & non-string value aliases.
 * Produces the loose-matching form of a name: every space, '_' and '-' except
 * one in the very first position is removed, and all characters are lowercased.
 * @param source the name; may be null
 * @return null for null; the source instance itself when nothing had to change;
 * otherwise a new string holding the skeleton
 */
// TODO account for special names, tibetan, hangul
public static String toSkeleton(String source) {
    if (source == null) {
        return null;
    }
    final int length = source.length();
    StringBuffer folded = new StringBuffer();
    boolean changed = false;
    // working on chars is fine here, since no surrogates are involved
    for (int i = 0; i < length; ++i) {
        final char original = source.charAt(i);
        final boolean separator = original == '_' || original == ' ' || original == '-';
        if (separator && i > 0) {
            changed = true; // dropped
            continue;
        }
        final char lowered = Character.toLowerCase(original);
        if (lowered != original) {
            changed = true;
        }
        folded.append(lowered);
    }
    // hand back the original when it is already a skeleton, to avoid string creation
    return changed ? folded.toString() : source;
}
/**
 * Gets the name skeleton: digits, upper-case ASCII letters, '<' and '>' are
 * kept, spaces are always dropped, and a hyphen is kept only when it is not
 * medial (see below). Any other character is rejected.
 * Note: each kept hyphen also prints a "****** EXCEPTION" line to System.out.
 * @param source the name; may be null
 * @return null for null, otherwise the skeleton
 * @throws IllegalArgumentException if source contains a character outside the set above
 */
public static String toNameSkeleton(String source) {
if (source == null) return null;
StringBuffer result = new StringBuffer();
// remove spaces, medial '-'
// we can do this with char, since no surrogates are involved
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') {
result.append(ch);
} else if (ch == ' ') {
// don't copy ever
} else if (ch == '-') {
// only copy non-medials AND trailing O-E
// The first two tests (first or last position) short-circuit before the
// charAt(i-1) / charAt(i+1) lookups, which keeps those lookups in bounds.
if (0 == i
|| i == source.length() - 1
|| source.charAt(i-1) == ' '
|| source.charAt(i+1) == ' '
|| (i == source.length() - 2
&& source.charAt(i-1) == 'O'
&& source.charAt(i+1) == 'E')) {
System.out.println("****** EXCEPTION " + source);
result.append(ch);
}
// otherwise don't copy
} else {
throw new IllegalArgumentException("Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
}
}
return result.toString();
}
/**
 * These routines use the Java functions, because they only need to act on ASCII.
 * Changes space and '-' into '_', and inserts '_' between a lowercase letter and
 * a directly following uppercase letter. When titlecaseStart is set, the first
 * cased letter of the string and the first cased letter after each '_' or '='
 * is uppercased.
 * @param source the name to regularize; may be null
 * @param titlecaseStart true to uppercase the first cased letter of each part
 * @return null for null, otherwise the regularized name
 */
public static String regularize(String source, boolean titlecaseStart) {
    if (source == null) {
        return source;
    }
    StringBuffer out = new StringBuffer();
    int previousType = -1;
    boolean atPartStart = true; // true until the current part's first cased letter is seen
    final int length = source.length();
    for (int i = 0; i < length; ++i) {
        char c = source.charAt(i);
        switch (c) {
        case ' ':
        case '-':
        case '_':
            c = '_';
            atPartStart = true;
            break;
        case '=':
            atPartStart = true;
            break;
        default:
            break;
        }
        final int type = Character.getType(c);
        if (type == Character.UPPERCASE_LETTER && previousType == Character.LOWERCASE_LETTER) {
            out.append('_');
        }
        final boolean cased = type == Character.LOWERCASE_LETTER
            || type == Character.TITLECASE_LETTER
            || type == Character.UPPERCASE_LETTER;
        if (atPartStart && cased) {
            if (titlecaseStart) {
                c = Character.toUpperCase(c);
            }
            atPartStart = false;
        }
        out.append(c);
        // remember the type as it was before any uppercasing
        previousType = type;
    }
    return out.toString();
}
/**
* Utility function for comparing codepoint to string without
* generating new string.
* A one-char string is compared to the code point directly; a two-char
* string is compared to UTF16.valueOf(codepoint); a string of any other
* length (including empty) never matches.
* @param codepoint the code point to compare
* @param other the string to compare against; must not be null
* @return true if the codepoint equals the string
*/
public static final boolean equals(int codepoint, String other) {
if (other.length() == 1) {
return codepoint == other.charAt(0);
}
if (other.length() == 2) {
return other.equals(UTF16.valueOf(codepoint));
}
return false;
}
/**
* Utility that should be on UnicodeSet.
* Drains the iterator range by range: an entry flagged as a string
* (codepoint == UnicodeSetIterator.IS_STRING) is added as a string, any other
* entry is added as the code point range codepoint..codepointEnd.
* @param source iterator to read from; it is advanced as entries are copied
* @param result set that receives the entries
*/
static public void addAll(UnicodeSetIterator source, UnicodeSet result) {
while (source.nextRange()) {
if (source.codepoint == UnicodeSetIterator.IS_STRING) {
result.add(source.string);
} else {
result.add(source.codepoint, source.codepointEnd);
}
}
}
/**
 * Passes each element of source, in iteration order, to addUnique.
 * Really ought to create a Collection UniqueList, that forces uniqueness. But for now...
 * @param source elements to add
 * @param result the collection to add to
 * @return result, for chaining
 */
public static Collection addAllUnique(Collection source, Collection result) {
    Iterator elements = source.iterator();
    while (elements.hasNext()) {
        addUnique(elements.next(), result);
    }
    return result;
}
/**
 * Passes each element of the array, in index order, to addUnique.
 * Really ought to create a Collection UniqueList, that forces uniqueness. But for now...
 * @param source elements to add
 * @param result the collection to add to
 * @return result, for chaining
 */
public static Collection addAllUnique(Object[] source, Collection result) {
    final int count = source.length;
    int index = 0;
    while (index < count) {
        addUnique(source[index], result);
        ++index;
    }
    return result;
}
static public class Factory {
static boolean DEBUG = false;
@ -503,21 +874,31 @@ public abstract class UnicodeProperty extends UnicodeLabel {
}
}
public static abstract class SimpleProperty extends UnicodeProperty {
private List propertyAliases = new ArrayList(1);
public static abstract class BaseProperty extends UnicodeProperty {
protected List propertyAliases = new ArrayList(1);
String version;
public BaseProperty setMain(String alias, String shortAlias, int propertyType,
String version) {
setName(alias);
setType(propertyType);
propertyAliases.add(shortAlias);
propertyAliases.add(alias);
this.version = version;
return this;
}
public String _getVersion() {
return version;
}
public List _getNameAliases(List result) {
addAllUnique(propertyAliases, result);
return result;
}
}
public static abstract class SimpleProperty extends BaseProperty {
List values;
Map toValueAliases = new HashMap(1);
String version;
public SimpleProperty setMain(String alias, String shortAlias, int propertyType,
String version) {
setName(alias);
setType(propertyType);
propertyAliases.add(shortAlias);
propertyAliases.add(alias);
this.version = version;
return this;
}
public SimpleProperty addName(String alias) {
propertyAliases.add(alias);
@ -546,11 +927,6 @@ public abstract class UnicodeProperty extends UnicodeLabel {
return this;
}
public List _getNameAliases(List result) {
addAllUnique(propertyAliases, result);
return result;
}
public List _getValueAliases(String valueAlias, List result) {
if (toValueAliases == null) _fillValues();
List a = (List) toValueAliases.get(valueAlias);
@ -582,384 +958,27 @@ public abstract class UnicodeProperty extends UnicodeLabel {
addUnique(alias, aliases);
addUnique(item, aliases);
}
public String _getVersion() {
return version;
}
}
public static class UnicodeMapProperty extends SimpleProperty {
private UnicodeMap unicodeMap;
public static class UnicodeMapProperty extends BaseProperty {
protected UnicodeMap unicodeMap;
protected String _getValue(int codepoint) {
return (String) unicodeMap.getValue(codepoint);
}
protected List _getValueAliases(String valueAlias, List result) {
if (!unicodeMap.getAvailableValues().contains(valueAlias)) return result;
result.add(valueAlias);
return result; // no other aliases
}
protected List _getAvailableValues(List result) {
return (List) unicodeMap.getAvailableValues(result);
}
}
public final String getValue(int codepoint, boolean getShortest) {
String result = getValue(codepoint);
if (type >= MISC || result == null || !getShortest) return result;
return getFirstValueAlias(result);
}
public final String getFirstNameAlias() {
if (firstNameAlias == null) {
firstNameAlias = (String) getNameAliases().get(0);
}
return firstNameAlias;
}
public final String getFirstValueAlias(String value) {
if (valueToFirstValueAlias == null) _getFirstValueAliasCache();
return (String)valueToFirstValueAlias.get(value);
}
private void _getFirstValueAliasCache() {
maxValueWidth = 0;
maxFirstValueAliasWidth = 0;
valueToFirstValueAlias = new HashMap(1);
Iterator it = getAvailableValues().iterator();
while (it.hasNext()) {
String value = (String)it.next();
String first = (String) getValueAliases(value).get(0);
if (first == null) { // internal error
throw new IllegalArgumentException("Value not in value aliases: " + value);
}
if (DEBUG && CHECK_NAME.equals(getName())) {
System.out.println("First Alias: " + getName() + ": " + value + " => "
+ first + new BagFormatter().join(getValueAliases(value)));
}
valueToFirstValueAlias.put(value,first);
if (value.length() > maxValueWidth) {
maxValueWidth = value.length();
}
if (first.length() > maxFirstValueAliasWidth) {
maxFirstValueAliasWidth = first.length();
}
}
}
private int maxValueWidth = -1;
private int maxFirstValueAliasWidth = -1;
public int getMaxWidth(boolean getShortest) {
if (maxValueWidth < 0) _getFirstValueAliasCache();
if (getShortest) return maxFirstValueAliasWidth;
return maxValueWidth;
}
public final UnicodeSet getSet(String propertyValue) {
return getSet(propertyValue,null);
}
public final UnicodeSet getSet(Matcher matcher) {
return getSet(matcher,null);
}
public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
return getSet(new SimpleMatcher(propertyValue,
isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR),
result);
}
private UnicodeMap unicodeMap = null;
public static final String UNUSED = "??";
public final UnicodeSet getSet(Matcher matcher, UnicodeSet result) {
if (result == null) result = new UnicodeSet();
if (isType(STRING_OR_MISC_MASK)) {
for (int i = 0; i <= 0x10FFFF; ++i) {
String value = getValue(i);
if (value != null && matcher.matches(value)) {
result.add(i);
}
}
return result;
}
List temp = new ArrayList(1); // to avoid reallocating...
UnicodeMap um = getUnicodeMap();
Iterator it = um.getAvailableValues(null).iterator();
main:
while (it.hasNext()) {
String value = (String)it.next();
temp.clear();
Iterator it2 = getValueAliases(value,temp).iterator();
while (it2.hasNext()) {
String value2 = (String)it2.next();
//System.out.println("Values:" + value2);
if (matcher.matches(value2)
|| matcher.matches(toSkeleton(value2))) {
um.getSet(value, result);
continue main;
}
}
}
return result;
}
/*
public UnicodeSet getMatchSet(UnicodeSet result) {
if (result == null) result = new UnicodeSet();
addAll(matchIterator, result);
return result;
}
public void setMatchSet(UnicodeSet set) {
matchIterator = new UnicodeSetIterator(set);
}
*/
/**
* Utility for debugging
*/
public static String getStack() {
Exception e = new Exception();
StringWriter sw = new StringWriter();
PrintWriter pw = new PrintWriter(sw);
e.printStackTrace(pw);
pw.flush();
return "Showing Stack with fake " + sw.getBuffer().toString();
}
// TODO use this instead of plain strings
public static class Name implements Comparable {
private static Map skeletonCache;
private String skeleton;
private String pretty;
public final int RAW = 0, TITLE = 1, NORMAL = 2;
public Name(String name, int style) {
if (name == null) name = "";
if (style == RAW) {
skeleton = pretty = name;
} else {
pretty = regularize(name, style == TITLE);
skeleton = toSkeleton(pretty);
}
}
public int compareTo(Object o) {
return skeleton.compareTo(((Name)o).skeleton);
}
public boolean equals(Object o) {
return skeleton.equals(((Name)o).skeleton);
}
public int hashCode() {
return skeleton.hashCode();
}
public String toString() {
return pretty;
}
}
/**
* Utility for managing property & non-string value aliases
*/
public static final Comparator PROPERTY_COMPARATOR = new Comparator() {
public int compare(Object o1, Object o2) {
return compareNames((String)o1, (String)o2);
}
};
/**
* Utility for managing property & non-string value aliases
*
*/
// TODO optimize
public static boolean equalNames(String a, String b) {
if (a == b) return true;
if (a == null) return false;
return toSkeleton(a).equals(toSkeleton(b));
}
/**
* Utility for managing property & non-string value aliases
*/
// TODO optimize
public static int compareNames(String a, String b) {
if (a == b) return 0;
if (a == null) return -1;
if (b == null) return 1;
return toSkeleton(a).compareTo(toSkeleton(b));
}
/**
* Utility for managing property & non-string value aliases
*/
// TODO account for special names, tibetan, hangul
public static String toSkeleton(String source) {
if (source == null) return null;
StringBuffer skeletonBuffer = new StringBuffer();
boolean gotOne = false;
// remove spaces, '_', '-'
// we can do this with char, since no surrogates are involved
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
if (i > 0 && (ch == '_' || ch == ' ' || ch == '-')) {
gotOne = true;
} else {
char ch2 = Character.toLowerCase(ch);
if (ch2 != ch) {
gotOne = true;
skeletonBuffer.append(ch2);
} else {
skeletonBuffer.append(ch);
}
}
}
if (!gotOne) return source; // avoid string creation
return skeletonBuffer.toString();
}
/**
 * Produces the skeleton of a character name. Digits, 'A'-'Z', '<' and '>'
 * are copied; spaces are dropped; a '-' is copied only when it is not
 * medial (it is first, last, or next to a space) or when it is the hyphen
 * of a trailing "O-E". Each copied hyphen is also reported on System.out.
 * @param source the name, may be null
 * @return the skeleton, or null if source is null
 * @throws IllegalArgumentException if the name contains any other character
 */
public static String toNameSkeleton(String source) {
    if (source == null) {
        return null;
    }
    final int last = source.length() - 1;
    StringBuffer skeleton = new StringBuffer();
    // working on chars is enough here, since no surrogates are involved
    for (int pos = 0; pos <= last; ++pos) {
        char c = source.charAt(pos);
        boolean digit = '0' <= c && c <= '9';
        boolean upper = 'A' <= c && c <= 'Z';
        if (digit || upper || c == '<' || c == '>') {
            skeleton.append(c);
            continue;
        }
        if (c == ' ') {
            continue; // spaces are never copied
        }
        if (c != '-') {
            throw new IllegalArgumentException("Illegal Name Char: U+" + Utility.hex(c) + ", " + c);
        }
        // the edge test comes first so the neighbour lookups below stay in range
        boolean keep = pos == 0
                || pos == last
                || source.charAt(pos - 1) == ' '
                || source.charAt(pos + 1) == ' '
                || (pos == last - 1
                        && source.charAt(pos - 1) == 'O'
                        && source.charAt(pos + 1) == 'E');
        if (keep) {
            System.out.println("****** EXCEPTION " + source);
            skeleton.append(c);
        }
        // a medial hyphen is dropped
    }
    return skeleton.toString();
}
/**
 * Normalizes a name into underscore-separated form, using only the java.lang
 * Character functions (the input is expected to be ASCII).
 * Space, '-' and '_' all become '_', and a '_' is inserted wherever an
 * uppercase letter directly follows a lowercase one. When titlecaseStart is
 * true, the first cased letter of the string and the first cased letter
 * after each separator or '=' is uppercased.
 * @param source the name to normalize, may be null
 * @param titlecaseStart whether to uppercase the first cased letter of each part
 * @return the normalized name, or null if source is null
 */
public static String regularize(String source, boolean titlecaseStart) {
    if (source == null) {
        return source;
    }
    /*if (source.equals("noBreak")) { // HACK
    if (titlecaseStart) return "NoBreak";
    return source;
    }
    */
    StringBuffer out = new StringBuffer();
    int previousType = -1;
    boolean awaitingFirstCased = true;
    for (int index = 0; index < source.length(); ++index) {
        char current = source.charAt(index);
        switch (current) {
        case ' ':
        case '-':
        case '_':
            current = '_';
            awaitingFirstCased = true;
            break;
        case '=':
            awaitingFirstCased = true;
            break;
        default:
            break;
        }
        // the type is taken before any uppercasing, so a lower->UPPER boundary is still seen
        int type = Character.getType(current);
        boolean cased = type == Character.LOWERCASE_LETTER
                || type == Character.TITLECASE_LETTER
                || type == Character.UPPERCASE_LETTER;
        if (previousType == Character.LOWERCASE_LETTER && type == Character.UPPERCASE_LETTER) {
            out.append('_');
        }
        if (awaitingFirstCased && cased) {
            if (titlecaseStart) {
                current = Character.toUpperCase(current);
            }
            awaitingFirstCased = false;
        }
        out.append(current);
        previousType = type;
    }
    return out.toString();
}
/**
 * Utility function for comparing a code point to a string without
 * generating a new string. A one-char string is compared directly with the
 * code point; a two-char string is compared with the surrogate pair that
 * encodes the code point, computed arithmetically.
 * @param codepoint the code point to compare
 * @param other the string to compare against; null never matches
 * @return true if the string consists of exactly that code point
 */
public static final boolean equals(int codepoint, String other) {
    if (other == null) {
        return false;
    }
    switch (other.length()) {
    case 1:
        return codepoint == other.charAt(0);
    case 2:
        // only supplementary code points are encoded as two chars
        if (codepoint < 0x10000 || codepoint > 0x10FFFF) {
            return false;
        }
        // lead = 0xD800 + ((cp - 0x10000) >> 10), trail = 0xDC00 + (cp & 0x3FF)
        return other.charAt(0) == (char) (0xD7C0 + (codepoint >> 10))
                && other.charAt(1) == (char) (0xDC00 + (codepoint & 0x3FF));
    default:
        return false;
    }
}
/**
 * Copies everything the iterator yields into a UnicodeSet (a utility that
 * should really live on UnicodeSet). The iterator is advanced range by
 * range; string items are added as strings, everything else as a code
 * point range.
 * @param source iterator to drain
 * @param result set that receives the items
 */
static public void addAll(UnicodeSetIterator source, UnicodeSet result) {
    while (source.nextRange()) {
        boolean isString = source.codepoint == UnicodeSetIterator.IS_STRING;
        if (!isString) {
            result.add(source.codepoint, source.codepointEnd);
            continue;
        }
        result.add(source.string);
    }
}
/**
 * Adds an object to a collection unless it is null or already contained.
 * (Really ought to be a Collection "UniqueList" that forces uniqueness. But for now...)
 * @param obj the object to add, may be null
 * @param result the collection to add to
 * @return the result collection, for chaining
 */
public static Collection addUnique(Object obj, Collection result) {
    boolean skip = obj == null || result.contains(obj);
    if (!skip) {
        result.add(obj);
    }
    return result;
}
/**
 * Adds each element of a source collection to the result, skipping nulls
 * and elements the result already contains.
 * (Really ought to be a Collection "UniqueList" that forces uniqueness. But for now...)
 * @param source the elements to add
 * @param result the collection to add to
 * @return the result collection, for chaining
 */
public static Collection addAllUnique(Collection source, Collection result) {
    Iterator items = source.iterator();
    while (items.hasNext()) {
        Object item = items.next();
        // same rule as addUnique: no nulls, no duplicates
        if (item != null && !result.contains(item)) {
            result.add(item);
        }
    }
    return result;
}
/**
 * Adds each element of a source array to the result, skipping nulls and
 * elements the result already contains.
 * (Really ought to be a Collection "UniqueList" that forces uniqueness. But for now...)
 * @param source the elements to add
 * @param result the collection to add to
 * @return the result collection, for chaining
 */
public static Collection addAllUnique(Object[] source, Collection result) {
    int index = 0;
    while (index < source.length) {
        Object item = source[index++];
        // same rule as addUnique: no nulls, no duplicates
        if (item != null && !result.contains(item)) {
            result.add(item);
        }
    }
    return result;
}
/**
 * Returns the map for this property, building it through _getUnicodeMap()
 * on the first call and reusing the stored instance afterwards.
 * @return the unicode map
 */
protected UnicodeMap getUnicodeMap() {
    if (unicodeMap != null) {
        return unicodeMap;
    }
    unicodeMap = _getUnicodeMap();
    return unicodeMap;
}
/**
 * Builds a fresh map by asking getValue for every code point from 0 to
 * 0x10FFFF. When DEBUG is on and this property's name equals CHECK_NAME,
 * the property, its class, version, stack and the finished map are printed.
 * @return the newly built map
 */
protected UnicodeMap _getUnicodeMap() {
    final int maxCodePoint = 0x10FFFF;
    UnicodeMap map = new UnicodeMap();
    int codePoint = 0;
    while (codePoint <= maxCodePoint) {
        //if (DEBUG && codePoint == 0x41) System.out.println(codePoint + "\t" + getValue(codePoint));
        map.put(codePoint, getValue(codePoint));
        ++codePoint;
    }
    if (DEBUG && CHECK_NAME.equals(getName())) {
        String header = getName() + ":\t" + getClass().getName()
                + "\t" + getVersion();
        System.out.println(header);
        System.out.println(getStack());
        System.out.println(map);
    }
    return map;
}
}

View File

@ -8,7 +8,9 @@
*/
package com.ibm.icu.dev.tool.cldr;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
@ -78,7 +80,9 @@ public class GenerateCldrTests {
LOGDIR = 3,
SOURCEDIR =4,
MATCH = 5,
FULLY_RESOLVED = 6;
FULLY_RESOLVED = 6,
LANGUAGES = 7,
TZADIR = 8;
private static final UOption[] options = {
UOption.HELP_H(),
@ -88,17 +92,38 @@ public class GenerateCldrTests {
UOption.SOURCEDIR().setDefault("C:\\ICU4C\\locale\\common\\"),
UOption.create("match", 'm', UOption.REQUIRES_ARG).setDefault(".*"),
UOption.create("fullyresolved", 'f', UOption.NO_ARG),
UOption.create("languages", 'g', UOption.NO_ARG),
UOption.create("tzadir", 't', UOption.REQUIRES_ARG).setDefault("C:\\ICU4J\\icu4j\\src\\com\\ibm\\icu\\dev\\tool\\cldr\\"),
};
CldrCollations cldrCollations;
static String logDir = null, destDir = null;
public static void main(String[] args) throws Exception {
/**
 * Tests whether otherLocale's language has a display name in locale that
 * differs from the bare language code.
 * @param locale the locale whose display data is consulted
 * @param otherLocale the locale whose language is being named
 * @return false if the display name is just the language code itself
 */
public static boolean hasLocalizedLanguageFor(ULocale locale, ULocale otherLocale) {
    final String code = otherLocale.getLanguage();
    final String displayed = otherLocale.getDisplayLanguage(locale);
    boolean sameAsCode = code.equals(displayed);
    return !sameAsCode;
}
/**
 * Tests whether otherLocale's country has a display name in locale that
 * differs from the bare country code. A locale without a country counts
 * as localized.
 * @param locale the locale whose display data is consulted
 * @param otherLocale the locale whose country is being named
 * @return false if the display name is just the country code itself
 */
public static boolean hasLocalizedCountryFor(ULocale locale, ULocale otherLocale) {
    final String code = otherLocale.getCountry();
    if (code.length() == 0) {
        return true;
    }
    final String displayed = otherLocale.getDisplayCountry(locale);
    boolean sameAsCode = code.equals(displayed);
    return !sameAsCode;
}
public static void main(String[] args) throws Exception {
UOption.parseArgs(args, options);
log = BagFormatter.openUTF8Writer(options[LOGDIR].value, "log.txt");
try {
if (options[LANGUAGES].doesOccur) {
generateSize(true);
return;
}
//generateSize();
//if (true) return;
//compareAvailable();
//compareAvailable();
//if (true) return;
//System.out.println(createCaseClosure(new UnicodeSet("[a{bc}{def}{oss}]")));
//System.out.println(createCaseClosure(new UnicodeSet("[a-z\u00c3\u0178{aa}]")));
@ -118,9 +143,271 @@ public class GenerateCldrTests {
}
/**
*
*/
/*
* @throws IOException
*
*/
/**
 * Writes log.html (in the log directory) summarizing the locale files found
 * under SOURCEDIR/main: counts of locales, languages and countries, split
 * into non-draft and draft, each followed by the list of native names
 * produced by showSet. The root locale is skipped.
 * The HTML writer is closed even if gathering or writing the data fails.
 * @param transliterate passed through to showSet
 * @throws IOException if the output or the locale files cannot be read or written
 */
private static void generateSize(boolean transliterate) throws IOException {
    PrintWriter logHtml = BagFormatter.openUTF8Writer(options[LOGDIR].value, "log.html");
    try {
        String dir = options[SOURCEDIR].value + "main" + File.separator;
        DraftChecker dc = new DraftChecker(dir);
        Set filenames = getMatchingXMLFiles(dir, ".*");
        Collator col = Collator.getInstance(ULocale.ENGLISH);
        Set languages = new TreeSet(col), countries = new TreeSet(col),
            draftLanguages = new TreeSet(col), draftCountries = new TreeSet(col);
        Map nativeLanguages = new TreeMap(col), nativeCountries = new TreeMap(col),
            draftNativeLanguages = new TreeMap(col), draftNativeCountries = new TreeMap(col);
        int localeCount = 0;
        int draftLocaleCount = 0;
        for (Iterator it = filenames.iterator(); it.hasNext();) {
            String localeName = (String) it.next();
            if (localeName.equals("root")) continue; // skip root
            boolean draft = dc.isDraft(localeName);
            if (draft) {
                draftLocaleCount++;
                addCounts(localeName, true, draftLanguages, draftCountries, draftNativeLanguages, draftNativeCountries, col);
            } else {
                localeCount++;
                addCounts(localeName, false, languages, countries, nativeLanguages, nativeCountries, col);
            }
            // disabled per-locale trace
            if (false) log.println(draft + ", " + localeCount + ", " + languages.size() + ", " + countries.size() + ", "
                    + draftLocaleCount + ", " + draftLanguages.size() + ", " + draftCountries.size());
        }
        // a language that also has a non-draft locale is not reported as draft
        // NOTE(review): draftCountries / draftNativeCountries are not reduced the same way -- confirm intended
        draftLanguages.removeAll(languages);
        for (Iterator it = nativeLanguages.keySet().iterator(); it.hasNext();) {
            draftNativeLanguages.remove(it.next());
        }
        logHtml.println("<html><head>");
        logHtml.println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        logHtml.println("</head><body>");
        logHtml.println("<p><b>Locales:</b> " + localeCount);
        logHtml.println("<p><b>Languages:</b> " + languages.size());
        logHtml.println(showSet(nativeLanguages, transliterate, true));
        logHtml.println("<p><b>Countries:</b> " + countries.size());
        logHtml.println(showSet(nativeCountries, transliterate, false));
        logHtml.println("<p><b>Draft locales:</b> " + draftLocaleCount);
        logHtml.println("<p><b>Draft languages:</b> " + draftLanguages.size());
        logHtml.println(showSet(draftNativeLanguages, transliterate, true));
        logHtml.println("<p><b>Draft countries:</b> " + draftCountries.size());
        logHtml.println(showSet(draftNativeCountries, transliterate, false));
        logHtml.println("</body></html>");
    } finally {
        // release the file handle on every path, not only on success
        logHtml.close();
    }
}
static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]");
/**
 * Records one locale in the given tallies. The locale name is split into a
 * language part and, when the name ends in "_XX", a two-letter country
 * part. The language (and country, if any) is added to the code sets, and
 * the title-cased native display name is mapped to the locale name. A
 * warning is logged when a non-"en" locale's native display name equals
 * the English one.
 * @param localeName locale file name, e.g. "de" or "de_AT"
 * @param isDraft only used to prefix log warnings with "D"
 * @param draftLanguages receives the language code
 * @param draftCountries receives the country code, if there is one
 * @param draftNativeLanguages receives native language name -> locale name
 * @param draftNativeCountries receives native country name -> locale name
 * @param col not used by this method
 */
private static void addCounts(String localeName, boolean isDraft, Set draftLanguages, Set draftCountries,
        Map draftNativeLanguages, Map draftNativeCountries, Comparator col) {
    ULocale uloc = new ULocale(localeName);
    final int separatorIndex = localeName.length() - 3;
    final boolean hasCountry = localeName.length() > 3 && localeName.charAt(separatorIndex) == '_';
    final String lang = hasCountry ? localeName.substring(0, separatorIndex) : localeName;
    final String country = hasCountry ? localeName.substring(separatorIndex + 1) : "";
    final String draftMarker = isDraft ? "D" : "";
    final boolean isEnglish = lang.equals("en");

    draftLanguages.add(lang);
    String nativeLanguage = uloc.getDisplayLanguage(uloc);
    String englishLanguage = uloc.getDisplayLanguage(ULocale.ENGLISH);
    if (!isEnglish && nativeLanguage.equals(englishLanguage)) {
        log.println(draftMarker +"\tWarning: in " + localeName + ", display name for " + lang + " equals English: " + nativeLanguage);
    }
    draftNativeLanguages.put(fixedTitleCase(uloc, nativeLanguage), localeName);

    if (!hasCountry) {
        return;
    }
    draftCountries.add(country);
    String nativeCountry = getFixedDisplayCountry(uloc, uloc);
    String englishCountry = getFixedDisplayCountry(uloc, ULocale.ENGLISH);
    if (!isEnglish && nativeCountry.equals(englishCountry)) {
        log.println(draftMarker + "\tWarning: in " + localeName + ", display name for " + country + " equals English: " + nativeCountry);
    }
    draftNativeCountries.put(fixedTitleCase(uloc, nativeCountry), localeName);
}
/**
 * Title-cases a string for the given locale and then restores the two
 * abbreviations that title-casing breaks: "U.s." becomes "U.S." and
 * "S.a.r." becomes "S.A.R.".
 */
static String fixedTitleCase(ULocale uloc, String in) {
    String titled = UCharacter.toTitleCase(uloc, in, null);
    String withUs = replace(titled, "U.s.", "U.S.");
    return replace(withUs, "S.a.r.", "S.A.R.");
}
/*
static void addMapSet(Map m, Object key, Object value, Comparator com) {
Set valueSet = (Set) m.get(key);
if (valueSet == null) {
valueSet = new TreeSet(com);
m.put(key, valueSet);
}
valueSet.add(value);
}
*/
/**
 * Returns the display name of uloc's country in the given language, after
 * applying the corrections registered in fixCountryNames.
 * @param uloc locale whose country is named
 * @param forLanguage locale in which the name is expressed
 * @return the corrected name if one is registered, else the display name as-is
 */
private static String getFixedDisplayCountry(ULocale uloc, ULocale forLanguage) {
    String name = uloc.getDisplayCountry(forLanguage);
    String corrected = (String) fixCountryNames.get(name);
    return corrected != null ? corrected : name;
}
// Replacement country display names, keyed by the display name to be replaced.
// Consulted by getFixedDisplayCountry. The three entries map the Cyrillic-script,
// Latin-script and English spellings of "Yugoslavia" to the corresponding
// "Serbia and Montenegro" spelling.
static Map fixCountryNames = new HashMap();
static {
fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430", "\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430");
fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora");
fixCountryNames.put("Yugoslavia", "Serbia and Montenegro");
}
// Registers rule-based transliterators loaded from text files in the TZADIR
// directory; each call reads <id with '-' replaced by '_'>.txt.
// NOTE(review): this runs during class initialization, i.e. before main() calls
// UOption.parseArgs, so a tzadir value given on the command line presumably
// cannot influence these registrations -- confirm.
static {
// HACK around lack of Armenian, Ethiopic
registerTransliteratorFromFile(options[TZADIR].value, "Latin-Armenian");
registerTransliteratorFromFile(options[TZADIR].value, "Latin-Ethiopic");
registerTransliteratorFromFile(options[TZADIR].value, "Cyrillic-Latin");
registerTransliteratorFromFile(options[TZADIR].value, "Arabic-Latin");
}
// Declared after the static block above, so it is created after those
// registrations have run.
public static final Transliterator toLatin = Transliterator.getInstance("any-latin");
/**
 * Loads transliteration rules from a UTF-8 text file and registers both the
 * forward and the reverse transliterator built from them, replacing any
 * existing registrations under the same IDs.
 * The file name is the id with '-' replaced by '_', plus ".txt". A BOM at
 * the start of any line is stripped, and lines are joined with "\r\n".
 * For an id "A-B" the reverse id is "B-A"; an id without '-' is registered
 * as "Any-id" with reverse "id-Any".
 * @param dir directory containing the rule file
 * @param id transliterator id, e.g. "Cyrillic-Latin"
 * @throws IllegalArgumentException if the file cannot be read; the
 *         underlying IOException is attached as the cause
 */
static void registerTransliteratorFromFile(String dir, String id) {
    try {
        String filename = id.replace('-', '_');
        StringBuffer buffer = new StringBuffer();
        BufferedReader br = BagFormatter.openUTF8Reader(dir, filename + ".txt");
        try {
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                if (line.length() > 0 && line.charAt(0) == '\uFEFF') line = line.substring(1);
                buffer.append(line).append("\r\n");
            }
        } finally {
            // close the reader even when reading fails part-way
            br.close();
        }
        String rules = buffer.toString();
        Transliterator t;
        int pos = id.indexOf('-');
        String rid;
        if (pos < 0) {
            rid = id + "-Any";
            id = "Any-" + id;
        } else {
            rid = id.substring(pos+1) + "-" + id.substring(0, pos);
        }
        Transliterator.unregister(id);
        t = Transliterator.createFromRules(id, rules, Transliterator.FORWARD);
        Transliterator.registerInstance(t);

        Transliterator.unregister(rid);
        t = Transliterator.createFromRules(rid, rules, Transliterator.REVERSE);
        Transliterator.registerInstance(t);
        System.out.println("Registered new Transliterator: " + id + ", " + rid);
    } catch (IOException e) {
        e.printStackTrace();
        IllegalArgumentException failure = new IllegalArgumentException("Can't open " + dir + ", " + id);
        // keep the original failure attached (initCause works on pre-1.5 runtimes too)
        failure.initCause(e);
        throw failure;
    }
}
/**
 * Renders the keys of a (display name -> locale name) map as a
 * comma-separated run of HTML spans. Each span shows the HTML-escaped
 * display name and carries a title made of the language or country code
 * and its English display name, plus -- when requested and possible -- a
 * Latin transliteration of the name. Names containing right-to-left
 * characters are wrapped in U+200E marks.
 * @param nativeCountries map from native display name to locale name
 *        (used for languages as well as countries)
 * @param transliterate whether to add a Latin transliteration to the title
 *        for non-Latin names (never done for language "ja")
 * @param isLanguage true to title by language, false to title by country
 * @return the HTML fragment
 */
private static String showSet(Map nativeCountries, boolean transliterate, boolean isLanguage) {
    UnicodeSet BIDI_R = new UnicodeSet("[[:Bidi_Class=R:][:Bidi_Class=AL:]]");
    StringBuffer html = new StringBuffer();
    Iterator entries = nativeCountries.entrySet().iterator();
    while (entries.hasNext()) {
        Map.Entry entry = (Map.Entry) entries.next();
        String name = (String) entry.getKey();
        String locale = (String) entry.getValue();

        // split "xx_YY" into language and country; anything else is all language
        final int separatorIndex = locale.length() - 3;
        final boolean hasCountry = locale.length() > 3 && locale.charAt(separatorIndex) == '_';
        final String lang = hasCountry ? locale.substring(0, separatorIndex) : locale;
        final String country = hasCountry ? locale.substring(separatorIndex + 1) : "";

        if (html.length() != 0) {
            html.append(", ");
        }

        String title;
        if (isLanguage) {
            title = lang + ", " + new ULocale(locale).getDisplayLanguage(ULocale.ENGLISH);
        } else {
            title = country + ", " + getFixedDisplayCountry(new ULocale(locale), ULocale.ENGLISH);
        }
        if (transliterate && NON_LATIN.containsSome(name) && !lang.equals("ja")) {
            String transName = fixedTitleCase(ULocale.ENGLISH, toLatin.transliterate(name));
            if (!NON_LATIN.containsSome(transName)) {
                title += ", " + transName;
            } else {
                log.println("Can't transliterate " + name + ": " + transName);
            }
        }

        String before = "";
        String after = "";
        if (title.length() != 0) {
            before = "<span title=\'" + BagFormatter.toHTML.transliterate(title) + "'>";
            after = "</span>";
        }

        final boolean isBIDI = BIDI_R.containsSome(name);
        if (isBIDI) {
            html.append('\u200E');
        }
        html.append(before);
        html.append(BagFormatter.toHTML.transliterate(name));
        html.append(after);
        if (isBIDI) {
            html.append('\u200E');
        }
    }
    return html.toString();
}
/**
 * Determines whether a locale's XML file is marked as draft, caching the
 * answer per locale name. A locale is draft when the first line containing
 * "&lt;ldml" also contains the text "draft".
 */
public static class DraftChecker {
    // directory prefix that locale file names are resolved against
    String dir;
    // locale name -> TRUE or FALSE sentinel
    Map cache = new HashMap();
    // per-instance sentinels stored in the cache and compared by identity
    Object TRUE = new Object();
    Object FALSE = new Object();

    /**
     * @param dir directory containing the locale .xml files
     */
    public DraftChecker(String dir) {
        this.dir = dir;
    }

    /**
     * @param localeName locale file name without the ".xml" extension
     * @return true if the file's "&lt;ldml" line mentions "draft"
     * @throws IllegalArgumentException if the file cannot be read (the
     *         IOException is attached as the cause) or contains no
     *         "&lt;ldml" line
     */
    public boolean isDraft(String localeName) {
        Object check = cache.get(localeName);
        if (check != null) {
            return check == TRUE;
        }
        try {
            BufferedReader reader = BagFormatter.openUTF8Reader(dir, localeName + ".xml");
            try {
                while (true) {
                    String line = reader.readLine();
                    if (line == null) {
                        // end of file without an <ldml line: fail explicitly instead of
                        // relying on an assert that is a no-op when assertions are disabled
                        throw new IllegalArgumentException("No <ldml element in " + localeName + ": " + dir + localeName + ".xml");
                    }
                    if (line.indexOf("<ldml") >= 0) {
                        check = line.indexOf("draft") >= 0 ? TRUE : FALSE;
                        break;
                    }
                }
            } finally {
                // close the reader on every path, including failures
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            IllegalArgumentException failure =
                new IllegalArgumentException("Failure on " + localeName + ": " + dir + localeName + ".xml");
            // keep the original failure attached (initCause works on pre-1.5 runtimes too)
            failure.initCause(e);
            throw failure;
        }
        cache.put(localeName, check);
        return check == TRUE;
    }
}
/*
private static void compareAvailable() {
ULocale[] cols = Collator.getAvailableULocales();
Locale[] alocs = NumberFormat.getAvailableLocales();
@ -137,11 +424,29 @@ public class GenerateCldrTests {
}
*/
/**
* @param sLocs
*/
private static void showLocales(Set sLocs) {
for (Iterator it = sLocs.iterator(); it.hasNext();) {
/**
 * Logs one line per available ULocale: the locale, its English display
 * name and its own-language display name. The line is prefixed with
 * "FAILURE" when the language or country lacks a localized name either in
 * English or in the locale itself.
 */
private static void checkLocaleNames() {
    ULocale[] locales = ULocale.getAvailableLocales();
    for (int i = 0; i < locales.length; ++i) {
        ULocale current = locales[i];
        boolean allLocalized = hasLocalizedCountryFor(ULocale.ENGLISH, current)
                && hasLocalizedLanguageFor(ULocale.ENGLISH, current)
                && hasLocalizedCountryFor(current, current)
                && hasLocalizedLanguageFor(current, current);
        String prefix = allLocalized ? " \t" : "FAILURE\t";
        log.print(prefix);
        log.println(current + "\t" + current.getDisplayName(ULocale.ENGLISH) + "\t" + current.getDisplayName(current));
    }
}
/**
* @param sLocs
*/
private static void showLocales(Set sLocs) {
for (Iterator it = sLocs.iterator(); it.hasNext();) {
String s = (String) it.next();
log.println(s + "\t" + ULocale.getDisplayLanguage(s,"en"));
}
@ -239,9 +544,14 @@ public class GenerateCldrTests {
void generate(String pat) throws Exception {
cldrOthers = new CldrOthers(options[SOURCEDIR].value + "main" + File.separator, pat);
cldrOthers.show();
//if (true) return;
cldrCollations = new CldrCollations(options[SOURCEDIR].value + "collation" + File.separator, pat);
cldrCollations.show();
cldrOthers = new CldrOthers(options[SOURCEDIR].value + "main" + File.separator, pat);
cldrOthers.show();
getLocaleList();
Matcher m = Pattern.compile(pat).matcher("");
@ -600,8 +910,57 @@ public class GenerateCldrTests {
return cldrCollations.getInstance(loc1).equals(cldrCollations.getInstance(loc2)); // Collator.getInstance(loc1).equals(Collator.getInstance(loc2));
}
};
static ULocale zhHack = new ULocale("zh"); // FIXME hack for zh
// DataShower that emits the collation test data for one locale: it builds the set
// of characters and strings to sort (tailored set plus exemplars, case-closed and
// passed through nfc()), logs tailored items that are missing from the exemplars,
// and hands the set to doCollationResult.
DataShower CollationShower = new DataShower() {
public void show(ULocale locale, Collection others) {
// the bare "zh" locale is skipped entirely (see zhHack)
if (locale.equals(zhHack)) return;
showLocales("collation", others);
Collator col = cldrCollations.getInstance(locale); // Collator.getInstance(locale);
UnicodeSet tailored = col.getTailoredSet();
// other locales whose language is zh get a-z (minus v) added to the tailored set
if (locale.getLanguage().equals("zh")) {
tailored.addAll(new UnicodeSet("[[a-z]-[v]]"));
log.println("HACK for Pinyin");
}
tailored = createCaseClosure(tailored);
tailored = nfc(tailored);
//System.out.println(tailored.toPattern(true));
UnicodeSet exemplars = getExemplarSet(locale, UnicodeSet.CASE);
// add all the exemplars
// (disabled: the loop below is guarded by "if (false)" and never runs)
if (false)
for (Iterator it = others.iterator(); it.hasNext();) {
exemplars.addAll(getExemplarSet((ULocale) it.next(),
UnicodeSet.CASE));
}
exemplars = createCaseClosure(exemplars);
exemplars = nfc(exemplars);
//System.out.println(exemplars.toPattern(true));
// from here on tailored is a superset of exemplars
tailored.addAll(exemplars);
//UnicodeSet tailoredMinusHan = new
// UnicodeSet(tailored).removeAll(SKIP_COLLATION_SET);
// report anything that is tailored but not an exemplar
if (!exemplars.containsAll(tailored)) {
//BagFormatter bf = new BagFormatter();
log.println("In Tailored, but not Exemplar; Locale: " + locale
+ "\t" + locale.getDisplayName());
log.println(new UnicodeSet(tailored).removeAll(exemplars)
.toPattern(false));
//bf.(log,"tailored", tailored, "exemplars", exemplars);
log.flush();
}
// fixed extra test items are added, then everything in SKIP_COLLATION_SET is dropped
tailored.addAll(new UnicodeSet("[\\ .02{12}]"));
tailored.removeAll(SKIP_COLLATION_SET);
SortedBag bag = new SortedBag(col);
doCollationResult(col, tailored, bag);
out.println(" </collation>");
}
};
/*
public void show(ULocale locale, Collection others) {
showLocales("collation", others);
@ -641,6 +1000,7 @@ public class GenerateCldrTests {
doCollationResult(col, tailored, bag);
out.println(" </collation>");
}};
*/
static final UnicodeSet SKIP_COLLATION_SET = new UnicodeSet(
"[[:script=han:][:script=hangul:]-[\u4e00-\u4eff \u9f00-\u9fff \uac00-\uacff \ud700-\ud7ff]]");
@ -804,6 +1164,13 @@ public class GenerateCldrTests {
return null;
}
/**
 * Replaces every non-overlapping occurrence of pattern in source, scanning
 * left to right. Replaced text is never rescanned, so the replacement may
 * itself contain the pattern, and may be empty.
 * @param source the text to search
 * @param pattern the literal text to find; an empty pattern matches nothing
 * @param replacement the text to substitute for each occurrence
 * @return the text with all occurrences replaced, or source itself if there
 *         were none
 */
public static String replace(String source, String pattern, String replacement) {
    if (pattern.length() == 0) {
        return source; // an empty pattern would otherwise match forever
    }
    int pos = source.indexOf(pattern);
    if (pos < 0) {
        return source;
    }
    StringBuffer result = new StringBuffer(source.length());
    int start = 0;
    while (pos >= 0) {
        result.append(source.substring(start, pos)).append(replacement);
        // resume after the matched text, in the ORIGINAL string
        start = pos + pattern.length();
        pos = source.indexOf(pattern, start);
    }
    result.append(source.substring(start));
    return result.toString();
}
static class CldrCollations {
Set validLocales = new TreeSet();
@ -867,13 +1234,6 @@ public class GenerateCldrTests {
}
}
/**
 * Replaces every non-overlapping occurrence of pattern in source, scanning
 * left to right. Replaced text is never rescanned, so the replacement may
 * itself contain the pattern, and may be empty.
 * @param source the text to search
 * @param pattern the literal text to find; an empty pattern matches nothing
 * @param replacement the text to substitute for each occurrence
 * @return the text with all occurrences replaced, or source itself if there
 *         were none
 */
public static String replace(String source, String pattern, String replacement) {
    if (pattern.length() == 0) {
        return source; // an empty pattern would otherwise match forever
    }
    int pos = source.indexOf(pattern);
    if (pos < 0) {
        return source;
    }
    StringBuffer result = new StringBuffer(source.length());
    int start = 0;
    while (pos >= 0) {
        result.append(source.substring(start, pos)).append(replacement);
        // resume after the matched text, in the ORIGINAL string
        start = pos + pattern.length();
        pos = source.indexOf(pattern, start);
    }
    result.append(source.substring(start));
    return result.toString();
}
static Transliterator fromHex = Transliterator.getInstance("hex-any");
private void getCollationRules(String locale) throws Exception {
@ -886,7 +1246,10 @@ public class GenerateCldrTests {
Map types_rules = new TreeMap();
locale_types_rules.put(locale, types_rules);
for (Resource current = resource.first; current != null; current = current.next) {
//System.out.println(current.name);
if (current.name == null) {
log.println("Collation: null name found in " + locale);
continue;
}
if (current instanceof ICUResourceWriter.ResourceTable) {
ICUResourceWriter.ResourceTable table = (ICUResourceWriter.ResourceTable) current;
for (Resource current2 = table.first; current2 != null; current2 = current2.next) {
@ -905,7 +1268,7 @@ public class GenerateCldrTests {
String rules = fromHex.transliterate(foo.val);
RuleBasedCollator fixed = generateCollator(locale, current.name, foo.name, rules);
if (fixed != null) {
log.println("Rules for: " + locale + "," + current.name);
log.println("Rules for: " + locale + ", " + current.name);
log.println(rules);
if (!rules.equals(foo.val)) {
log.println("Original Rules from Ram: ");

View File

@ -90,7 +90,8 @@ public class GenerateSidewaysView {
SKIP = 5,
TZADIR = 6,
NONVALIDATING = 7,
SHOW_DTD = 8;
SHOW_DTD = 8,
TRANSLIT = 9;
private static final String NEWLINE = "\n";
@ -104,11 +105,11 @@ public class GenerateSidewaysView {
UOption.create("tzadir", 't', UOption.REQUIRES_ARG).setDefault("C:\\ICU4J\\icu4j\\src\\com\\ibm\\icu\\dev\\tool\\cldr\\"),
UOption.create("nonvalidating", 'n', UOption.NO_ARG),
UOption.create("dtd", 'w', UOption.NO_ARG),
UOption.create("transliterate", 'y', UOption.NO_ARG),
};
private static String timeZoneAliasDir = null;
public static void main(String[] args) throws SAXException, IOException {
UOption.parseArgs(args, options);
Matcher skipper = Pattern.compile(options[SKIP].value).matcher("");
@ -1553,6 +1554,10 @@ public class GenerateSidewaysView {
}
*/
void showCacheData() throws IOException {
UnicodeSet untransliteratedCharacters = new UnicodeSet();
Set translitErrors = new TreeSet();
GenerateCldrTests.DraftChecker dc = new GenerateCldrTests.DraftChecker(options[SOURCEDIR].value);
dc.isDraft("en");
writeStyleSheet();
PrintWriter out = null;
String lastChainName = "";
@ -1597,18 +1602,35 @@ public class GenerateSidewaysView {
files.addAll(remainingFiles);
dataStyle = " class='nodata'";
}
String extra = "";
if (data.string != null && options[TRANSLIT].doesOccur
&& GenerateCldrTests.NON_LATIN.containsSome(data.string)) {
try {
extra = GenerateCldrTests.toLatin.transliterate(data.string);
untransliteratedCharacters.addAll(extra);
if (extra.equals(data.string)) extra = "";
else extra = "<br>(\"" + BagFormatter.toHTML.transliterate(extra) + "\")";
} catch (RuntimeException e) {
translitErrors.add(e.getMessage());
}
}
out.print("<tr><th" + dataStyle +
(lineCounter == 1 ? " width='20%'" : "")
+ ">\"" + data + "\"</th><td>");
+ ">\"" + data + "\""
+ extra
+ "</th><td>");
boolean first = true;
for (Iterator it3 = files.iterator(); it3.hasNext();) {
if (first) first = false;
else out.print(" ");
String localeID = (String)it3.next();
boolean emphasize = localeID.equals("root") || localeID.indexOf('_') >= 0;
if (dc.isDraft(localeID)) out.print("<i>");
if (emphasize) out.print("<b>");
out.print("\u00B7" + localeID + "\u00B7");
if (emphasize) out.print("</b>");
if (dc.isDraft(localeID)) out.print("</i>");
}
out.println("</td></tr>");
}
@ -1622,6 +1644,15 @@ public class GenerateSidewaysView {
}
writeIndex();
tripleData.writeData();
untransliteratedCharacters.retainAll(GenerateCldrTests.NON_LATIN);
log.println("Untranslated Characters*: " + untransliteratedCharacters.toPattern(false));
log.println("Untranslated Characters* (hex): " + untransliteratedCharacters.toPattern(true));
untransliteratedCharacters.closeOver(UnicodeSet.CASE);
log.println("Untranslated Characters: " + untransliteratedCharacters.toPattern(false));
log.println("Untranslated Characters (hex): " + untransliteratedCharacters.toPattern(true));
for (Iterator it = translitErrors.iterator(); it.hasNext();) {
log.println(it.next());
}
}
/**
@ -1651,8 +1682,9 @@ public class GenerateSidewaysView {
"Each value is listed under the field designator (in XML XPath format), " +
"followed by all the locales that use it. " +
"Locales are omitted if the value would be the same as the parent's. " +
"The locales are listed in the format: \u00B7aa\u00B7 for searching. " +
"The value appears in red if it is the same as the root. </p>");
"The locales are listed in the format: \u00B7aa\u00B7 for searching. " +
"The value appears in red if it is the same as the root. " +
"Draft locales are italic-gray; territory locales are bold.</p>");
out.println("<table>");
return out;
}
@ -1661,6 +1693,7 @@ public class GenerateSidewaysView {
out.println(".head { font-weight:bold; background-color:#DDDDFF }");
out.println("td, th { border: 1px solid #0000FF; text-align }");
out.println("th { width:10% }");
out.println("i { color: gray }");
out.println(".nodata { background-color:#FF0000 }");
out.println("table {margin-top: 1em}");
out.close();

View File

@ -38,6 +38,8 @@ import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.SimpleDateFormat;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* This is a file that runs the CLDR tests for ICU4J, to verify that ICU4J implements them
@ -201,37 +203,53 @@ public class TestCldr {
static String[] NumberNames = {"standard", "integer", "decimal", "percent", "scientific"};
// ============ Handler for Collation ============
static UnicodeSet controlsAndSpace = new UnicodeSet("[:cc:]");
/**
 * Returns a copy of the input with every code point contained in toRemove
 * left out. The string is walked by code point, not by char.
 */
static String remove(String in, UnicodeSet toRemove) {
    StringBuffer kept = new StringBuffer();
    int index = 0;
    while (index < in.length()) {
        int codePoint = UTF16.charAt(in, index);
        if (!toRemove.contains(codePoint)) {
            UTF16.append(kept, codePoint);
        }
        index += UTF16.getCharCount(codePoint);
    }
    return kept.toString();
}
{
addHandler("collation", new Handler() {
public void handleResult(ULocale currentLocale, String value) {
Collator col = Collator.getInstance(currentLocale);
String lastLine = "";
int count = 0;
for (int pos = 0; pos < value.length();) {
int nextPos = value.indexOf('\n', pos);
if (nextPos < 0)
nextPos = value.length();
String line = value.substring(pos, nextPos).trim(); // HACK for SAX
if (line.length() != 0) { // HACK for SAX
int comp = col.compare(lastLine, line);
if (comp > 0) {
failures++;
logln("\tLine " + (count + 1) + "\tFailure: " + showString(lastLine) + " should be leq " + showString(line));
} else if (DEBUG) {
System.out.println("OK: " + line);
}
}
pos = nextPos + 1;
lastLine = line;
count++;
}
}
});
// Collation handler: the value is a newline-separated list of strings that is
// expected to be in ascending collation order for the current locale. Each
// non-blank line is compared with the previous non-blank line; a previous line
// that sorts after the current one is counted as a failure and logged.
addHandler("collation", new Handler() {
public void handleResult(ULocale currentLocale, String value) {
Collator col = Collator.getInstance(currentLocale);
String lastLine = "";
int count = 0;
for (int pos = 0; pos < value.length();) {
int nextPos = value.indexOf('\n', pos);
if (nextPos < 0)
nextPos = value.length();
String line = value.substring(pos, nextPos);
// strips every character in the controlsAndSpace set before comparing
line = remove(line, controlsAndSpace); // HACK for SAX
if (line.trim().length() != 0) { // HACK for SAX
int comp = col.compare(lastLine, line);
if (comp > 0) {
failures++;
logln("\tLine " + (count + 1) + "\tFailure: "
+ showString(lastLine) + " should be leq "
+ showString(line));
} else if (DEBUG) {
System.out.println("OK: " + line);
}
// only non-blank lines become the reference for the next comparison
lastLine = line;
}
pos = nextPos + 1;
// count advances for every line, blank or not, so reported line numbers match the input
count++;
}
}
});
// ============ Handler for Numbers ============
addHandler("number", new Handler() {
public void handleResult(ULocale locale, String result) {
addHandler("number", new Handler() {
public void handleResult(ULocale locale, String result) {
NumberFormat nf = null;
double v = Double.NaN;
for (Iterator it = settings.keySet().iterator(); it.hasNext();) {

View File

@ -314,9 +314,10 @@ public class MakeUnicodeFiles {
addValueComments(property, value, comments);
comments = "";
if (line.startsWith("Generate:")) {
filesToDo = Utility.split(lineValue, ' ');
if (filesToDo.length == 0) {
filesToDo = new String[] {""};
filesToDo = Utility.split(lineValue.trim(), ' ');
if (filesToDo.length == 0
|| (filesToDo.length == 1 && filesToDo[0].length() == 0)) {
filesToDo = new String[] {".*"};
}
} else if (line.startsWith("DeltaVersion:")) {
dVersion = Integer.parseInt(lineValue);
@ -476,24 +477,22 @@ public class MakeUnicodeFiles {
}
public static void generateFile() throws IOException {
String[] lines = new String[2];
Utility.filesAreIdentical("C:\\DATA\\UCD\\4.0.1-Update\\CaseFolding-4.0.1.txt",
"C:\\DATA\\GEN\\DerivedData\\CaseFolding-4.1.0d13.txt", lines);
for (int i = 0; i < Format.theFormat.filesToDo.length; ++i) {
String fileName =
Format.theFormat.filesToDo[i].trim().toLowerCase(
Locale.ENGLISH);
String fileNamePattern =
Format.theFormat.filesToDo[i].trim(); // .toLowerCase(Locale.ENGLISH);
Matcher matcher = Pattern.compile(fileNamePattern, Pattern.CASE_INSENSITIVE).matcher("");
Iterator it = Format.theFormat.getFiles().iterator();
boolean gotOne = false;
while (it.hasNext()) {
String propname = (String) it.next();
if (!propname.toLowerCase(Locale.ENGLISH).startsWith(fileName)) continue;
if (!matcher.reset(propname).matches()) continue;
//if (!propname.toLowerCase(Locale.ENGLISH).startsWith(fileName)) continue;
generateFile(propname);
gotOne = true;
}
if (!gotOne) {
throw new IllegalArgumentException(
"Non-matching file name: " + fileName);
"Non-matching file name: " + fileNamePattern);
}
}
}
@ -715,7 +714,8 @@ public class MakeUnicodeFiles {
List propList = Format.theFormat.getPropertiesFromFile(filename);
for (Iterator propIt = propList.iterator(); propIt.hasNext();) {
BagFormatter bf = new BagFormatter(toolFactory);
UnicodeProperty prop = toolFactory.getProperty((String) propIt.next());
String nextPropName = (String) propIt.next();
UnicodeProperty prop = toolFactory.getProperty(nextPropName);
String name = prop.getName();
System.out.println("Property: " + name + "; " + prop.getTypeName(prop.getType()));
pw.println();

View File

@ -1,6 +1,18 @@
Generate:
DeltaVersion: 7
File: GraphemeClusterBreakProperty
Property: Grapheme_Cluster_Break
Format: skipValue=Other
File: WordBreakProperty
Property: Word_Break
Format: skipValue=Other
File: SentenceBreakProperty
Property: Sentence_Break
Format: skipValue=Other
File: Blocks
Property: Block
# Note: When comparing block names, casing, whitespace, hyphens,
@ -41,6 +53,9 @@ Value: 3.2
Value: 4.0
# Newly assigned in Unicode 4.0.0 (April, 2003)
Value: 4.1
# Newly assigned in Unicode 4.1.0 (XXX, 2005)
File: extracted/DerivedBidiClass
Property: Bidi_Class
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
@ -86,7 +101,7 @@ Property: ID_Start
Property: ID_Continue
# Derived Property: ID_Continue
# Characters that can continue an identifier.
# Generated from: ID_Start + Mn+Mc+Nd+Pc
# Generated from: ID_Start + Mn+Mc+Nd+Pc + Other_ID_Continue
# NOTE: Cf characters should be filtered out.
@ -109,7 +124,8 @@ Property: XID_Continue
Property: Default_Ignorable_Code_Point
# Derived Property: Default_Ignorable_Code_Point
# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - Annotation_characters
# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters
# - White_Space - FFF9..FFFB (Annotation Characters)
Property: Grapheme_Extend
# Derived Property: Grapheme_Extend
@ -307,6 +323,9 @@ Property: Other_ID_Continue
Property: STerm
Property: Variation_Selector
Property: Pattern_White_Space
Property: Pattern_Syntax
File: PropertyAliases
Property: SPECIAL
@ -315,7 +334,6 @@ File: PropertyValueAliases
Property: SPECIAL
File: Scripts
Property: Script
Format: nameStyle=none skipUnassigned=Common

View File

@ -12,6 +12,7 @@ import java.util.TreeSet;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
@ -49,8 +50,8 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getName(codepoint);
}
}.setMain("Name", "na", UnicodeProperty.MISC, version)
.setValues("<string>"));
}.setValues("<string>")
.setMain("Name", "na", UnicodeProperty.MISC, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
@ -63,24 +64,24 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
protected UnicodeMap _getUnicodeMap() {
return ucd.blockData;
}
}.setMain("Block", "blk", UnicodeProperty.CATALOG, version)
.setValues(ucd.getBlockNames(null)));
}.setValues(ucd.getBlockNames(null))
.setMain("Block", "blk", UnicodeProperty.CATALOG, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
//if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getBidiMirror(codepoint);
}
}.setMain("Bidi_Mirroring_Glyph", "bmg", UnicodeProperty.STRING, version)
.setValues("<string>"));
}.setValues("<string>")
.setMain("Bidi_Mirroring_Glyph", "bmg", UnicodeProperty.STRING, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
//if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getCase(codepoint,UCD_Types.FULL,UCD_Types.FOLD);
}
}.setMain("Case_Folding", "cf", UnicodeProperty.STRING, version)
.setValues("<string>"));
}.setValues("<string>")
.setMain("Case_Folding", "cf", UnicodeProperty.STRING, version));
add(new UnicodeProperty.SimpleProperty() {
NumberFormat nf = NumberFormat.getInstance();
@ -121,8 +122,8 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version)
.setValues(LONG_YES_NO, YES_NO));
}.setValues(LONG_YES_NO, YES_NO)
.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
@ -133,8 +134,8 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version)
.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE));
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
.setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
@ -145,8 +146,8 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version)
.setValues(LONG_YES_NO, YES_NO));
}.setValues(LONG_YES_NO, YES_NO)
.setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
@ -157,8 +158,11 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version)
.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE));
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
.setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));
/*
add(new UnicodeProperty.SimpleProperty() {
@ -207,6 +211,102 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
add(new ToolUnicodeProperty(name));
}
// Registers the enumerated property Grapheme_Cluster_Break (short name GCB).
// The value map is built in the anonymous subclass's instance initializer from
// other properties already available through getProperty().
add(new UnicodeProperty.UnicodeMapProperty() {
{
unicodeMap = new UnicodeMap();
// NOTE(review): presumably makes a second assignment to an already-assigned code point
// an error, so the order of the puts below is significant -- confirm against UnicodeMap.
unicodeMap.setErrorOnReset(true);
unicodeMap.put(0xD, "CR");
unicodeMap.put(0xA, "LF");
UnicodeProperty cat = getProperty("General_Category");
// Control = Line_Separator + Paragraph_Separator + Control + Format,
// minus CR and LF (assigned above) and minus U+200C / U+200D.
UnicodeSet temp = cat.getSet("Line_Separator")
.addAll(cat.getSet("Paragraph_Separator"))
.addAll(cat.getSet("Control"))
.addAll(cat.getSet("Format"))
.remove(0xD).remove(0xA).remove(0x200C).remove(0x200D);
unicodeMap.putAll(temp, "Control");
// Extend = every code point with Grapheme_Extend=true.
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
unicodeMap.putAll(graphemeExtend,"Extend");
// The Hangul classes are copied one-for-one from Hangul_Syllable_Type.
UnicodeProperty hangul = getProperty("Hangul_Syllable_Type");
unicodeMap.putAll(hangul.getSet("L"),"L");
unicodeMap.putAll(hangul.getSet("V"),"V");
unicodeMap.putAll(hangul.getSet("T"),"T");
unicodeMap.putAll(hangul.getSet("LV"),"LV");
unicodeMap.putAll(hangul.getSet("LVT"),"LVT");
// NOTE(review): presumably the value reported for code points not assigned above -- confirm.
unicodeMap.setMissing("Other");
}
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version));
// Registers the enumerated property Word_Break (short name WB).
// The value map is built in the anonymous subclass's instance initializer from
// General_Category, Script, Alphabetic, Ideographic, Line_Break and Grapheme_Extend.
add(new UnicodeProperty.UnicodeMapProperty() {
{
unicodeMap = new UnicodeMap();
// NOTE(review): presumably makes a second assignment to an already-assigned code point
// an error, so the order of the puts below is significant -- confirm against UnicodeMap.
unicodeMap.setErrorOnReset(true);
UnicodeProperty cat = getProperty("General_Category");
// Format = General_Category Format, minus U+200C / U+200D.
unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format");
UnicodeProperty script = getProperty("Script");
// Katakana = Script Katakana plus an explicit list of additional code points.
unicodeMap.putAll(script.getSet("Katakana")
.addAll(new UnicodeSet("[\u3031\u3032\u3033\u3034\u3035\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]"))
, "Katakana");
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
// ALetter = Alphabetic plus U+00A0 and U+05F3, minus Ideographic, the Katakana class
// assigned above, the Thai/Lao/Hiragana scripts, and Grapheme_Extend.
unicodeMap.putAll(getProperty("Alphabetic").getSet("true")
.add(0xA0).add(0x05F3)
.removeAll(getProperty("Ideographic").getSet("true"))
.removeAll(unicodeMap.getSet("Katakana"))
.removeAll(script.getSet("Thai"))
.removeAll(script.getSet("Lao"))
.removeAll(script.getSet("Hiragana"))
.removeAll(graphemeExtend),
"ALetter");
unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]")
,"MidLetter");
UnicodeProperty lineBreak = getProperty("Line_Break");
// MidNum = Line_Break Infix_Numeric, minus U+003A which is already MidLetter.
unicodeMap.putAll(lineBreak.getSet("Infix_Numeric")
.remove(0x003A), "MidNum");
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
// Connector punctuation (minus U+30FB and U+FF65) forms its own class, ExtendNumLet;
// it was previously tagged "Numeric", duplicating the value of the line above.
unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet");
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
unicodeMap.setMissing("Other");
}
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version));
// Registers the enumerated property Sentence_Break (short name SB).
// The value map is built in the anonymous subclass's instance initializer; several
// classes are defined by subtracting classes assigned earlier in this same map.
add(new UnicodeProperty.UnicodeMapProperty() {
{
unicodeMap = new UnicodeMap();
// NOTE(review): presumably makes a second assignment to an already-assigned code point
// an error, so the order of the puts below is significant -- confirm against UnicodeMap.
unicodeMap.setErrorOnReset(true);
unicodeMap.putAll(new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"), "Sep");
UnicodeProperty cat = getProperty("General_Category");
// Format = General_Category Format, minus U+200C / U+200D.
unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format");
// Sp = whitespace, minus the Sep class assigned above and minus U+00A0.
// NOTE(review): the property is looked up as "Whitespace"; assumes getProperty()
// resolves that spelling -- confirm.
unicodeMap.putAll(getProperty("Whitespace").getSet("true")
.removeAll(unicodeMap.getSet("Sep"))
.remove(0xA0), "Sp");
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
// Lower = Lowercase minus Grapheme_Extend.
unicodeMap.putAll(getProperty("Lowercase").getSet("true")
.removeAll(graphemeExtend), "Lower");
// Upper = Uppercase plus General_Category Titlecase_Letter.
unicodeMap.putAll(getProperty("Uppercase").getSet("true")
.addAll(cat.getSet("Titlecase_Letter"))
, "Upper");
// OLetter = Alphabetic plus U+00A0 and U+05F3, minus the Lower and Upper classes
// assigned above and minus Grapheme_Extend.
UnicodeSet temp = getProperty("Alphabetic").getSet("true")
.add(0xA0).add(0x5F3)
.removeAll(unicodeMap.getSet("Lower"))
.removeAll(unicodeMap.getSet("Upper"))
.removeAll(graphemeExtend);
unicodeMap.putAll(temp, "OLetter");
UnicodeProperty lineBreak = getProperty("Line_Break");
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
// ATerm is the single code point U+002E; STerm is the STerm property minus ATerm.
unicodeMap.put(0x002E, "ATerm");
unicodeMap.putAll(getProperty("STerm").getSet("true")
.removeAll(unicodeMap.getSet("ATerm")), "STerm");
// Close = Open_Punctuation + Close_Punctuation + Line_Break Quotation,
// minus U+05F3 and minus the ATerm and STerm classes assigned above.
unicodeMap.putAll(cat.getSet("Open_Punctuation")
.addAll(cat.getSet("Close_Punctuation"))
.addAll(lineBreak.getSet("Quotation"))
.remove(0x05F3)
.removeAll(unicodeMap.getSet("ATerm"))
.removeAll(unicodeMap.getSet("STerm"))
, "Close");
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
// NOTE(review): presumably the value reported for code points not assigned above -- confirm.
unicodeMap.setMissing("Other");
}
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version));
}
static String[] YES_NO_MAYBE = {"N", "M", "Y"};

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2004/11/12 23:17:15 $
* $Revision: 1.35 $
* $Date: 2004/11/13 23:10:32 $
* $Revision: 1.36 $
*
*******************************************************************************
*/
@ -1579,8 +1579,13 @@ to guarantee identifier closure.
//T = Mc + (Cf - ZWNJ - ZWJ)
int cp = uData.codePoint;
byte old = uData.joiningType;
byte cat = uData.generalCategory;
byte old = uData.joiningType;
byte cat = uData.generalCategory;
if (cat == Me) {
if (compositeVersion >= 0x40100) {
uData.joiningType = JT_T;
}
}
//if (cp == 0x200D) {
// uData.joiningType = JT_C;
//} else

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2004/11/12 23:17:15 $
* $Revision: 1.28 $
* $Date: 2004/11/13 23:10:32 $
* $Revision: 1.29 $
*
*******************************************************************************
*/
@ -154,6 +154,8 @@ final class UCD_Names implements UCD_Types {
"STerm",
"Variation_Selector",
"Other_ID_Continue",
"Pattern_White_Space",
"Pattern_Syntax"
};
static final String[] SHORT_BP = {
@ -191,7 +193,9 @@ final class UCD_Names implements UCD_Types {
"OIDS",
"STerm",
"VS",
"OIDC"
"OIDC",
"PatWS",
"PatSyn"
};
/*
@ -253,7 +257,7 @@ final class UCD_Names implements UCD_Types {
"CM", "BB", "BA", "SP", "BK", "CR", "LF", "CB",
"SA", "AI", "B2", "SG", "ZW",
"NL",
"WJ",
"WJ", "JL", "JV", "JT", "H2", "H3"
//"JL",
//"JV",
//"JT",
@ -269,7 +273,7 @@ final class UCD_Names implements UCD_Types {
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
"ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace",
"Next_Line",
"Word_Joiner"
"Word_Joiner", "JL", "JV", "JT", "H2", "H3"
//"Leading_Jamo",
//"Vowel_Jamo",
//"Trailing_Jamo",
@ -412,7 +416,8 @@ final class UCD_Names implements UCD_Types {
"3.0",
"3.1",
"3.2",
"4.0"
"4.0",
"4.1",
};

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2004/03/11 19:03:16 $
* $Revision: 1.28 $
* $Date: 2004/11/13 23:10:32 $
* $Revision: 1.29 $
*
*******************************************************************************
*/
@ -15,7 +15,7 @@ package com.ibm.text.UCD;
public interface UCD_Types {
static final byte BINARY_FORMAT = 15; // bumped if binary format of UCD changes. Forces rebuild
static final byte BINARY_FORMAT = 16; // bumped if binary format of UCD changes. Forces rebuild
public static final String BASE_DIR = "C:\\DATA\\";
public static final String UCD_DIR = BASE_DIR + "UCD\\";
@ -214,7 +214,9 @@ public interface UCD_Types {
Sentence_Terminal = 32,
Variation_Selector = 33,
Other_ID_Continue = 34,
LIMIT_BINARY_PROPERTIES = 35;
Pattern_White_Space = 35,
Pattern_Syntax = 36,
LIMIT_BINARY_PROPERTIES = 37;
/*
static final int
@ -247,10 +249,15 @@ public interface UCD_Types {
LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28,
LB_NL = 29,
LB_WJ = 30,
LB_JL = 31,
LB_JV = 32,
LB_JT = 33,
LB_H2 = 34,
LB_H3 = 35,
//LB_JL = 29,
//LB_JV = 30,
//LB_JT = 31,
LIMIT_LINE_BREAK = 31,
LIMIT_LINE_BREAK = 36,
LB_LIMIT = LIMIT_LINE_BREAK;
// east asian width
@ -394,7 +401,8 @@ public interface UCD_Types {
AGE31 = 5,
AGE32 = 6,
AGE40 = 7,
LIMIT_AGE = 8;
AGE41 = 8,
LIMIT_AGE = 9;
static final String[] AGE_VERSIONS = {
"?",
@ -404,7 +412,8 @@ public interface UCD_Types {
"3.0.0",
"3.1.0",
"3.2.0",
"4.0.0"
"4.0.0",
"4.1.0"
};
public static byte

View File

@ -71,7 +71,7 @@ $Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
$Lowercase = [$GC:Ll $Other_Lowercase]
$Uppercase = [$GC:Lu $Other_Uppercase]
$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc]
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue]
$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2004/11/12 23:17:15 $
* $Revision: 1.45 $
* $Date: 2004/11/13 23:10:32 $
* $Revision: 1.46 $
*
*******************************************************************************
*/
@ -673,6 +673,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
private static final String[] searchPath = {
"EXTRAS",
"4.1.0",
"4.0.1",
"4.0.0",
"3.2.0",